## Importing Required libraries

pip install pandas scikit-learn nltk

***

In [5]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import nltk
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\shake\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Setting up StopWords, Lemmatizer and Tokenizer

***

In [6]:
# word_tokenize, the stop-word list and the WordNet lemmatizer each need their
# own NLTK data package. Fetch them up front so the notebook also runs on a
# machine where they have not been downloaded yet (packages that are already
# present are skipped). 'punkt_tab' is requested as well because newer NLTK
# releases load it in place of 'punkt'.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource, quiet=True)

# Shared by tokenize_and_lemmatize: one lemmatizer instance and the English
# stop words as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def tokenize_and_lemmatize(text):
    """Tokenize `text`, drop stop words and lemmatize what remains.

    Tokens are compared against `stop_words` exactly as produced by
    `word_tokenize` (no case folding happens here), and each surviving token
    is passed through `lemmatizer.lemmatize`.

    Returns the lemmatized tokens joined by single spaces.
    """
    kept_lemmas = []
    for word in word_tokenize(text):
        if word in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(kept_lemmas)

## Function for Preprocessing Text

***

In [7]:
def preprocess_text(text):
    """Normalize a raw tweet before tokenization.

    The text is lowercased, every literal ``@user`` mention is removed, and
    each character that is neither a word character nor whitespace is
    replaced by a single space (so whitespace is not collapsed or trimmed).

    Returns the cleaned string.
    """
    lowered = text.lower()

    # Mentions are matched after lowercasing, so "@USER" and "@User" both go.
    without_mentions = re.sub(r'@user', '', lowered)

    # Punctuation and symbols become spaces rather than being deleted.
    return re.sub(r'[^\w\s]', ' ', without_mentions)

## Reading and Preprocessing Training Data

***

In [8]:
# Load the training tweets; the id column is not used as a feature.
train_data = pd.read_csv('training_data.csv')
train_data = train_data.drop('id', axis=1)

# Rows without a category get the placeholder label 'NULL'. Assign the result
# back instead of calling fillna(..., inplace=True) on the selected column:
# that chained in-place form is deprecated and leaves the frame unchanged once
# pandas Copy-on-Write is active.
train_data['category'] = train_data['category'].fillna('NULL')

# Tweets with no text cannot be vectorized, so drop them.
train_data = train_data.dropna(subset=['tweet'])

# Clean the text, then tokenize / remove stop words / lemmatize.
train_data['tweet'] = (
    train_data['tweet']
    .apply(preprocess_text)
    .apply(tokenize_and_lemmatize)
)

# Binary target: 1 for 'OFF', 0 for anything else.
train_data['offensive'] = train_data['offensive'].eq('OFF').astype(int)

print("Preprocessed training data:")
print(train_data.head())


Preprocessed training data:
                                               tweet  offensive category
0                           ask native american take          1      UNT
1                   go home drunk maga trump2020 url          1      TIN
2  amazon investigating chinese employee selling ...          0     NULL
3                 someone vetaken piece shit volcano          1      UNT
4   obama wanted liberal amp illegals move red state          0     NULL


## Reading and Preprocessing Testing Data and Combining with Category Data

***

In [9]:
test_data = pd.read_csv('testing_data.csv')

testingCat_data = pd.read_csv("testingCategory_data.csv")

all_ids = testingCat_data['id'].tolist()
all_categories = testingCat_data['category'].tolist()

# Lookup from tweet id to its category label (for a repeated id the last row wins).
id_to_category = dict(zip(all_ids, all_categories))

# Attach the category to every test tweet in one vectorized step instead of a
# row-by-row iterrows() loop. Ids that have no usable entry in the category
# file receive the placeholder label "NULL".
test_data['category'] = test_data['id'].map(id_to_category).fillna("NULL")

test_data = test_data.drop('id', axis=1)

# Tweets with no text cannot be vectorized, so drop them.
test_data = test_data.dropna(subset=['tweet'])

# Same cleaning pipeline as the training data.
test_data['tweet'] = (
    test_data['tweet']
    .apply(preprocess_text)
    .apply(tokenize_and_lemmatize)
)

# Binary target: 1 for 'OFF', 0 for anything else.
test_data['offensive'] = test_data['offensive'].eq('OFF').astype(int)

print("\nPreprocessed testing data:")
print(test_data.head())


Preprocessed testing data:
                                               tweet  offensive category
0  whoisq wherestheserver dumpnike declasfisa dem...          1      TIN
1  constitutionday revered conservative hated pro...          0     NULL
2  foxnews nra maga potus trump 2ndamendment rnc ...          0     NULL
3  watching boomer getting news still parole alwa...          0     NULL
4  nopasaran unity demo oppose far right london a...          1      TIN


## Setting Up The TF-IDF Features and The Logistic Regression Models (Train/Test Data Come Pre-Split From Separate Files)

***

In [10]:
# One classifier per task; both use the same iteration cap.
model_offensive = LogisticRegression(max_iter=1000)
model_category = LogisticRegression(max_iter=1000)

# The vocabulary and IDF weights are learned from the training tweets only;
# the test tweets are projected onto that fitted vocabulary.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
train_texts = train_data['tweet']
test_texts = test_data['tweet']
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)


## Training and Testing The Model On Test Data

***

In [11]:
# Offensive-vs-not detection: fit on the training features, then label the test set.
y_train_offensive = train_data['offensive']
y_pred_test_offensive = model_offensive.fit(X_train_tfidf, y_train_offensive).predict(X_test_tfidf)

# Category prediction: same features, with the category column as the target.
y_train_category = train_data['category']
y_pred_test_category = model_category.fit(X_train_tfidf, y_train_category).predict(X_test_tfidf)


# Generating Report

***

In [12]:

# --- Offensive detection: per-class metrics and confusion matrix ---
y_true_offensive = test_data['offensive']
report_test_offensive = classification_report(y_true_offensive, y_pred_test_offensive)
conf_matrix_test_offensive = confusion_matrix(y_true_offensive, y_pred_test_offensive)

print("Classification Report For Detecting Offensive Tweets On Test Data:\n")
print(report_test_offensive)
print("Confusion Matrix for Offensive Detection on Test Data:")
print(conf_matrix_test_offensive)

print("<------------------------------------------------------>\n")

# --- Category prediction: per-class metrics and confusion matrix ---
y_true_category = test_data['category']
classification_report_test_category = classification_report(y_true_category, y_pred_test_category)
conf_matrix_test_category = confusion_matrix(y_true_category, y_pred_test_category)

print("Classification Report for Category Prediction on Test Data:")
print(classification_report_test_category)
print("Confusion Matrix for Category Prediction on Test Data:")
print(conf_matrix_test_category)


Classification Report For Detecting Offensive Tweets On Test Data:

              precision    recall  f1-score   support

           0       0.80      0.98      0.88       620
           1       0.87      0.36      0.51       240

    accuracy                           0.81       860
   macro avg       0.83      0.67      0.70       860
weighted avg       0.82      0.81      0.78       860

Confusion Matrix for Offensive Detection on Test Data:
[[607  13]
 [153  87]]
<------------------------------------------------------>

Classification Report for Category Prediction on Test Data:
              precision    recall  f1-score   support

        NULL       0.80      0.97      0.88       620
         TIN       0.69      0.35      0.47       213
         UNT       0.50      0.04      0.07        27

    accuracy                           0.79       860
   macro avg       0.66      0.45      0.47       860
weighted avg       0.77      0.79      0.75       860

Confusion Matrix for Categor

## Generating Sample of Tweets

***

In [13]:
# Hand-written sample tweets for a quick sanity check of the trained models.
# The first group is friendly in tone; the second is mostly hostile or profane.
upbeat_tweets = [
    "@User Thanks so much for your help! You're amazing!",
    "Just had the best time at the park with friends. Feeling grateful!",
    "Huge shoutout to @User for always being there to support me. You're the best!",
    "Loving this sunny weather ☀️ Ready for a productive day!",
    "Spent the day reading a great book. It's so nice to unwind and relax.",
]

abrasive_tweets = [
    "@USER 7 fucking years.",
    "@USER you're ugly.",
    "@USER just kill yourself man",
    "bruh yeah this shit is really ugly @USER.",
    "really you ugly cunt?",
    "Damn, you're a pig.",
    "Why do people still fall for this shit?",
    "Are you kidding with me?",
    "@USER Can't believe I've waited for 3 fucking years for this.",
    "Just got stuck in shit traffic again. What a day!",
    "I've been dealing with this idiot all day. Can't catch a break.",
    "@USER Seriously? This is the fucked up shit they come up with?",
    "the hell?",
]

# Order is preserved: friendly samples first, then the rest.
testing_tweets = upbeat_tweets + abrasive_tweets

## Testing on Tweet Samples

***

In [14]:
list_of_NotOffensive = []
list_of_OffensiveTIN = []
list_of_OffensiveUNT = []

for tweet in testing_tweets:
    # Same cleaning pipeline as the training data, then vectorize one tweet.
    cleaned_tweet = tokenize_and_lemmatize(preprocess_text(tweet))
    tweet_features = tfidf_vectorizer.transform([cleaned_tweet])
    offensive_label = model_offensive.predict(tweet_features)[0]
    category_label = model_category.predict(tweet_features)[0]

    if offensive_label == 0:
        target_list = list_of_NotOffensive
    elif category_label == "UNT":
        target_list = list_of_OffensiveUNT
    else:
        # NOTE(review): every category other than "UNT" lands here, so an
        # offensive tweet whose predicted category is "NULL" is also listed
        # as a targeted insult.
        target_list = list_of_OffensiveTIN
    target_list.append(tweet)

divider = "--------------------------------------------------------"
report_sections = [
    ("Tweets are that predicted to be not offensive", list_of_NotOffensive),
    ("Tweets are that predicted to be offensive and Targeted Insults", list_of_OffensiveTIN),
    ("Tweets are that predicted to be offensive and Untargeted Insults", list_of_OffensiveUNT),
]

print("----------------PREDICTION ON SAMPLE TWEETS-------------------")
for position, (heading, tweets_in_section) in enumerate(report_sections):
    # The banner above stands in for the divider ahead of the first section.
    if position > 0:
        print(divider)
    print(heading)
    print(divider)
    for listed_tweet in tweets_in_section:
        print(" >> ", listed_tweet)

----------------PREDICTION ON SAMPLE TWEETS-------------------
Tweets are that predicted to be not offensive
--------------------------------------------------------
 >>  @User Thanks so much for your help! You're amazing!
 >>  Just had the best time at the park with friends. Feeling grateful!
 >>  Huge shoutout to @User for always being there to support me. You're the best!
 >>  Loving this sunny weather ☀️ Ready for a productive day!
 >>  Spent the day reading a great book. It's so nice to unwind and relax.
 >>  Are you kidding with me?
--------------------------------------------------------
Tweets are that predicted to be offensive and Targeted Insults
--------------------------------------------------------
 >>  @USER you're ugly.
 >>  @USER just kill yourself man
 >>  bruh yeah this shit is really ugly @USER.
 >>  really you ugly cunt?
 >>  Damn, you're a pig.
 >>  Why do people still fall for this shit?
 >>  @USER Can't believe I've waited for 3 fucking years for this.
 >>  Just