In [1]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
df= pd.read_csv('train_E6oV3lV.csv')
df

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


In [3]:
#check for null
df.isna().sum()

id       0
label    0
tweet    0
dtype: int64

In [4]:
df['label'].unique()

array([0, 1], dtype=int64)

In [5]:
#note that label 1 means that the tweet is racist/sexist while label 0 means the tweet is not racist/sexist 


In [5]:
#remove special characters that are not # and a-z
df['Text'] = df['tweet'].str.replace('[^a-zA-Z#]', ' ', regex=True)

#remove user 
df['tweet']= df['Text'].str.replace('user','')

df['tweet']


0           when a father is dysfunctional and is so se...
1            thanks for #lyft credit i can t use cause ...
2                                      bihday your majesty
3        #model   i love u take with u all the time in ...
4                   factsguide  society now    #motivation
                               ...                        
31957    ate   isz that youuu                          ...
31958      to see nina turner on the airwaves trying to...
31959    listening to sad songs on a monday morning otw...
31960      #sikh #temple vandalised in in #calgary  #ws...
31961                         thank you   for you follow  
Name: tweet, Length: 31962, dtype: object

In [6]:
#remove additional spaces 
df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'\s+', ' ', x))

#remove words that are less than 3 in length
df['tweet'] = df['tweet'].apply(lambda x: ' '.join([word for word in x.split() if len(word) >= 3]))

#convert all words to lowercase
df['tweet'] = df['tweet'].str.lower()
df['tweet']


0        when father dysfunctional and selfish drags hi...
1        thanks for #lyft credit can use cause they don...
2                                      bihday your majesty
3                       #model love take with all the time
4                       factsguide society now #motivation
                               ...                        
31957                                   ate isz that youuu
31958    see nina turner the airwaves trying wrap herse...
31959      listening sad songs monday morning otw work sad
31960    #sikh #temple vandalised #calgary #wso condemn...
31961                             thank you for you follow
Name: tweet, Length: 31962, dtype: object

In [7]:
df = df.drop(['Text'],axis = 1)
df

Unnamed: 0,id,label,tweet
0,1,0,when father dysfunctional and selfish drags hi...
1,2,0,thanks for #lyft credit can use cause they don...
2,3,0,bihday your majesty
3,4,0,#model love take with all the time
4,5,0,factsguide society now #motivation
...,...,...,...
31957,31958,0,ate isz that youuu
31958,31959,0,see nina turner the airwaves trying wrap herse...
31959,31960,0,listening sad songs monday morning otw work sad
31960,31961,1,#sikh #temple vandalised #calgary #wso condemn...


In [8]:
# calling X and y 
X = df['tweet']
y = df['label']

In [9]:
#testing and training sets 
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.3,random_state = 42)
print ('The length of training set:',len(X_train))
print ('The length of testing set:',len(X_test))

The length of training set: 22373
The length of testing set: 9589


In [10]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# Apply stopword removal to X_train
X_train_cleaned = X_train.apply(lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_words))

# Apply stopword removal to X_test
X_test_cleaned = X_test.apply(lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_words))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
X_train_cleaned

9635     summer time #summeriscoming #swimming #picofth...
2447     dese niggas show dese otha bitches snap twitte...
16134         boost immune system allow bodies use #energy
18393    reading manuscript wanting stop good evening g...
4420                                 baby says hates today
                               ...                        
29802    #waltdisneyreso ashamed knew alligators beach ...
5390     invited catch stop talking much love job #reta...
860      black professor makes assumptions entire race ...
15795    #lgbtqhatetrumppay total #liberal trash amp #p...
23654                    makes people relative way #africa
Name: tweet, Length: 22373, dtype: object

In [12]:
X_test_cleaned

12227    mom says smile captivating says happy sunday p...
14709    days meeting sis law couney bowers first vel l...
19319    hating conservative homophobes using tragedy w...
4308     awee #scream #friday #acewellstucker #cynthiab...
24055    fathersday #father #day #god #tony #smith buy ...
                               ...                        
20593                                #model love take time
11682    afraid done nothing wrong salla pura barbad ka...
10882    weeks till perform showcase #actors #performin...
6084     real #republicans chance take pay back teapubl...
6952     lincoln #firefighter dies #exercising #soundtr...
Name: tweet, Length: 9589, dtype: object

In [13]:
from nltk.tokenize import RegexpTokenizer

# Create a tokenizer pattern that preserves hashtags
tokenizer = RegexpTokenizer(r'\w+|#\w+')

# Tokenize X_train
X_train_tokens = X_train_cleaned.apply(tokenizer.tokenize)

# Tokenize X_test
X_test_tokens = X_test_cleaned.apply(tokenizer.tokenize)


In [14]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
# Apply stemming to X_train_tokens
X_train_stemmed = X_train_tokens.apply(lambda x: [stemmer.stem(word) for word in x])

# Apply stemming to X_test_tokens
X_test_stemmed = X_test_tokens.apply(lambda x: [stemmer.stem(word) for word in x])


In [15]:
X_test_stemmed

12227    [mom, say, smile, captiv, say, happi, sunday, ...
14709    [day, meet, si, law, couney, bower, first, vel...
19319    [hate, conserv, homophob, use, tragedi, way, s...
4308     [awe, #scream, #friday, #acewellstuck, #cynthi...
24055    [fathersday, #father, #day, #god, #toni, #smit...
                               ...                        
20593                           [#model, love, take, time]
11682    [afraid, done, noth, wrong, salla, pura, barba...
10882    [week, till, perform, showcas, #actor, #perfor...
6084     [real, #republican, chanc, take, pay, back, te...
6952     [lincoln, #firefight, die, #exercis, #soundtra...
Name: tweet, Length: 9589, dtype: object

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training data and transform the text into TF-IDF features
X_train_features = vectorizer.fit_transform(X_train_stemmed.apply(lambda x: ' '.join(x)))

# Transform the test data into TF-IDF features
X_test_features = vectorizer.transform(X_test_stemmed.apply(lambda x: ' '.join(x)))

# Print the shape of the feature matrices
print("Shape of X_train_features:", X_train_features.shape)
print("Shape of X_test_features:", X_test_features.shape)

Shape of X_train_features: (22373, 24981)
Shape of X_test_features: (9589, 24981)


In [17]:
vocabulary = vectorizer.vocabulary_

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
#initialize the count vectorizer
vectorizer2 = CountVectorizer()

# Fit the vectorizer on the training data and transform the text into count features
Xtrain_features = vectorizer2.fit_transform(X_train_stemmed.apply(lambda x: ' '.join(x)))

# Transform the test data into count features
Xtest_features = vectorizer2.transform(X_test_stemmed.apply(lambda x: ' '.join(x)))

# Print the shape of the feature matrices
print("Shape of Xtrain_features:", Xtrain_features.shape)
print("Shape of Xtest_features:", Xtest_features.shape)


Shape of Xtrain_features: (22373, 24981)
Shape of Xtest_features: (9589, 24981)


In [19]:
#MODEL SELECTION WITH TF-IDF CLASSIFIER 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Create Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_features, y_train)
nb_predictions = nb_classifier.predict(X_test_features)

# Create SVM classifier
svm_classifier = SVC()
svm_classifier.fit(X_train_features, y_train)
svm_predictions = svm_classifier.predict(X_test_features)

# Create logistic regression classifier
lr_classifier = LogisticRegression()
lr_classifier.fit(X_train_features, y_train)
lr_predictions = lr_classifier.predict(X_test_features)

# Create random forest classifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_features, y_train)
rf_predictions = rf_classifier.predict(X_test_features)

# Evaluate the performance of each classifier
nb_accuracy = accuracy_score(y_test, nb_predictions)
svm_accuracy = accuracy_score(y_test, svm_predictions)
lr_accuracy = accuracy_score(y_test, lr_predictions)
rf_accuracy = accuracy_score(y_test, rf_predictions)

print("Naive Bayes Accuracy:", nb_accuracy)
print("SVM Accuracy:", svm_accuracy)
print("Logistic Regression Accuracy:", lr_accuracy)
print("Random Forest Accuracy:", rf_accuracy)


Naive Bayes Accuracy: 0.9394097403274585
SVM Accuracy: 0.9553655229950986
Logistic Regression Accuracy: 0.9463969131296277
Random Forest Accuracy: 0.9587026801543436


In [20]:
#F-1 SCORE WITH TF-IDF
from sklearn.metrics import f1_score

# Calculate the F1 score for Naive Bayes
nb_f1_score = f1_score(y_test, nb_predictions)

# Calculate the F1 score for SVM
svm_f1_score = f1_score(y_test, svm_predictions)

# Calculate the F1 score for Logistic Regression
lr_f1_score = f1_score(y_test, lr_predictions)

# Calculate the F1 score for Random Forest
rf_f1_score = f1_score(y_test, rf_predictions)

print("Naive Bayes F1 Score:", nb_f1_score)
print("SVM F1 Score:", svm_f1_score)
print("Logistic Regression F1 Score:", lr_f1_score)
print("Random Forest F1 Score:", rf_f1_score)


Naive Bayes F1 Score: 0.26175349428208383
SVM F1 Score: 0.5694164989939638
Logistic Regression F1 Score: 0.43392070484581496
Random Forest F1 Score: 0.6373626373626374


In [21]:
#MODEL SELECTION WITH COUNT VECTORIZER
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Create Naive Bayes classifier
nb_classifier2 = MultinomialNB()
nb_classifier2.fit(Xtrain_features, y_train)
nb_predictions2 = nb_classifier2.predict(Xtest_features)

# Create SVM classifier
svm_classifier2 = SVC()
svm_classifier2.fit(Xtrain_features, y_train)
svm_predictions2 = svm_classifier2.predict(Xtest_features)

# Create logistic regression classifier
lr_classifier2 = LogisticRegression()
lr_classifier2.fit(Xtrain_features, y_train)
lr_predictions2 = lr_classifier2.predict(Xtest_features)

# Create random forest classifier
rf_classifier2 = RandomForestClassifier()
rf_classifier2.fit(Xtrain_features, y_train)
rf_predictions2 = rf_classifier2.predict(Xtest_features)

# Evaluate the performance of each classifier
nb_accuracy2 = accuracy_score(y_test, nb_predictions2)
svm_accuracy2 = accuracy_score(y_test, svm_predictions2)
lr_accuracy2 = accuracy_score(y_test, lr_predictions2)
rf_accuracy2 = accuracy_score(y_test, rf_predictions2)

print("Naive Bayes Accuracy:", nb_accuracy2)
print("SVM Accuracy:", svm_accuracy2)
print("Logistic Regression Accuracy:", lr_accuracy2)
print("Random Forest Accuracy:", rf_accuracy2)


Naive Bayes Accuracy: 0.9574512462196266
SVM Accuracy: 0.9547398060277401
Logistic Regression Accuracy: 0.9564083846073625
Random Forest Accuracy: 0.9576598185420795


In [22]:
#F-1 SCORE WITH COUNT VECTORIZER
from sklearn.metrics import f1_score

# Calculate the F1 score for Naive Bayes
nb_f1_score2 = f1_score(y_test, nb_predictions2)

# Calculate the F1 score for SVM
svm_f1_score2 = f1_score(y_test, svm_predictions2)

# Calculate the F1 score for Logistic Regression
lr_f1_score2 = f1_score(y_test, lr_predictions2)

# Calculate the F1 score for Random Forest
rf_f1_score2 = f1_score(y_test, rf_predictions2)

print("Naive Bayes F1 Score:", nb_f1_score2)
print("SVM F1 Score:", svm_f1_score2)
print("Logistic Regression F1 Score:", lr_f1_score2)
print("Random Forest F1 Score:", rf_f1_score2)


Naive Bayes F1 Score: 0.6236162361623616
SVM F1 Score: 0.5633802816901409
Logistic Regression F1 Score: 0.6186131386861313
Random Forest F1 Score: 0.6268382352941176


In [24]:
#Decided to use Random Forest with TF-IDF to build the model 

In [23]:
test_data = pd.read_csv('test_tweets_anuFYb8.csv')
test_data

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."
...,...,...
17192,49155,thought factory: left-right polarisation! #tru...
17193,49156,feeling like a mermaid ð #hairflip #neverre...
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,"happy, at work conference: right mindset leads..."


In [24]:
#remove special characters that are not # and a-z
test_data['tweet'] = test_data['tweet'].str.replace('[^a-zA-Z#]', ' ', regex=True)

#remove user 
test_data['tweet']= test_data['tweet'].str.replace('user','')

test_data['tweet']


0        #studiolife #aislife #requires #passion #dedic...
1           #white #supremacists want everyone to see t...
2        safe ways to heal your #acne      #altwaystohe...
3        is the hp and the cursed child book up for res...
4           rd #bihday to my amazing  hilarious #nephew...
                               ...                        
17192    thought factory  left right polarisation  #tru...
17193    feeling like a mermaid      #hairflip #neverre...
17194    #hillary #campaigned today in #ohio  omg    am...
17195    happy  at work conference  right mindset leads...
17196    my   song  so glad  free download   #shoegaze ...
Name: tweet, Length: 17197, dtype: object

In [25]:
#remove additional spaces 
test_data['tweet'] = test_data['tweet'].apply(lambda x: re.sub(r'\s+', ' ', x))

#remove words that are less than 3 in length
test_data['tweet'] = test_data['tweet'].apply(lambda x: ' '.join([word for word in x.split() if len(word) >= 3]))

#convert all words to lowercase
test_data['tweet'] = test_data['tweet'].str.lower()
test_data['tweet']

0        #studiolife #aislife #requires #passion #dedic...
1        #white #supremacists want everyone see the new...
2        safe ways heal your #acne #altwaystoheal #heal...
3        the and the cursed child book for reservations...
4        #bihday amazing hilarious #nephew eli ahmir un...
                               ...                        
17192    thought factory left right polarisation #trump...
17193    feeling like mermaid #hairflip #neverready #fo...
17194    #hillary #campaigned today #ohio omg amp used ...
17195    happy work conference right mindset leads cult...
17196    song glad free download #shoegaze #newmusic #n...
Name: tweet, Length: 17197, dtype: object

In [26]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# Apply stopword removal to tweet column
test_data['tweet_cleaned'] = test_data['tweet'].apply(lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_words))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
#drop the initial tweet column 
test_data = test_data.drop('tweet',axis = 1)



In [28]:
test_data

Unnamed: 0,id,tweet_cleaned
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,#white #supremacists want everyone see new #bi...
2,31965,safe ways heal #acne #altwaystoheal #healthy #...
3,31966,cursed child book reservations already yes #ha...
4,31967,#bihday amazing hilarious #nephew eli ahmir un...
...,...,...
17192,49155,thought factory left right polarisation #trump...
17193,49156,feeling like mermaid #hairflip #neverready #fo...
17194,49157,#hillary #campaigned today #ohio omg amp used ...
17195,49158,happy work conference right mindset leads cult...


In [29]:
from nltk.tokenize import RegexpTokenizer

# Create a tokenizer pattern that preserves hashtags
tokenizer = RegexpTokenizer(r'\w+|#\w+')

# Tokenize tweet cleaned column
tweet_cleaned_tokens = test_data['tweet_cleaned'].apply(tokenizer.tokenize)



In [30]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
# Apply stemming to tweet cleaned column
tweet_cleaned_stemmed = tweet_cleaned_tokens.apply(lambda x: [stemmer.stem(word) for word in x])



In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
new_vectorizer = TfidfVectorizer(vocabulary=vocabulary)

# Fit the vectorizer on the column and transform the text into TF-IDF features
vectorised_tweet_cleaned = new_vectorizer.fit_transform(tweet_cleaned_stemmed.apply(lambda x: ' '.join(x)))




In [32]:
vectorised_tweet_cleaned


<17197x24981 sparse matrix of type '<class 'numpy.float64'>'
	with 110582 stored elements in Compressed Sparse Row format>

In [33]:
predictions = rf_classifier.predict(vectorised_tweet_cleaned)

In [34]:
predictions_df = pd.DataFrame({'predicted_label':predictions})
test_data_predicted = test_data.join(predictions_df)
test_data_predicted

Unnamed: 0,id,tweet_cleaned,predicted_label
0,31963,#studiolife #aislife #requires #passion #dedic...,0
1,31964,#white #supremacists want everyone see new #bi...,1
2,31965,safe ways heal #acne #altwaystoheal #healthy #...,0
3,31966,cursed child book reservations already yes #ha...,0
4,31967,#bihday amazing hilarious #nephew eli ahmir un...,0
...,...,...,...
17192,49155,thought factory left right polarisation #trump...,1
17193,49156,feeling like mermaid #hairflip #neverready #fo...,0
17194,49157,#hillary #campaigned today #ohio omg amp used ...,0
17195,49158,happy work conference right mindset leads cult...,0


In [37]:
test_data_predicted.loc[test_data_predicted['predicted_label']==1]

Unnamed: 0,id,tweet_cleaned,predicted_label
1,31964,#white #supremacists want everyone see new #bi...,1
19,31982,thought factory bbc neutrality right wing fasc...,1
26,31989,chick gets fucked hottest naked lady,1
33,31996,suppo #taiji fisherman bullying racism #tweet ...,1
110,32073,hey ivanka bracelet feel good profiting #xenop...,1
...,...,...,...
17125,49088,careful criticizing #obama decision #israel am...,1
17128,49091,government new #anti semitism definition confl...,1
17176,49139,racist pay ever,1
17188,49151,black professor demonizes proposes nazi style ...,1


In [38]:
test_data_predicted.loc[test_data_predicted['predicted_label']==0]

Unnamed: 0,id,tweet_cleaned,predicted_label
0,31963,#studiolife #aislife #requires #passion #dedic...,0
2,31965,safe ways heal #acne #altwaystoheal #healthy #...,0
3,31966,cursed child book reservations already yes #ha...,0
4,31967,#bihday amazing hilarious #nephew eli ahmir un...,0
5,31968,choose #momtips,0
...,...,...,...
17191,49154,damn tuff ruff muff techno city web ukhx int #...,0
17193,49156,feeling like mermaid #hairflip #neverready #fo...,0
17194,49157,#hillary #campaigned today #ohio omg amp used ...,0
17195,49158,happy work conference right mindset leads cult...,0
