In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
df = pd.read_csv("NLP_train.csv")
df

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


NO NULL values, but there are many special or non-alphanumeric characters in the tweet column

In [5]:
# First identifying if there are any duplicates in whole dataframe and dropping the values
df.duplicated().sum()

0

There are no duplicate values as well

In [6]:
# Second, converting the data into lower case. Since tweet is the only text column:
df['tweet'] = df['tweet'].apply(lambda x: x.lower())
df

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


In [7]:
# Removing non-alphanumeric characters using regex
import re
df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'[^\w\s]+', ' ', x))

# Removing the charcaters like, ðÃ¤Â½Ã©Ã¥Â…Ã¦ from the tweet column
df['tweet'] = df['tweet'].apply(lambda x: x.encode('iso-8859-1').decode('utf-8', 'ignore'))                                
df

Unnamed: 0,id,label,tweet
0,1,0,user when a father is dysfunctional and is s...
1,2,0,user user thanks for lyft credit i can t us...
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in ...
4,5,0,factsguide society now motivation
...,...,...,...
31957,31958,0,ate user isz that youuu
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,user sikh temple vandalised in in calgary ...


Now the data is fully cleaned with only alphanumeric and without any special characters

##### Since the test data has no instances of target variable, in order to test the model, splitting the train dataset into training and evaluation dataset

In [8]:
df['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

The target class ie.'label' is unbalanced

In [9]:
X, y = df['tweet'], df['label']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Using the Bag-of-words technique with Countvectorizer

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Creating a CountVectorizer object
vectorizer = CountVectorizer(stop_words='english')
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [11]:
# from imblearn.over_sampling import SMOTE
# smote = SMOTE(random_state = 42)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train_vect, y_train)

In [12]:
# from sklearn.cluster import KMeans
# kmeans = KMeans(n_clusters = 20)
# kmeans.fit_transform(X_train_vect)
# kmeans.transform(X_test_vect)

In [13]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_vect, y_train)

In [14]:
lr.score(X_train_vect, y_train)

0.984590715319332

In [15]:
lr.score(X_test_vect, y_test)

0.9594869388393555

In [16]:
y_pred = lr.predict(X_test_vect)
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      5937
           1       0.85      0.53      0.65       456

    accuracy                           0.96      6393
   macro avg       0.91      0.76      0.81      6393
weighted avg       0.96      0.96      0.96      6393

[[5893   44]
 [ 215  241]]


In [17]:
from sklearn.metrics import f1_score, precision_score, recall_score
print(f1_score(y_test, y_pred))

0.6504723346828611


In [18]:
print(precision_score(y_test, y_pred))

0.8456140350877193


In [19]:
print(recall_score(y_test, y_pred))

0.5285087719298246


In [20]:
X, y = df['tweet'], df['label']
#from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#from sklearn.feature_extraction.text import CountVectorizer
# Creating a CountVectorizer object
vectorizer_rf = CountVectorizer(stop_words='english')
X_train_vect_rf = vectorizer_rf.fit_transform(X_train)
X_test_vect_rf = vectorizer_rf.transform(X_test)

In [22]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train_vect_rf, y_train)

In [23]:
rf.score(X_train_vect_rf, y_train)

0.9997653408424264

In [24]:
rf.score(X_test_vect_rf, y_test)

0.9583919912404192

In [25]:
y_pred_rf = rf.predict(X_test_vect_rf)
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      5937
           1       0.87      0.49      0.63       456

    accuracy                           0.96      6393
   macro avg       0.92      0.74      0.80      6393
weighted avg       0.96      0.96      0.95      6393

[[5905   32]
 [ 234  222]]


In [26]:
from sklearn.metrics import f1_score, precision_score, recall_score
print(f1_score(y_test, y_pred_rf))

0.6253521126760563


In [27]:
print(precision_score(y_test, y_pred_rf))

0.8740157480314961


In [28]:
print(recall_score(y_test, y_pred_rf))

0.4868421052631579


#### Using Tfidf technique

In [19]:
X, y = df['tweet'], df['label']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Creating a TfidfVectorizer object
tfid_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidvect = tfid_vectorizer.fit_transform(X_train)
X_test_tfidvect = tfid_vectorizer.transform(X_test)

In [21]:
# smote_tfidf = SMOTE(random_state=42)
# X_train_resampled_tfidf, y_train_resampled_tfidf = smote_tfidf.fit_resample(X_train_tfidvect, y_train)

In [22]:
from sklearn.linear_model import LogisticRegression
lr_tfid = LogisticRegression()
lr_tfid.fit(X_train_tfidvect, y_train)

In [23]:
lr_tfid.score(X_train_tfidvect, y_train)

0.9533028276428488

In [24]:
lr_tfid.score(X_test_vect, y_test)

0.9197559831065227

In [25]:
y_pred_tfid = lr.predict(X_test_tfidvect)
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred_tfid))
print(confusion_matrix(y_test, y_pred_tfid))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      5937
           1       1.00      0.00      0.00       456

    accuracy                           0.93      6393
   macro avg       0.96      0.50      0.48      6393
weighted avg       0.93      0.93      0.89      6393

[[5937    0]
 [ 455    1]]


In [26]:
print(f1_score(y_test, y_pred_tfid))
print(precision_score(y_test, y_pred_tfid))
print(recall_score(y_test, y_pred_tfid))

0.00437636761487965
1.0
0.0021929824561403508


CountVectorizer works better than TfidVectorizer by considering the f1 score which is the harmonic mean of precision and recall

#### Using sentimemt analysis

In [27]:
import nltk
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiment_analyzer = SentimentIntensityAnalyzer()

In [28]:
# tweet_sentiment = sentiment_analyzer.polarity_scores(df['tweet'].to_string())
# df['sentiment_score'] = tweet_sentiment['compound']
df['sentiment_score'] = df['tweet'].apply(lambda x: sentiment_analyzer.polarity_scores(x)['compound'])

In [29]:
# Split the data into training and testing sets
X, y = df['sentiment_score'].values.reshape(-1, 1), df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# Fit a logistic regression model on the sentiment scores
lr_sentiment = LogisticRegression()
lr_sentiment.fit(X_train, y_train)

In [31]:
lr_sentiment.score(X_train, y_train)

0.9301497907622511

In [32]:
lr_sentiment.score(X_test, y_test)

0.9286719849835758

In [33]:
y_pred_sentiment = lr_sentiment.predict(X_test.reshape(-1, 1))
print(classification_report(y_test, y_pred_sentiment))
print(confusion_matrix(y_test, y_pred_sentiment))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      5937
           1       0.00      0.00      0.00       456

    accuracy                           0.93      6393
   macro avg       0.46      0.50      0.48      6393
weighted avg       0.86      0.93      0.89      6393

[[5937    0]
 [ 456    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
print(f1_score(y_test, y_pred_sentiment))

0.0


In [35]:
print(precision_score(y_test, y_pred_sentiment))

0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
print(recall_score(y_test, y_pred_sentiment))

0.0


#### Using the AFINN sentiment analyzer

In [37]:
#!pip install afinn

from afinn import Afinn

afinn = Afinn()

# Define a function to calculate sentiment scores for each tweet
def get_sentiment_score(tweet):
    return afinn.score(tweet)

In [38]:
# Add the sentiment scores to the dataset as a new column
df['sentiment_score_new'] = df['tweet'].apply(get_sentiment_score)

In [39]:
# Separate the input features and the target variable
X = df['sentiment_score_new']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
lr_affin = LogisticRegression()
lr_affin.fit(X_train.values.reshape(-1, 1), y_train)

In [41]:
lr_affin.score(X_train.values.reshape(-1, 1), y_train)

0.9289373851147874

In [42]:
lr_affin.score(X_test.values.reshape(-1, 1), y_test)

0.9280463006413264

In [43]:
y_pred_affin = lr_affin.predict(X_test.values.reshape(-1, 1))

In [44]:
print(classification_report(y_test, y_pred_affin))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      5937
           1       0.30      0.01      0.01       456

    accuracy                           0.93      6393
   macro avg       0.61      0.50      0.49      6393
weighted avg       0.88      0.93      0.89      6393



In [45]:
print(confusion_matrix(y_test, y_pred_affin))

[[5930    7]
 [ 453    3]]


In [46]:
print(f1_score(y_test, y_pred_affin))

0.012875536480686692


In [47]:
print(precision_score(y_test, y_pred_affin))

0.3


In [48]:
print(recall_score(y_test, y_pred_affin))

0.006578947368421052


Both last two models performed badly

# Final Recommendation:

Logistic model performed well with the use of technique Bag-of-words ie.c ountvectorizer