# Sentiment Analysis on Twitter: Machine Learning Approach

Model used: Logistic Regression, Support Vector Classifier and Naive Bayes

Best Score without pre-processing (TF-IDF Vectorizer):

77% Accuracy on SVC

Best Score after pre-processing (TF-IDF Vectorizer):

78% Accuracy on SVC

# Imports

In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import SVC
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
import numpy as np
import re
from sklearn.utils import resample
import nltk
from nltk.corpus import stopwords


In [15]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Dataset import

In [2]:
df1 = pd.read_csv("/content/drive/MyDrive/Bracu/Spring2022/CSE440/Project/Dataset/twitter-2013dev-A.txt", sep = '\t', names = ['id','sentiment','tweets'])
df1 = df1.drop(labels=["id"], axis = 1)
df2 = pd.read_csv("/content/drive/MyDrive/Bracu/Spring2022/CSE440/Project/Dataset/twitter-2013test-A.txt", sep = '\t', names = ['id','sentiment','tweets'])
df2 = df2.drop(labels=["id"], axis = 1)
df3 = pd.read_csv("/content/drive/MyDrive/Bracu/Spring2022/CSE440/Project/Dataset/twitter-2013train-A.txt", sep = '\t', names = ['id','sentiment','tweets'])
df3 = df3.drop(labels=["id"], axis = 1)
df4 = pd.read_csv("/content/drive/MyDrive/Bracu/Spring2022/CSE440/Project/Dataset/twitter-2014sarcasm-A.txt", sep = '\t', names = ['id','sentiment','tweets'])
df4 = df4.drop(labels=["id"], axis = 1)
df5 = pd.read_csv("/content/drive/MyDrive/Bracu/Spring2022/CSE440/Project/Dataset/twitter-2015test-A.txt", sep = '\t', names = ['id','sentiment','tweets'])
df5 = df5.drop(labels=["id"], axis = 1)
df6 = pd.read_csv("/content/drive/MyDrive/Bracu/Spring2022/CSE440/Project/Dataset/twitter-2015train-A.txt", sep = '\t', names = ['id','sentiment','tweets'])
df6 = df6.drop(labels=["id"], axis = 1)
df7 = pd.read_csv("/content/drive/MyDrive/Bracu/Spring2022/CSE440/Project/Dataset/twitter-2016dev-A.txt", sep = '\t', names = ['id','sentiment','tweets'])
df7 = df7.drop(labels=["id"], axis = 1)
df8 = pd.read_csv("/content/drive/MyDrive/Bracu/Spring2022/CSE440/Project/Dataset/twitter-2016devtest-A.txt", sep = '\t', names = ['id','sentiment','tweets'])
df8 = df8.drop(labels=["id"], axis = 1)
df9 = pd.read_csv("/content/drive/MyDrive/Bracu/Spring2022/CSE440/Project/Dataset/twitter-2016train-A.txt", sep = '\t', names = ['id','sentiment','tweets'])
df9 = df9.drop(labels=["id"], axis = 1)

In [3]:
df = pd.concat([df3,df5,df8,df9]).reset_index(drop=True)

df

Unnamed: 0,sentiment,tweets
0,positive,Gas by my house hit $3.39!!!! I\u2019m going t...
1,negative,Theo Walcott is still shit\u002c watch Rafa an...
2,negative,its not that I\u2019m a GSP fan\u002c i just h...
3,negative,Iranian general says Israel\u2019s Iron Dome c...
4,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...
...,...,...
19937,positive,@Racalto_SK ok good to know. Punting at MetLif...
19938,neutral,everyone who sat around me at metlife was so a...
19939,neutral,what giants or niners fans would wanna go to t...
19940,positive,Anybody want a ticket for tomorrow Colombia vs...


# Balancing the dataset

Our dataset is unbalanced when it comes to negative sentiment. So, we will resample to balance it.

In [4]:
df['sentiment'].value_counts()

positive    8689
neutral     8255
negative    2998
Name: sentiment, dtype: int64

In [5]:
def dataset_balance(data):
    data_positive = data[(data['sentiment']=='positive')]
    data_neutral = data[(data['sentiment']=='neutral')]
    data_negative = data[(data['sentiment']=='negative')]
    data_negative_upsampled = resample(data_negative, replace = True, n_samples =  data['sentiment'].value_counts()[1], random_state = 42)
    balanced_dataset = pd.concat([data_positive,data_neutral,data_negative_upsampled]).reset_index(drop=True)
    return balanced_dataset

In [6]:
df = dataset_balance(df)
df['sentiment'].value_counts()

positive    8689
neutral     8255
negative    8255
Name: sentiment, dtype: int64

# Without Preprocessing
# TF-IDF

In [8]:
def label_encoder(x):
    if x == 'positive': return 2
    elif x == 'negative': return 0
    else: return 1

In [9]:
tf_idf = TfidfVectorizer()
X = tf_idf.fit(df['tweets']).transform(df['tweets'])
Y = df.sentiment.apply(lambda x: label_encoder(x))
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.8, shuffle = True)

In [11]:
model_LR = LogisticRegression(max_iter=10000).fit(X_train,Y_train)
model_SVM = SVC(kernel='rbf', random_state = 1).fit(X_train, Y_train)
model_NB = ComplementNB().fit(X_train,Y_train)

# Score on Test Set

In [12]:
LR_score = classification_report(Y_test, model_LR.predict(X_test))
SVC_score = classification_report(Y_test, model_SVM.predict(X_test))
NB_score = classification_report(Y_test, model_NB.predict(X_test))
print("LOGISTIC REGRESSION")
print(LR_score)
print("SUPPORT VECTOR CLASSIFICATION")
print(SVC_score)
print("NAIVE BAYES CLASSIFICATION")
print(NB_score)

LOGISTIC REGRESSION
              precision    recall  f1-score   support

           0       0.78      0.86      0.82      1635
           1       0.66      0.65      0.65      1687
           2       0.72      0.66      0.69      1718

    accuracy                           0.72      5040
   macro avg       0.72      0.72      0.72      5040
weighted avg       0.72      0.72      0.72      5040

SUPPORT VECTOR CLASSIFICATION
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1635
           1       0.68      0.74      0.70      1687
           2       0.74      0.66      0.70      1718

    accuracy                           0.77      5040
   macro avg       0.77      0.77      0.77      5040
weighted avg       0.77      0.77      0.77      5040

NAIVE BAYES CLASSIFICATION
              precision    recall  f1-score   support

           0       0.66      0.93      0.77      1635
           1       0.72      0.42      0.53      168

# With Preprocessing

1. removed link
2. removed linebreaks
3. removed extra spaces
4. removed punctuation
5. removed stopwords
6. lowercased the tweet
7. extracted hashtags and added at the end of the sentence
8. extracted mentions and added at the end of the sentence

In [13]:
def clean_text(x):
    x = re.sub(r'https?://\S+', '', x) 
    x = re.sub(r'#\w+', '', x) 
    x = re.sub(r'@\w+', '', x) 
    x = re.sub(r'\n',' ',x) 
    x = re.sub('\s+', ' ', x).strip()
    x = re.sub('\.','',x) 
    for p in string.punctuation:
        x = re.sub('\{}'.format(p),'',x)
    return x.lower()
def find_hashtags(tweet):
    return " ".join([match.group(0)[1:] for match in re.finditer(r"#\w+", tweet)]) or ''

def find_mentions(tweet):
    return " ".join([match.group(0)[1:] for match in re.finditer(r"@\w+", tweet)]) or ''

def pre_process_text(df):
    stop = stopwords.words('english')
    df['clean_tweet'] = df['tweets'].apply(lambda x: clean_text(x))
    df['hashtags'] = df['tweets'].apply(lambda x: find_hashtags(x))
    df['mentions'] = df['tweets'].apply(lambda x: find_mentions(x))
    df['sentiment'] = df['sentiment'].apply(lambda x: label_encoder(x))
    df['clean_tweet_without_stopword'] = df['clean_tweet'].apply(lambda x:' '.join([word for word in x.split() if word not in stop]))
    df['final'] = df['clean_tweet_without_stopword']+" "+df['hashtags']+" "+df['mentions']
    # df['clean_tweet']+" "+
    return df


In [16]:
pdf = pd.concat([df3,df5,df8,df9]).reset_index(drop=True)
pdf = dataset_balance(pdf)
pdf = pre_process_text(df)
pdf.tail()

Unnamed: 0,sentiment,tweets,clean_tweet,hashtags,mentions,clean_tweet_without_stopword,final
25194,0,One of my Magic Mike XXL co-workers and the co...,one of my magic mike xxl coworkers and the coo...,,,one magic mike xxl coworkers coordinator premi...,one magic mike xxl coworkers coordinator premi...
25195,0,Sreven Taylor will look pretty daft after the ...,sreven taylor will look pretty daft after the ...,,,sreven taylor look pretty daft mackems win tom...,sreven taylor look pretty daft mackems win tom...
25196,0,i hope justin's concert thursday gets cancelle...,i hope justins concert thursday gets cancelled...,,,hope justins concert thursday gets cancelled b...,hope justins concert thursday gets cancelled b...
25197,0,About as much interest in today as I do in the...,about as much interest in today as i do in the...,zero,,much interest today rugby world cup,much interest today rugby world cup zero
25198,0,@Becker_Boris It\u2019s almost 2 am in BKK but...,itu2019s almost 2 am in bkk but i canu2019t sl...,,Becker_Boris,itu2019s almost 2 bkk canu2019t sleep game isn...,itu2019s almost 2 bkk canu2019t sleep game isn...


In [17]:
tf_idf2 = TfidfVectorizer(ngram_range=(1,3))
X = tf_idf2.fit_transform(pdf['final'])
Y = pdf.sentiment
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.8, shuffle = True)

In [18]:
model_LR2 = LogisticRegression(max_iter=20000).fit(X_train,Y_train)
model_SVM2 = SVC(kernel='rbf', random_state = 42).fit(X_train, Y_train)
model_NB2 = ComplementNB().fit(X_train,Y_train)

# Score on Test Set

In [19]:
LR_score = classification_report(Y_test, model_LR2.predict(X_test))
SVC_score = classification_report(Y_test, model_SVM2.predict(X_test))
NB_score = classification_report(Y_test, model_NB2.predict(X_test))
print("LOGISTIC REGRESSION")
print(LR_score)
print("SUPPORT VECTOR CLASSIFICATION")
print(SVC_score)
print("NAIVE BAYES CLASSIFICATION")
print(NB_score)

LOGISTIC REGRESSION
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      1717
           1       0.66      0.68      0.67      1622
           2       0.73      0.68      0.70      1701

    accuracy                           0.76      5040
   macro avg       0.76      0.76      0.76      5040
weighted avg       0.76      0.76      0.76      5040

SUPPORT VECTOR CLASSIFICATION
              precision    recall  f1-score   support

           0       0.99      0.90      0.94      1717
           1       0.64      0.77      0.70      1622
           2       0.73      0.66      0.69      1701

    accuracy                           0.78      5040
   macro avg       0.79      0.78      0.78      5040
weighted avg       0.79      0.78      0.78      5040

NAIVE BAYES CLASSIFICATION
              precision    recall  f1-score   support

           0       0.69      0.97      0.81      1717
           1       0.71      0.42      0.53      162