# Twitter Sentiment Analysis using NLP

References for dataset: https://twitter-sentiment-csv.herokuapp.com/

In [1]:
import numpy as np
import pandas as pd
import nltk
import re

In [2]:
# Read the tweet sentiment CSV (path is relative to the notebook's working directory)
# and preview the first 10 rows.
dataset = pd.read_csv('sentiment_dataset.csv')
dataset.head(10)

Unnamed: 0,sentiment,text,user
0,positive,"RT @robintheredwon1: Hello! I'm Robin, I'm a s...",thorkylo
1,positive,RT @ImReallyMelody: She was in so many of my f...,chopppak
2,positive,RT @kalyanram94: Win or Lose..\nPolitics or Mo...,SubrahmanyamAi1
3,positive,RT @ImReallyMelody: She was in so many of my f...,Sickassemm
4,neutral,@BounPrem_TH @LongKhongSeries @premspacefamily...,Thanapo76074565
5,neutral,RT @DallasL60050018: @GregCowboys @Montejp231 ...,GregCowboys
6,neutral,"RT @TeamPspkBlr: Leave about Politics, in his ...",Ganesh48373433
7,neutral,RT @TeamPowerSena: The Video glimpse that repl...,nagasai_royal
8,positive,"RT @SerebiiOTD: On this day in 2018, 2 years a...",ShinyHuntMystic
9,positive,RT @PawanKalyanFan: Share your First Day First...,AjithPawanFC


In [3]:
dataset.shape

(100, 3)

In [4]:
from nltk.corpus import stopwords

# Removing the Twitter handles (@user)

In [5]:
# Strip Twitter handles ("@" followed by any run of word characters) from every tweet.
# One vectorized .str.replace assigned to the whole column replaces the per-row chained
# assignment (dataset['text'][i] = ...), which pandas flags with SettingWithCopyWarning
# and which is not guaranteed to write back into the frame.
# The pattern is a raw string so "\w" reaches the regex engine without relying on an
# invalid string escape.
dataset['text'] = dataset['text'].str.replace(r'@\w*', '', regex=True)

dataset.head(10)

Unnamed: 0,sentiment,text,user
0,positive,"RT : Hello! I'm Robin, I'm a smaller artist [m...",thorkylo
1,positive,RT : She was in so many of my favorite movies ...,chopppak
2,positive,RT : Win or Lose..\nPolitics or Movies..\nHit ...,SubrahmanyamAi1
3,positive,RT : She was in so many of my favorite movies ...,Sickassemm
4,neutral,ðŸ“ŒðŸ“ŒðŸ“ŒðŸ“Œ,Thanapo76074565
5,neutral,RT : I rememberâ€¦,GregCowboys
6,neutral,"RT : Leave about Politics, in his movies also ...",Ganesh48373433
7,neutral,RT : The Video glimpse that replicates Team Po...,nagasai_royal
8,positive,"RT : On this day in 2018, 2 years ago, the 21s...",ShinyHuntMystic
9,positive,RT : Share your First Day First movie experien...,AjithPawanFC


In [6]:
# Drop the retweet marker "RT" from every tweet.
# The word boundaries (\b) restrict the match to the standalone token, so the letters
# "RT" inside a longer uppercase word (e.g. "START") are no longer deleted, as they
# were with the bare 'RT' pattern.
# Assigning the whole column at once also avoids the per-row chained assignment
# (dataset['text'][i] = ...) that pandas flags with SettingWithCopyWarning.
dataset['text'] = dataset['text'].str.replace(r'\bRT\b', '', regex=True)

dataset.head(10)

Unnamed: 0,sentiment,text,user
0,positive,": Hello! I'm Robin, I'm a smaller artist [mos...",thorkylo
1,positive,: She was in so many of my favorite movies gr...,chopppak
2,positive,: Win or Lose..\nPolitics or Movies..\nHit or...,SubrahmanyamAi1
3,positive,: She was in so many of my favorite movies gr...,Sickassemm
4,neutral,ðŸ“ŒðŸ“ŒðŸ“ŒðŸ“Œ,Thanapo76074565
5,neutral,: I rememberâ€¦,GregCowboys
6,neutral,": Leave about Politics, in his movies also he...",Ganesh48373433
7,neutral,: The Video glimpse that replicates Team Powe...,nagasai_royal
8,positive,": On this day in 2018, 2 years ago, the 21st ...",ShinyHuntMystic
9,positive,: Share your First Day First movie experience...,AjithPawanFC


# Removing punctuation, numbers and special characters

In [7]:
# Keep only ASCII letters and spaces in every tweet; everything else (punctuation,
# digits, emoji, other symbols) is deleted.
# Assigning the whole column at once avoids the per-row chained assignment
# (dataset['text'][i] = ...) that pandas flags with SettingWithCopyWarning and that
# is not guaranteed to write back into the frame.
# NOTE(review): characters are deleted rather than replaced by a space, so words that
# were separated only by punctuation end up glued together — confirm this is intended.
dataset['text'] = dataset['text'].str.replace('[^a-zA-Z ]', '', regex=True)

dataset.head(10)

Unnamed: 0,sentiment,text,user
0,positive,Hello Im Robin Im a smaller artist mostly ho...,thorkylo
1,positive,She was in so many of my favorite movies gro...,chopppak
2,positive,Win or LosenPolitics or MoviesnHit or FlopnA...,SubrahmanyamAi1
3,positive,She was in so many of my favorite movies gro...,Sickassemm
4,neutral,,Thanapo76074565
5,neutral,I remember,GregCowboys
6,neutral,Leave about Politics in his movies also he i...,Ganesh48373433
7,neutral,The Video glimpse that replicates Team Power...,nagasai_royal
8,positive,On this day in years ago the st movie The ...,ShinyHuntMystic
9,positive,Share your First Day First movie experience ...,AjithPawanFC


# Lowercasing the text and removing the stopwords

In [8]:
# Build `textset`: one cleaned string per tweet (letters only, lowercased, English
# stopwords removed, words re-joined with single spaces).

# Load the stopword list once and keep it in a set: the original re-evaluated
# stopwords.words('english') for every single word, which is needlessly slow, and
# membership tests against a list are linear.
english_stopwords = set(stopwords.words('english'))

textset = []
for raw_text in dataset['text']:
    # Same letters-and-spaces filter as the previous cleaning step, applied to a local
    # string so this cell no longer writes into `dataset` through chained indexing.
    letters_only = re.sub('[^a-zA-Z ]', '', raw_text)
    tokens = letters_only.lower().split()
    textset.append(" ".join(tok for tok in tokens if tok not in english_stopwords))

In [9]:
textset[0]

'hello im robin im smaller artist mostly hobbyist like drawing animes movies genuin'

# Tokenization

In [10]:
# Tokenize in place: each cleaned document string becomes a list of its
# whitespace-separated words.
for position, document in enumerate(textset):
    textset[position] = document.split()

In [11]:
textset[0]

['hello',
 'im',
 'robin',
 'im',
 'smaller',
 'artist',
 'mostly',
 'hobbyist',
 'like',
 'drawing',
 'animes',
 'movies',
 'genuin']

# Stemming

In [12]:
from nltk import PorterStemmer

In [13]:
st = PorterStemmer()

In [14]:
# Replace every token of every document with its Porter stem, in place.
for doc_idx, tokens in enumerate(textset):
    textset[doc_idx] = list(map(st.stem, tokens))

# Recombining tokens

In [15]:
# Glue each document's stemmed tokens back into a single space-separated string,
# updating the existing list in place.
textset[:] = [" ".join(stems) for stems in textset]

In [16]:
textset[0]

'hello im robin im smaller artist mostli hobbyist like draw anim movi genuin'

# Using Tf-Idf Vectorizer

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
# Fit a TF-IDF vectorizer (default settings) on the cleaned tweets and encode them as
# the feature matrix X, one row per tweet.
# NOTE(review): the vectorizer is fitted on all tweets before the train/test split
# further down, so its vocabulary/IDF statistics also include the test rows.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(textset)

In [19]:
type(X)

scipy.sparse.csr.csr_matrix

In [20]:
# Convert the sparse TF-IDF matrix into a dense NumPy array.
# The guard makes this cell safe to re-run: once X is already an ndarray (which has no
# .toarray method) the conversion is skipped instead of raising AttributeError.
if hasattr(X, 'toarray'):
    X = X.toarray()

In [21]:
type(X)

numpy.ndarray

In [22]:
X.shape

(100, 366)

In [23]:
# Target vector: the `sentiment` column of the dataset, displayed for a quick check.
Y = dataset['sentiment']
Y

0     positive
1     positive
2     positive
3     positive
4      neutral
        ...   
95     neutral
96     neutral
97    positive
98     neutral
99    positive
Name: sentiment, Length: 100, dtype: object

# Splitting into train & test dataset

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the shuffle
# (and therefore the split) the same on every run.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=2)
X_train, X_test, Y_train, Y_test = split_parts

In [26]:
X_train.shape

(80, 366)

In [27]:
Y_train.shape

(80,)

# Applying Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression

# Create the logistic-regression classifier and fit it on the training split in one
# step (fit returns the estimator itself); the bare name on the last line displays it.
Log_Reg = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train, Y_train)
Log_Reg

LogisticRegression(random_state=0)

In [29]:
Y_pred=Log_Reg.predict(X_test)

In [30]:
# NOTE(review): wildcard import — the cells visible here only use accuracy_score and
# classification_report; consider importing those names explicitly once it is confirmed
# that no other cell relies on further names from sklearn.metrics.
from sklearn.metrics import *

In [31]:
accuracy_score(Y_test,Y_pred)

0.9

In [32]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
# zero_division=0 reports 0.0 for a class that is never predicted (precision undefined)
# without emitting the undefined-metric warning that otherwise clutters the output.
print(classification_report(Y_test, Y_pred, zero_division=0))

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         1
     neutral       1.00      0.80      0.89         5
    positive       0.88      1.00      0.93        14

    accuracy                           0.90        20
   macro avg       0.62      0.60      0.61        20
weighted avg       0.86      0.90      0.88        20



  _warn_prf(average, modifier, msg_start, len(result))


# Applying Gaussian Naive Bayes

In [33]:
from sklearn.naive_bayes import GaussianNB

# Create the Gaussian Naive Bayes classifier and fit it on the training split in one
# step (fit returns the estimator itself); the bare name on the last line displays it.
nv = GaussianNB().fit(X_train, Y_train)
nv

GaussianNB()

In [34]:
Y_pred=nv.predict(X_test)

In [35]:
accuracy_score(Y_test,Y_pred)

0.7

In [36]:
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

    negative       0.25      1.00      0.40         1
     neutral       0.67      0.80      0.73         5
    positive       0.90      0.64      0.75        14

    accuracy                           0.70        20
   macro avg       0.61      0.81      0.63        20
weighted avg       0.81      0.70      0.73        20



# Applying Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic (bootstrap sampling and random feature selection), so
# random_state is fixed to make the fitted model — and any score computed from it —
# reproducible across kernel restarts, matching the seeded models elsewhere in the notebook.
rf=RandomForestClassifier(random_state=0)
rf.fit(X_train,Y_train)

RandomForestClassifier()

In [38]:
Y_pred=rf.predict(X_test)

In [39]:
accuracy_score(Y_test,Y_pred)

0.8

In [40]:
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         1
     neutral       0.57      0.80      0.67         5
    positive       0.92      0.79      0.85        14

    accuracy                           0.80        20
   macro avg       0.83      0.86      0.84        20
weighted avg       0.83      0.80      0.81        20



# Applying XGBoost

In [41]:
from xgboost import XGBClassifier
# Gradient-boosted tree classifier with a fixed seed (random_state=22) and
# learning_rate=0.9, fitted on the training split.
# NOTE(review): Y_train holds string class labels; newer xgboost releases may require
# integer-encoded labels — confirm against the installed version.
xg = XGBClassifier(random_state=22,learning_rate=0.9)
xg.fit(X_train,Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.9, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=22, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [42]:
Y_pred=xg.predict(X_test)

In [43]:
accuracy_score(Y_test,Y_pred)

0.85

In [44]:
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         1
     neutral       0.67      0.80      0.73         5
    positive       0.92      0.86      0.89        14

    accuracy                           0.85        20
   macro avg       0.86      0.89      0.87        20
weighted avg       0.86      0.85      0.85        20



# Applying RBF kernel SVM

In [45]:
from sklearn.svm import SVC

# Create the RBF-kernel support vector classifier and fit it on the training split in
# one step (fit returns the estimator itself); the bare name on the last line displays it.
sv = SVC(kernel='rbf').fit(X_train, Y_train)
sv

SVC()

In [46]:
Y_pred=sv.predict(X_test)

In [47]:
accuracy_score(Y_test,Y_pred)

0.95

In [48]:
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         1
     neutral       1.00      0.80      0.89         5
    positive       0.93      1.00      0.97        14

    accuracy                           0.95        20
   macro avg       0.98      0.93      0.95        20
weighted avg       0.95      0.95      0.95        20

