## Sentiment Analysis


- 1 VADER (Valence Aware Dictionary and Sentiment Reasoner)- Bag of Words approach
- 2 Roberta Pretrained Model From Hugging Face
- 3 HuggingFace Pipelinem

### Import the Packages

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vaishnavi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Read the data 

In [5]:
DATASET_COLUMNS=['target','ids','date','flag','User','text']
DATASET_ENCODING = "ISO-8859-1"

data = pd.read_csv(r"C:\Users\Vaishnavi\Desktop\Codec Technologies Internship\Sentiment_analysis_Tweepy\training.1600000.processed.noemoticon.csv", names=DATASET_COLUMNS, header = None, encoding=DATASET_ENCODING)

data.head()

Unnamed: 0,target,ids,date,flag,User,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
data.tail()

Unnamed: 0,target,ids,date,flag,User,text
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [7]:
data.columns

Index(['target', 'ids', 'date', 'flag', 'User', 'text'], dtype='object')

In [8]:
data.shape

(1600000, 6)

In [9]:
data.isnull().sum()

target    0
ids       0
date      0
flag      0
User      0
text      0
dtype: int64

In [10]:
data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [11]:
data.replace({'target':{4:1}}, inplace = True)

In [12]:
data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

**Stemming** is the process of reducing  a word to its Root word

eg: actor,actress,acting = act

In [13]:
port_stem = PorterStemmer()

In [14]:
import re

def stemming(content):
    stemmed_content = re.sub('[^a-z]',' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [15]:
data['stemmed_content']= data['text'].apply(stemming)  # 50 min to run

In [33]:
data.head()

Unnamed: 0,target,ids,date,flag,User,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl www bummer ou s...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat acebook text might cri result choo...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,enichan dive mani time ball anag save rest go ...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [34]:
print(data['stemmed_content'])

0          switchfoot http twitpic com zl www bummer ou s...
1          upset updat acebook text might cri result choo...
2          enichan dive mani time ball anag save rest go ...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                   ust woke ave school best feel ever
1599996    com eri cool hear old alt interview http blip ...
1599997                                readi akeov sk detail
1599998        appi th irthday boo alll time upac maru hakur
1599999              happi charitytuesday park hariti peak p
Name: stemmed_content, Length: 1600000, dtype: object


In [35]:
print(data['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


In [48]:
# separating the data and label

X = data['stemmed_content'].values
Y = data['target'].values

In [49]:
print(X)

['switchfoot http twitpic com zl www bummer ou shoulda got avid arr hird ay'
 'upset updat acebook text might cri result chool today also lah'
 'enichan dive mani time ball anag save rest go bound' ...
 'readi akeov sk detail' 'appi th irthday boo alll time upac maru hakur'
 'happi charitytuesday park hariti peak p']


In [38]:
print(Y)

[0 0 0 ... 1 1 1]


Splitting the data to training data and test data

In [50]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 
                                                    stratify = Y,
                                                    random_state =2, 
                                                    test_size = 0.2)


In [51]:
print(X.shape, X_train.shape, X_test.shape)

(1600000,) (1280000,) (320000,)


In [52]:
print(Y.shape, Y_train.shape, Y_test.shape)

(1600000,) (1280000,) (320000,)


In [53]:
print(X_train)

['watch saw iv drink lil wine' 'ater agazin'
 'even though favourit drink think vodka coke wipe mind time think im gonna find new drink'
 ... 'eager onday afternoon'
 'ope everyon mother great day wait hear guy store tomorrow'
 'love wake olger oo bad voic deeper']


In [54]:
print(X_test)

['mmangen fine much time chat witter ubbi back summer amp tend domin free time'
 'may show w ruth kim amp geoffrey sanhueza'
 'hatara mayb bay area thang dammit' ...
 'estini evertheless ooray member wonder safe trip' 'ot feel well'
 'supersandro thank']


In [55]:
## Converting the text data to numerical data

vector = TfidfVectorizer()
X_train = vector.fit_transform(X_train)
X_test = vector.transform(X_test)

In [56]:
print(X_train)

  (0, 383704)	0.279414205738679
  (0, 315485)	0.3608574157796715
  (0, 172467)	0.5275164637725119
  (0, 95080)	0.37648839229025866
  (0, 209161)	0.4209672603058213
  (0, 388899)	0.4409978047320551
  (1, 30855)	0.6480214968810323
  (1, 6659)	0.7616220450985293
  (2, 95080)	0.4592564321165316
  (2, 115389)	0.18903771482662873
  (2, 355021)	0.1850782440837588
  (2, 119867)	0.28756711118712547
  (2, 354311)	0.3203489929141377
  (2, 381203)	0.33099443042592025
  (2, 71573)	0.3177696896627875
  (2, 389184)	0.33193616195136694
  (2, 233229)	0.23990301778446163
  (2, 356431)	0.15228355976619753
  (2, 163552)	0.16466667077394484
  (2, 135654)	0.18906520250836514
  (2, 122070)	0.2009992970243669
  (2, 249370)	0.17045312952499175
  (3, 354311)	0.2904555821806978
  (3, 141998)	0.44332511477340214
  (3, 136416)	0.2824268913408121
  :	:
  (1279996, 172554)	0.2784976012161532
  (1279996, 381539)	0.2953264080016052
  (1279996, 342270)	0.23715205746526544
  (1279996, 168926)	0.3200135356270418
  (12799

In [57]:
print(X_test)

  (0, 16769)	0.16505689921299113
  (0, 36972)	0.16014635597220114
  (0, 63916)	0.25999468282673965
  (0, 92688)	0.361559251425862
  (0, 122109)	0.2477056499118546
  (0, 126386)	0.23382090765737787
  (0, 236167)	0.4366506386919956
  (0, 241899)	0.1730307620333418
  (0, 340942)	0.21684179631380954
  (0, 349120)	0.3371885395884792
  (0, 356431)	0.30837355939663935
  (0, 367471)	0.33208826164979577
  (0, 389656)	0.21461585460147892
  (1, 16769)	0.2133769448185924
  (1, 131739)	0.6238827447720454
  (1, 195339)	0.4469375332332968
  (1, 225925)	0.3034414193510518
  (1, 310785)	0.4576962672663301
  (1, 324168)	0.252833687501498
  (2, 25094)	0.3480648539965678
  (2, 39385)	0.39041587376721537
  (2, 80549)	0.3781244773759328
  (2, 143915)	0.576095988590276
  (2, 225966)	0.26620135152336305
  (2, 350498)	0.4250863852317962
  :	:
  (319994, 354311)	0.1936754945342198
  (319994, 355021)	0.22378793904967575
  (319994, 384056)	0.22314373581721345
  (319994, 389545)	0.2759927359268016
  (319995, 95140

In [58]:
## Trainin the machine learning model
# Logistic Regression

model = LogisticRegression(max_iter = 1000)
model.fit(X_train, Y_train)


## Model Evaluation

In [59]:
# Accuracy Score on training data
X_train_pred = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_pred)
print("Accuracy score on training data:", training_data_accuracy)

Accuracy score on training data: 0.79416953125


In [60]:
# Accuracy score on test data
X_test_pred = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_pred)
print("Accuracy score on test data:", test_data_accuracy)

Accuracy score on test data: 0.767846875


**Model Accuracy : 76.8%**

## Saving the train model

In [72]:
import pickle

# Assuming 'model' is your trained classifier
with open('Sentiment_Analysis.pkl', 'wb') as file:
    pickle.dump(model, file)


In [74]:
with open('Sentiment_Analysis_Prediction1.pkl', 'wb') as file:
    pickle.dump((vector, model), file)

### Using the saved model for future predictions

In [64]:
# Loading the saved model

loaded_model = pickle.load(open(r"C:\Users\Vaishnavi\Desktop\Codec Technologies Internship\Sentiment_analysis_Tweepy\sentiment_analysis_model.sav",'rb'))

In [66]:
X_new = X_test[200]
print(Y_test[200])
prediction = model.predict(X_new)
print(prediction)

if (prediction[0] == 0):
    print('Negative Tweet')
else:
    print('Positive Tweet')

1
[1]
Positive Tweet


In [67]:
X_new = X_test[3]
print(Y_test[3])
prediction = model.predict(X_new)
print(prediction)

if (prediction[0] == 0):
    print('Negative Tweet')
else:
    print('Positive Tweet')

0
[0]
Negative Tweet
