# Sentiment Analysis using VADER

In [1]:
#importing necessary libraries
import numpy as np
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re
from zipfile import ZipFile
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score


In [2]:
# Kaggle API setup commands to download the dataset
!mkdir -p ~/content/kaggle.json
!cp kaggle.json ~/content/kaggle.json
!chmod 600 ~/content/kaggle.json

cp: cannot stat 'kaggle.json': No such file or directory


In [3]:
# Download the Sentiment140 dataset from Kaggle
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 79% 64.0M/80.9M [00:00<00:00, 106MB/s] 
100% 80.9M/80.9M [00:00<00:00, 99.4MB/s]


In [4]:
# Extract the downloaded zip file
dataset='/content/sentiment140.zip'
with ZipFile(dataset,'r') as zip:
  zip.extractall()

In [5]:
#loading dataset
df=pd.read_csv('/content/training.1600000.processed.noemoticon.csv',encoding= 'ISO-8859-1')
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [6]:
df.shape

(1599999, 6)

In [7]:
#removing unnecessary columns
col_name=['target','id','date','flag','user','text']
df=pd.read_csv('/content/training.1600000.processed.noemoticon.csv',names=col_name,encoding= 'ISO-8859-1')
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [8]:
df.tail()

Unnamed: 0,target,id,date,flag,user,text
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [9]:
#Sample 100,000 rows from the dataframe for analysis
sampled_df = df.sample(n=500000, random_state=42)

In [10]:
sampled_df.shape

(500000, 6)

In [11]:
sampled_df['target'].value_counts()

target
4    250625
0    249375
Name: count, dtype: int64

In [12]:
sampled_df.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [13]:
# Replace the '4' values in 'target' column with '1' to unify positive labels
sampled_df.replace({'target':{4:1}},inplace=True)
sampled_df.tail()

Unnamed: 0,target,id,date,flag,user,text
1085389,1,1969163759,Fri May 29 23:12:12 PDT 2009,NO_QUERY,Izaaza,is going out with Katuty. some pampering to do~
753135,0,2286778000,Mon Jun 22 16:54:48 PDT 2009,NO_QUERY,sega_123,jeanette is sick. and taking a nap. but her b...
466554,0,2175587755,Mon Jun 15 01:24:51 PDT 2009,NO_QUERY,din_heima,wanna make egg custard with coconut milk but t...
1471328,1,2065195932,Sun Jun 07 08:05:55 PDT 2009,NO_QUERY,AliciaSanera,@employerbrander Shh... I read the paper on Su...
151092,0,1932427122,Tue May 26 21:26:46 PDT 2009,NO_QUERY,picklepie1212,@Unknown_Heather oh wow.. that's horrible I w...


In [14]:
sampled_df['target'].value_counts()

target
1    250625
0    249375
Name: count, dtype: int64

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [16]:
sampled_df["text"].head()

541200               @chrishasboobs AHHH I HOPE YOUR OK!!! 
750       @misstoriblack cool , i have no tweet apps  fo...
766711    @TiannaChaos i know  just family drama. its la...
285055    School email won't open  and I have geography ...
705995                               upper airways problem 
Name: text, dtype: object

In [17]:
#Creating an object of porter stemmer class
ps=PorterStemmer()
#Function for performing stemming
def stem(text):
    stop_words = set(stopwords.words('english'))
    text=re.sub('[^a-zA-Z]',' ',text)
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    stemmed_words = [ps.stem(word) for word in filtered_words]
    return " ".join(stemmed_words)

In [18]:
sampled_df["processed_text"]=sampled_df["text"].apply(stem)

In [19]:
sampled_df.head()

Unnamed: 0,target,id,date,flag,user,text,processed_text
541200,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,@chrishasboobs AHHH I HOPE YOUR OK!!!,chrishasboob ahhh hope ok
750,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,"@misstoriblack cool , i have no tweet apps fo...",misstoriblack cool tweet app razr
766711,0,2300048954,Tue Jun 23 13:40:11 PDT 2009,NO_QUERY,sammydearr,@TiannaChaos i know just family drama. its la...,tiannachao know famili drama lame hey next tim...
285055,0,1993474027,Mon Jun 01 10:26:07 PDT 2009,NO_QUERY,Lamb_Leanne,School email won't open and I have geography ...,school email open geographi stuff revis stupid...
705995,0,2256550904,Sat Jun 20 12:56:51 PDT 2009,NO_QUERY,yogicerdito,upper airways problem,upper airway problem


In [20]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [21]:
def pos_tagging(text):
    words = nltk.word_tokenize(text)
    return nltk.pos_tag(words)

def chunking(pos_tags):
    grammar = "NP: {<DT>?<JJ>*<NN>}"
    cp = nltk.RegexpParser(grammar)
    tree = cp.parse(pos_tags)
    return tree

In [22]:
# Apply POS tagging and chunking to the 'text' column
sampled_df['pos_tags'] = sampled_df['text'].apply(pos_tagging)
sampled_df['chunks'] = sampled_df['pos_tags'].apply(chunking)

# Display a sample of the processed dataframe with EDA columns
print(sampled_df[['text', 'processed_text', 'pos_tags', 'chunks']].head())

                                                     text  \
541200             @chrishasboobs AHHH I HOPE YOUR OK!!!    
750     @misstoriblack cool , i have no tweet apps  fo...   
766711  @TiannaChaos i know  just family drama. its la...   
285055  School email won't open  and I have geography ...   
705995                             upper airways problem    

                                           processed_text  \
541200                          chrishasboob ahhh hope ok   
750                     misstoriblack cool tweet app razr   
766711  tiannachao know famili drama lame hey next tim...   
285055  school email open geographi stuff revis stupid...   
705995                               upper airway problem   

                                                 pos_tags  \
541200  [(@, NN), (chrishasboobs, NN), (AHHH, NNP), (I...   
750     [(@, JJ), (misstoriblack, NN), (cool, NN), (,,...   
766711  [(@, JJ), (TiannaChaos, NNP), (i, NN), (know, ...   
285055  [(School, NNP)

In [23]:
# Create an instance of SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Function to predict sentiment using VADER
def vader_sentiment(text):
    score = sia.polarity_scores(text)
    # Classify as positive if compound score is >= 0, otherwise negative
    return 1 if score['compound'] >= 0 else 0

In [24]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(sampled_df['text'], sampled_df['target'], test_size=0.2, random_state=42)


In [25]:
# Apply VADER to the training set
train_predictions = X_train.apply(vader_sentiment)
train_accuracy = accuracy_score(y_train, train_predictions)
print("VADER Training Accuracy:", train_accuracy)

# Apply VADER to the testing set
test_predictions = X_test.apply(vader_sentiment)
test_accuracy = accuracy_score(y_test, test_predictions)
print("VADER Testing Accuracy:", test_accuracy)

VADER Training Accuracy: 0.661705
VADER Testing Accuracy: 0.66297


In [26]:
print(vader_sentiment("it is not great"))

0
