# Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Data Preprocessing

In [3]:
twitter_df = pd.read_csv('/content/twitter_data.csv')

In [4]:
twitter_df.shape

(3318, 6)

In [5]:
twitter_df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3318 entries, 0 to 3317
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  3318 non-null   int64 
 1   ids     3318 non-null   int64 
 2   date    3318 non-null   object
 3   flag    3318 non-null   object
 4   user    3318 non-null   object
 5   text    3318 non-null   object
dtypes: int64(2), object(4)
memory usage: 155.7+ KB


In [7]:
twitter_df.describe()

Unnamed: 0,target,ids
count,3318.0,3318.0
mean,0.0,1468200000.0
std,0.0,215975.5
min,0.0,1467810000.0
25%,0.0,1468015000.0
50%,0.0,1468206000.0
75%,0.0,1468387000.0
max,0.0,1468567000.0


In [8]:
twitter_df.isnull().sum()

target    0
ids       0
date      0
flag      0
user      0
text      0
dtype: int64

In [9]:
x = twitter_df['text'].values

In [10]:
x

array(["@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D",
       "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!",
       '@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds',
       ...,
       '@mauricedb I tried the Sitecom one to, but felt to toy-like ',
       'I will never like getting up at 5 am  dumb work schedule.',
       'need cuddling now '], dtype=object)

# Stemming

In [11]:
port_stem = PorterStemmer()

In [12]:
def stemming(content):
  stem_content = re.sub('[^a-zA-Z]',' ', str(content))
  stem_content = stem_content.lower()
  stem_content = stem_content.split()
  stem_content = [port_stem.stem(word) for word in stem_content if not word in stopwords.words('english')]
  stem_content = ' '.join(stem_content)
  return stem_content

In [13]:
twitter_df['text'] = twitter_df['text'].apply(stemming)

In [14]:
x = twitter_df['text']

In [15]:
x

0       switchfoot http twitpic com zl awww bummer sho...
1       upset updat facebook text might cri result sch...
2       kenichan dive mani time ball manag save rest g...
3                         whole bodi feel itchi like fire
4                           nationwideclass behav mad see
                              ...                        
3313    amber know often check realli miss right like ...
3314              ate mani oreo yesterday feel sick today
3315              mauricedb tri sitecom one felt toy like
3316                     never like get dumb work schedul
3317                                           need cuddl
Name: text, Length: 3318, dtype: object

# Sentiment Analysis

In [20]:
sentiment = SentimentIntensityAnalyzer()

In [27]:
def determine(text):
  a = sentiment.polarity_scores(text)
  key_max = max(a, key = a.get)
  return key_max

In [28]:
twitter_df['Result'] = twitter_df['text'].apply(determine)

In [29]:
twitter_df['Result']

0            neu
1            neu
2            neu
3            neu
4            neg
          ...   
3313    compound
3314         neu
3315         neu
3316         neu
3317         neu
Name: Result, Length: 3318, dtype: object

In [32]:
twitter_df['Result'].value_counts()

neu         2613
neg          324
compound     271
pos          110
Name: Result, dtype: int64

In [None]:
# Hence we Analysing the Twitter sentiment 
# Here
# neu -> netural text
# neg -> negative text
# pos -> positive text
# compound -> either positve and negative text 