In [18]:
import sqlalchemy
import configparser
import pandas as pd
import numpy as np
import re
import nltk

In [19]:
config = configparser.ConfigParser()
config_file = '../config.ini'
config.read(config_file)
default = config['DEFAULT-SQLALCHEMY']
engine = sqlalchemy.create_engine('mysql+mysqlconnector://{0}:{1}@{2}/{3}'.
                                            format(default['DB_USER'], default['DB_PASSWORD'], 
                                                    default['DB_IP'], default['DB_DATABASE']))

In [20]:
labelled_df = pd.read_csv('clean_d_tweets.csv')
df = pd.DataFrame(labelled_df['tweet'])
df['tweet'] = df['tweet'].astype(str)
df['tweet_string'] = df['tweet']
df

Unnamed: 0,tweet,tweet_string
0,the real reason why you be sad you be attach t...,the real reason why you be sad you be attach t...
1,my biggest problem be overthinking everything,my biggest problem be overthinking everything
2,the worst sadness be the sadness you have teac...,the worst sadness be the sadness you have teac...
3,i cannot make you understand i cannot make any...,i cannot make you understand i cannot make any...
4,i do not think anyone really understand how ti...,i do not think anyone really understand how ti...
...,...,...
3077,cough sneeze be tho worst,cough sneeze be tho worst
3078,i can be your sad whore ahaha,i can be your sad whore ahaha
3079,bro that feel you get after you sneeze,bro that feel you get after you sneeze
3080,long piss be the best,long piss be the best


#### Tokenization

In [21]:
from nltk.tokenize import word_tokenize

df['tweet'] = df['tweet'].apply(lambda x: word_tokenize(x))

df.head()

Unnamed: 0,tweet,tweet_string
0,"[the, real, reason, why, you, be, sad, you, be...",the real reason why you be sad you be attach t...
1,"[my, biggest, problem, be, overthinking, every...",my biggest problem be overthinking everything
2,"[the, worst, sadness, be, the, sadness, you, h...",the worst sadness be the sadness you have teac...
3,"[i, can, not, make, you, understand, i, can, n...",i cannot make you understand i cannot make any...
4,"[i, do, not, think, anyone, really, understand...",i do not think anyone really understand how ti...


#### Stemming

In [22]:
from nltk.stem import PorterStemmer

porterstemmer = PorterStemmer()
df['port_tweet'] = df['tweet'].apply(lambda x: [porterstemmer.stem(word) for word in x])

df.head()

Unnamed: 0,tweet,tweet_string,port_tweet
0,"[the, real, reason, why, you, be, sad, you, be...",the real reason why you be sad you be attach t...,"[the, real, reason, whi, you, be, sad, you, be..."
1,"[my, biggest, problem, be, overthinking, every...",my biggest problem be overthinking everything,"[my, biggest, problem, be, overthink, everyth]"
2,"[the, worst, sadness, be, the, sadness, you, h...",the worst sadness be the sadness you have teac...,"[the, worst, sad, be, the, sad, you, have, tea..."
3,"[i, can, not, make, you, understand, i, can, n...",i cannot make you understand i cannot make any...,"[i, can, not, make, you, understand, i, can, n..."
4,"[i, do, not, think, anyone, really, understand...",i do not think anyone really understand how ti...,"[i, do, not, think, anyon, realli, understand,..."


#### Removal of Stop words

In [23]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

def remove_stopwords(text):
    return [word for word in text if word not in stop_words]

df['port_tweet'] = df['port_tweet'].apply(remove_stopwords)

df.head()

Unnamed: 0,tweet,tweet_string,port_tweet
0,"[the, real, reason, why, you, be, sad, you, be...",the real reason why you be sad you be attach t...,"[real, reason, whi, sad, attach, peopl, distan..."
1,"[my, biggest, problem, be, overthinking, every...",my biggest problem be overthinking everything,"[biggest, problem, overthink, everyth]"
2,"[the, worst, sadness, be, the, sadness, you, h...",the worst sadness be the sadness you have teac...,"[worst, sad, sad, teach, hide]"
3,"[i, can, not, make, you, understand, i, can, n...",i cannot make you understand i cannot make any...,"[make, understand, make, anyon, understand, ha..."
4,"[i, do, not, think, anyone, really, understand...",i do not think anyone really understand how ti...,"[think, anyon, realli, understand, tire, act, ..."


#### VADER

In [24]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sentiment_analyzer = SentimentIntensityAnalyzer()
df['score'] = df['port_tweet'].apply(lambda x: sentiment_analyzer.polarity_scores(x)['compound'])

df['score'] = np.where(df['score'] < 0, 'Negative', np.where(df['score'] == 0, 'Neutral', 'Positive'))
df['score'].value_counts()

score
Neutral     3017
Negative      38
Positive      27
Name: count, dtype: int64

#### SentiWordNet

In [25]:
def make_sentences(data,name):
    data[name]=data[name].apply(lambda x:' '.join([i+' ' for i in x]))
    data[name]=data[name].apply(lambda x:re.sub(r'\s+', ' ', x, flags=re.I))

In [26]:
make_sentences(df,'port_tweet')

In [27]:
pos=neg=obj=count=0

postagging = []

for review in df['port_tweet']:
    list = word_tokenize(review)
    postagging.append(nltk.pos_tag(list))

df['pos_tags'] = postagging

df.head()

Unnamed: 0,tweet,tweet_string,port_tweet,score,pos_tags
0,"[the, real, reason, why, you, be, sad, you, be...",the real reason why you be sad you be attach t...,real reason whi sad attach peopl distant pay a...,Neutral,"[(real, JJ), (reason, NN), (whi, NN), (sad, JJ..."
1,"[my, biggest, problem, be, overthinking, every...",my biggest problem be overthinking everything,biggest problem overthink everyth,Neutral,"[(biggest, JJS), (problem, NN), (overthink, NN..."
2,"[the, worst, sadness, be, the, sadness, you, h...",the worst sadness be the sadness you have teac...,worst sad sad teach hide,Neutral,"[(worst, RB), (sad, JJ), (sad, JJ), (teach, NN..."
3,"[i, can, not, make, you, understand, i, can, n...",i cannot make you understand i cannot make any...,make understand make anyon understand happen i...,Neutral,"[(make, NN), (understand, NN), (make, VB), (an..."
4,"[i, do, not, think, anyone, really, understand...",i do not think anyone really understand how ti...,think anyon realli understand tire act okay al...,Neutral,"[(think, VB), (anyon, JJ), (realli, NNS), (und..."


In [28]:
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def penn_to_wn(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None

def get_sentiment(word,tag):
    wn_tag = penn_to_wn(tag)
    
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
        return []

    lemma = lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        return []
    
    synsets = wn.synsets(word, pos=wn_tag)
    if not synsets:
        return []

    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())

    return [synset.name(), swn_synset.pos_score(),swn_synset.neg_score(),swn_synset.obj_score()]

    pos=neg=obj=count=0

In [45]:
senti_score = []

for pos_val in df['pos_tags']:
    senti_val = [get_sentiment(x,y) for (x,y) in pos_val]
    for score in senti_val:
        try:
            pos = pos + score[1]  
            neg = neg + score[2]  
        except:
            continue
    senti_score.append(round((pos - neg),2))
    pos=neg=0    
    
df['senti_score'] = senti_score
print(df['senti_score'])

df.head()

0      -0.12
1      -0.50
2      -2.12
3       0.62
4       0.50
        ... 
3077   -0.50
3078   -0.88
3079    0.00
3080    0.75
3081   -0.75
Name: senti_score, Length: 3082, dtype: float64


Unnamed: 0,tweet,tweet_string,port_tweet,score,pos_tags,senti_score
0,"[the, real, reason, why, you, be, sad, you, be...",the real reason why you be sad you be attach t...,real reason whi sad attach peopl distant pay a...,Neutral,"[(real, JJ), (reason, NN), (whi, NN), (sad, JJ...",-0.12
1,"[my, biggest, problem, be, overthinking, every...",my biggest problem be overthinking everything,biggest problem overthink everyth,Neutral,"[(biggest, JJS), (problem, NN), (overthink, NN...",-0.5
2,"[the, worst, sadness, be, the, sadness, you, h...",the worst sadness be the sadness you have teac...,worst sad sad teach hide,Neutral,"[(worst, RB), (sad, JJ), (sad, JJ), (teach, NN...",-2.12
3,"[i, can, not, make, you, understand, i, can, n...",i cannot make you understand i cannot make any...,make understand make anyon understand happen i...,Neutral,"[(make, NN), (understand, NN), (make, VB), (an...",0.62
4,"[i, do, not, think, anyone, really, understand...",i do not think anyone really understand how ti...,think anyon realli understand tire act okay al...,Neutral,"[(think, VB), (anyon, JJ), (realli, NNS), (und...",0.5


In [46]:
df['senti_score'] = df['senti_score'].replace(0.00, 0)
df['senti_score'] = np.where(df['senti_score'] < 0, 'Negative', np.where(df['senti_score'] == 0, 'Neutral', 'Positive'))
df['senti_score'].value_counts()

senti_score
Neutral     1268
Negative    1036
Positive     778
Name: count, dtype: int64

#### TextBlob

In [49]:
from textblob import TextBlob

def sentiment_analysis(df):
    def getSubjectivity(text):
        return TextBlob(text).sentiment.subjectivity
    
    def getPolarity(text):
        return TextBlob(text).sentiment.polarity

    df['TextBlob_Subjectivity'] = df['tweet_string'].apply(getSubjectivity)
    df['TextBlob_Polarity'] = df['tweet_string'].apply(getPolarity)

    def getAnalysis(score):
        if score < 0:
            return 'Negative'
        elif score == 0:
            return 'Neutral'
        else:
            return 'Positive'
        
    df['TextBlob_Analysis'] = df['TextBlob_Polarity'].apply(getAnalysis )

    return df

sentiment_analysis(df)

Unnamed: 0,tweet,tweet_string,port_tweet,score,pos_tags,senti_score,TextBlob_Subjectivity,TextBlob_Polarity,TextBlob_Analysis
0,"[the, real, reason, why, you, be, sad, you, be...",the real reason why you be sad you be attach t...,real reason whi sad attach peopl distant pay a...,Neutral,"[(real, JJ), (reason, NN), (whi, NN), (sad, JJ...",Negative,0.403333,-0.093333,Negative
1,"[my, biggest, problem, be, overthinking, every...",my biggest problem be overthinking everything,biggest problem overthink everyth,Neutral,"[(biggest, JJS), (problem, NN), (overthink, NN...",Negative,0.000000,0.000000,Neutral
2,"[the, worst, sadness, be, the, sadness, you, h...",the worst sadness be the sadness you have teac...,worst sad sad teach hide,Neutral,"[(worst, RB), (sad, JJ), (sad, JJ), (teach, NN...",Negative,1.000000,-1.000000,Negative
3,"[i, can, not, make, you, understand, i, can, n...",i cannot make you understand i cannot make any...,make understand make anyon understand happen i...,Neutral,"[(make, NN), (understand, NN), (make, VB), (an...",Positive,0.000000,0.000000,Neutral
4,"[i, do, not, think, anyone, really, understand...",i do not think anyone really understand how ti...,think anyon realli understand tire act okay al...,Neutral,"[(think, VB), (anyon, JJ), (realli, NNS), (und...",Positive,0.477778,0.377778,Positive
...,...,...,...,...,...,...,...,...,...
3077,"[cough, sneeze, be, tho, worst]",cough sneeze be tho worst,cough sneez tho worst,Neutral,"[(cough, NN), (sneez, NN), (tho, NN), (worst, ...",Negative,1.000000,-1.000000,Negative
3078,"[i, can, be, your, sad, whore, ahaha]",i can be your sad whore ahaha,sad whore ahaha,Neutral,"[(sad, JJ), (whore, NN), (ahaha, NN)]",Negative,1.000000,-0.500000,Negative
3079,"[bro, that, feel, you, get, after, you, sneeze]",bro that feel you get after you sneeze,bro feel get sneez,Neutral,"[(bro, NN), (feel, VB), (get, NN), (sneez, NN)]",Neutral,0.000000,0.000000,Neutral
3080,"[long, piss, be, the, best]",long piss be the best,long piss best,Neutral,"[(long, RB), (piss, JJ), (best, JJS)]",Positive,0.350000,0.475000,Positive


In [50]:
df['TextBlob_Analysis'].value_counts()

TextBlob_Analysis
Neutral     1249
Positive     979
Negative     854
Name: count, dtype: int64