In [27]:
import sqlalchemy
import configparser
import pandas as pd
import numpy as np
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt

In [28]:
config = configparser.ConfigParser()
config_file = '../config.ini'
config.read(config_file)
default = config['DEFAULT-SQLALCHEMY']
engine = sqlalchemy.create_engine('mysql+mysqlconnector://{0}:{1}@{2}/{3}'.
                                            format(default['DB_USER'], default['DB_PASSWORD'], 
                                                    default['DB_IP'], default['DB_DATABASE']))

In [29]:
labelled_df = pd.read_csv('clean_d_tweets.csv')
df = pd.DataFrame(labelled_df['tweet'])
df['tweet'] = df['tweet'].astype(str)
df.dtypes

tweet    object
dtype: object

#### Tokenization

In [30]:
from nltk.tokenize import word_tokenize

df['tweet'] = df['tweet'].apply(lambda x: word_tokenize(x))

df.head()

Unnamed: 0,tweet
0,"[the, real, reason, why, you, be, sad, you, be..."
1,"[my, biggest, problem, be, overthinking, every..."
2,"[the, worst, sadness, be, the, sadness, you, h..."
3,"[i, can, not, make, you, understand, i, can, n..."
4,"[i, do, not, think, anyone, really, understand..."


#### Stemming

In [31]:
from nltk.stem import PorterStemmer

porterstemmer = PorterStemmer()
df['port_tweet'] = df['tweet'].apply(lambda x: [porterstemmer.stem(word) for word in x])

df.head()

Unnamed: 0,tweet,port_tweet
0,"[the, real, reason, why, you, be, sad, you, be...","[the, real, reason, whi, you, be, sad, you, be..."
1,"[my, biggest, problem, be, overthinking, every...","[my, biggest, problem, be, overthink, everyth]"
2,"[the, worst, sadness, be, the, sadness, you, h...","[the, worst, sad, be, the, sad, you, have, tea..."
3,"[i, can, not, make, you, understand, i, can, n...","[i, can, not, make, you, understand, i, can, n..."
4,"[i, do, not, think, anyone, really, understand...","[i, do, not, think, anyon, realli, understand,..."


#### Removal of Stop words

In [32]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

def remove_stopwords(text):
    return [word for word in text if word not in stop_words]

df['port_tweet'] = df['port_tweet'].apply(remove_stopwords)

df.head()

Unnamed: 0,tweet,port_tweet
0,"[the, real, reason, why, you, be, sad, you, be...","[real, reason, whi, sad, attach, peopl, distan..."
1,"[my, biggest, problem, be, overthinking, every...","[biggest, problem, overthink, everyth]"
2,"[the, worst, sadness, be, the, sadness, you, h...","[worst, sad, sad, teach, hide]"
3,"[i, can, not, make, you, understand, i, can, n...","[make, understand, make, anyon, understand, ha..."
4,"[i, do, not, think, anyone, really, understand...","[think, anyon, realli, understand, tire, act, ..."


#### VADER

In [33]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sentiment_analyzer = SentimentIntensityAnalyzer()
df['score'] = df['port_tweet'].apply(lambda x: sentiment_analyzer.polarity_scores(x)['compound'])

df['score'] = np.where(df['score'] <= 0, 'Negative', 'Positive')
df['score'].value_counts()

score
Negative    3055
Positive      27
Name: count, dtype: int64

#### SentiWordNet

In [34]:
def make_sentences(data,name):
    data[name]=data[name].apply(lambda x:' '.join([i+' ' for i in x]))
    data[name]=data[name].apply(lambda x:re.sub(r'\s+', ' ', x, flags=re.I))

In [35]:
make_sentences(df,'port_tweet')

In [36]:
pos=neg=obj=count=0

postagging = []

for review in df['port_tweet']:
    list = word_tokenize(review)
    postagging.append(nltk.pos_tag(list))

df['pos_tags'] = postagging

df.head()

Unnamed: 0,tweet,port_tweet,score,pos_tags
0,"[the, real, reason, why, you, be, sad, you, be...",real reason whi sad attach peopl distant pay a...,Negative,"[(real, JJ), (reason, NN), (whi, NN), (sad, JJ..."
1,"[my, biggest, problem, be, overthinking, every...",biggest problem overthink everyth,Negative,"[(biggest, JJS), (problem, NN), (overthink, NN..."
2,"[the, worst, sadness, be, the, sadness, you, h...",worst sad sad teach hide,Negative,"[(worst, RB), (sad, JJ), (sad, JJ), (teach, NN..."
3,"[i, can, not, make, you, understand, i, can, n...",make understand make anyon understand happen i...,Negative,"[(make, NN), (understand, NN), (make, VB), (an..."
4,"[i, do, not, think, anyone, really, understand...",think anyon realli understand tire act okay al...,Negative,"[(think, VB), (anyon, JJ), (realli, NNS), (und..."


In [38]:
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def penn_to_wn(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None

def get_sentiment(word,tag):
    wn_tag = penn_to_wn(tag)
    
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
        return []

    lemma = lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        return []
    
    synsets = wn.synsets(word, pos=wn_tag)
    if not synsets:
        return []

    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())

    return [synset.name(), swn_synset.pos_score(),swn_synset.neg_score(),swn_synset.obj_score()]

    pos=neg=obj=count=0

In [39]:
senti_score = []

for pos_val in df['pos_tags']:
    senti_val = [get_sentiment(x,y) for (x,y) in pos_val]
    for score in senti_val:
        try:
            pos = pos + score[1]  
            neg = neg + score[2]  
        except:
            continue
    senti_score.append(pos - neg)
    pos=neg=0    
    
df['senti_score'] = senti_score
print(df['senti_score'])

df.head()

0      -0.125
1      -0.500
2      -2.125
3       0.625
4       0.500
        ...  
3077   -0.500
3078   -0.875
3079    0.000
3080    0.750
3081   -0.750
Name: senti_score, Length: 3082, dtype: float64


Unnamed: 0,tweet,port_tweet,score,pos_tags,senti_score
0,"[the, real, reason, why, you, be, sad, you, be...",real reason whi sad attach peopl distant pay a...,Negative,"[(real, JJ), (reason, NN), (whi, NN), (sad, JJ...",-0.125
1,"[my, biggest, problem, be, overthinking, every...",biggest problem overthink everyth,Negative,"[(biggest, JJS), (problem, NN), (overthink, NN...",-0.5
2,"[the, worst, sadness, be, the, sadness, you, h...",worst sad sad teach hide,Negative,"[(worst, RB), (sad, JJ), (sad, JJ), (teach, NN...",-2.125
3,"[i, can, not, make, you, understand, i, can, n...",make understand make anyon understand happen i...,Negative,"[(make, NN), (understand, NN), (make, VB), (an...",0.625
4,"[i, do, not, think, anyone, really, understand...",think anyon realli understand tire act okay al...,Negative,"[(think, VB), (anyon, JJ), (realli, NNS), (und...",0.5


In [40]:
df['senti_score'] = np.where(df['senti_score'] <= 0, 'Negative', 'Positive')
df['senti_score'].value_counts()

senti_score
Negative    2304
Positive     778
Name: count, dtype: int64