In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ravi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
df = pd.read_csv("Tweets.csv")

In [3]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sent=SentimentIntensityAnalyzer()

In [4]:
df.drop(['tweet_id', 'airline_sentiment_confidence', 'negativereason',
       'negativereason_confidence', 'airline', 'airline_sentiment_gold',
       'name', 'negativereason_gold', 'retweet_count', 'tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone'],axis=1,inplace=True)

In [22]:
df.rename({"airline_sentiment":"Target","text":"data"},axis='columns',inplace=True)
df[["Target","data"]].head()

Unnamed: 0,Target,data
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [75]:
import re
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

df["url_removed"]=df.data.apply(lambda x:remove_urls(x))
df["url_removed"].head()

0                  @VirginAmerica What @dhepburn said.
1    @VirginAmerica plus you've added commercials t...
2    @VirginAmerica I didn't today... Must mean I n...
3    @VirginAmerica it's really aggressive to blast...
4    @VirginAmerica and it's a really big bad thing...
Name: url_removed, dtype: object

In [77]:
def emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)
df["emoji"]=df.url_removed.apply(lambda x:emoji(x))
df["emoji"].head()

0                  @VirginAmerica What @dhepburn said.
1    @VirginAmerica plus you've added commercials t...
2    @VirginAmerica I didn't today... Must mean I n...
3    @VirginAmerica it's really aggressive to blast...
4    @VirginAmerica and it's a really big bad thing...
Name: emoji, dtype: object

In [79]:
def numeric(text):
    return "".join([x for x in text if not x.isdigit()])
df["numeric"]=df.emoji.apply(lambda x:numeric(x))
df["numeric"].head()

0                  @VirginAmerica What @dhepburn said.
1    @VirginAmerica plus you've added commercials t...
2    @VirginAmerica I didn't today... Must mean I n...
3    @VirginAmerica it's really aggressive to blast...
4    @VirginAmerica and it's a really big bad thing...
Name: numeric, dtype: object

In [89]:
from nltk.tokenize import word_tokenize
def cleaning(x):
    y=x.lower()
    return word_tokenize(y)
df["token"]=df.numeric.apply(lambda x:cleaning(x))
df["token"].head()

0       [@, virginamerica, what, @, dhepburn, said, .]
1    [@, virginamerica, plus, you, 've, added, comm...
2    [@, virginamerica, i, did, n't, today, ..., mu...
3    [@, virginamerica, it, 's, really, aggressive,...
4    [@, virginamerica, and, it, 's, a, really, big...
Name: token, dtype: object

In [119]:
import string
def pun_removal(text):
    no_pun=" ".join([c for c in text if c not in string.punctuation])
    return no_pun
df["punctuation"]=df.token.apply(lambda x:pun_removal(x))
df["punctuation"].head()

0                     virginamerica what dhepburn said
1    virginamerica plus you 've added commercials t...
2    virginamerica i did n't today ... must mean i ...
3    virginamerica it 's really aggressive to blast...
4    virginamerica and it 's a really big bad thing...
Name: punctuation, dtype: object

In [110]:
from nltk.corpus import stopwords
stop=set(stopwords.words("english"))
stop.update(("n't","'s","ca","since","ravi","cs","'ve","'ll","'m","still","us","..."))
def stopword(text):
    removed=[]
    for x in text.split():
        if x not in stop:
            removed.append(x)
    return removed
df["stopword"]=df.punctuation.apply(lambda x:stopword(x))
df["stopword"].head()

0                      [virginamerica, dhepburn, said]
1    [virginamerica, plus, added, commercials, expe...
2    [virginamerica, today, must, mean, need, take,...
3    [virginamerica, really, aggressive, blast, obn...
4             [virginamerica, really, big, bad, thing]
Name: stopword, dtype: object

In [113]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
def lema(text):
    w=WordNetLemmatizer()
    str=" ".join([w.lemmatize(x,wordnet.VERB) for x in text])
    return str    
df["lema"]=df.stopword.apply(lambda x:lema(x))
df.lema.head()

0                           virginamerica dhepburn say
1    virginamerica plus add commercials experience ...
2    virginamerica today must mean need take anothe...
3    virginamerica really aggressive blast obnoxiou...
4                   virginamerica really big bad thing
Name: lema, dtype: object

In [121]:
df["pure"]=df.lema.str.replace("“","")

In [123]:
df["pure"].replace({"mins":"minutes","hrs":"hours","ppl":"people"},regex=True,inplace=True)
df["pure"].head()

0                           virginamerica dhepburn say
1    virginamerica plus add commercials experience ...
2    virginamerica today must mean need take anothe...
3    virginamerica really aggressive blast obnoxiou...
4                   virginamerica really big bad thing
Name: pure, dtype: object

In [124]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sent=SentimentIntensityAnalyzer()

In [142]:
def score(x):
    return sent.polarity_scores(x)
df["score"]=df.pure.apply(lambda x:score(x))
df["score"].head()

0    {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...
1    {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...
2    {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...
3    {'neg': 0.287, 'neu': 0.557, 'pos': 0.156, 'co...
4    {'neg': 0.486, 'neu': 0.514, 'pos': 0.0, 'comp...
Name: score, dtype: object

In [147]:
df["compound"]=df.score.apply(lambda x:x["compound"])
df["compound"].head()

0    0.0000
1    0.0000
2    0.0000
3   -0.3306
4   -0.5829
Name: compound, dtype: float64

In [155]:
df["result"]=df["compound"].apply(lambda x:"positive" if x>0 else ("neutral" if x==0 else "negative"))
df["result"].head()

0     neutral
1     neutral
2     neutral
3    negative
4    negative
Name: result, dtype: object

In [157]:
df[["Target","result"]].head()

Unnamed: 0,Target,result
0,neutral,neutral
1,positive,neutral
2,neutral,neutral
3,negative,negative
4,negative,negative


In [160]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [165]:
print(accuracy_score(df.Target,df.result))
print(confusion_matrix(df.Target,df.result))
print(classification_report(df.Target,df.result))

0.5088114754098361
[[4095 1937 3146]
 [ 413 1288 1398]
 [  89  208 2066]]
              precision    recall  f1-score   support

    negative       0.89      0.45      0.59      9178
     neutral       0.38      0.42      0.39      3099
    positive       0.31      0.87      0.46      2363

    accuracy                           0.51     14640
   macro avg       0.53      0.58      0.48     14640
weighted avg       0.69      0.51      0.53     14640



### Sentiment analysis results around 50% of accuracy which is quite less when  