# Training a sentiment analysis model

In [142]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import movie_reviews, stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
import time, string

In [6]:
reviews = [[file.split("/")[0], file] for file in movie_reviews.fileids()]
df = pd.DataFrame(reviews, columns=["label", "fileid"])

In [7]:
stopSet = set(stopwords.words("english"))

In [372]:
punc = set(list(string.punctuation) + ["\n", ""])
cleanword = lambda word: (word not in punc) and (word.isalpha())
def preprocess(words, string=False):
    if(type(words) == str):
        words = words.split(" ")
    clean_mess = [w.lower() for w in words if w not in stopSet and cleanword(w)]
    if(string):
        return " ".join(clean_mess)
    else:
        return clean_mess


In [130]:
print(len(movie_reviews.words("neg/cv000_29416.txt")))
len(preprocess(movie_reviews.words("neg/cv000_29416.txt")).split())

879


324

## Evaluating tf-idf

In [389]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [327]:
%%time

bow_transformer = CountVectorizer(analyzer=preprocess).fit(df.fileid.apply(movie_reviews.raw))
print(len(bow_transformer.vocabulary_))

reviews_bow = bow_transformer.transform(df.fileid.apply(movie_reviews.raw))
print(reviews_bow.shape)

tfidf_transformer = TfidfTransformer().fit(reviews_bow)
reviews_tfidf = tfidf_transformer.transform(reviews_bow)

sentiment_detect_model = MultinomialNB().fit(reviews_tfidf, df['label'])

37360
(2000, 37360)
CPU times: total: 2.09 s
Wall time: 2.08 s


In [520]:
msg_train, msg_test, label_train, label_test = train_test_split(df['fileid'], df['label'], test_size=0.25, random_state=101)

In [521]:
%%time
pipeline = Pipeline([('bow', CountVectorizer(analyzer=preprocess)), ('tfidf', TfidfTransformer()), ('classifier', MultinomialNB())])

pipeline.fit(msg_train.apply(movie_reviews.raw), label_train)
predictions = pipeline.predict(msg_test.apply(movie_reviews.raw))

print (classification_report(label_test, predictions))

              precision    recall  f1-score   support

         neg       0.83      0.85      0.84       241
         pos       0.86      0.83      0.85       259

    accuracy                           0.84       500
   macro avg       0.84      0.84      0.84       500
weighted avg       0.84      0.84      0.84       500

CPU times: total: 1.22 s
Wall time: 1.22 s


In [385]:
yelp = pd.read_csv("yelp.csv")
yelp['label'] = yelp.stars.apply(lambda x: 'pos' if x >= 3 else 'neg')

In [386]:
ypipeline = Pipeline([('bow', CountVectorizer(analyzer=preprocess)), ('tfidf', TfidfTransformer()), ('classifier', MultinomialNB())])
ypipeline.fit(yelp.text, yelp['label'])
predictions = ypipeline.predict(yelp.text)

In [387]:
print (classification_report(yelp['label'], predictions))
print(confusion_matrix(yelp['label'], predictions))

              precision    recall  f1-score   support

         neg       1.00      0.00      0.01      1676
         pos       0.83      1.00      0.91      8324

    accuracy                           0.83     10000
   macro avg       0.92      0.50      0.46     10000
weighted avg       0.86      0.83      0.76     10000

[[   5 1671]
 [   0 8324]]


In [523]:
tesla = pd.read_csv('reuters_tesla.csv')

In [529]:
tesla['label'] = pipeline.predict(tesla.article)

## Other model?

In [131]:
%%time
positive_words = preprocess(nltk.corpus.movie_reviews.words(categories=["pos"]), string=False)
negative_words = preprocess(nltk.corpus.movie_reviews.words(categories=["neg"]), string=False)
print(f"postive: {len(positive_words)} \nnegative: {len(negative_words)}")

postive: 371560 
negative: 329983
CPU times: total: 2.25 s
Wall time: 2.21 s


In [96]:
positive_fd = nltk.FreqDist(positive_words)
negative_fd = nltk.FreqDist(negative_words)

In [138]:
%%time
common_set = set(positive_words).intersection(set(negative_words))
for word in common_set:
    del positive_fd[word]
    del negative_fd[word]

CPU times: total: 93.8 ms
Wall time: 93 ms


## Evaluating pretrained model

In [557]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as Vader
# sia = SentimentIntensityAnalyzer()
vader = Vader()
# example = movie_reviews.words("neg/cv000_29416.txt")
# sn = preprocess(example)
# sia.polarity_scores(sn)

In [560]:
df['compound'] = df.fileid.apply(lambda x: sia.polarity_scores(movie_reviews.raw(x))['compound'])
df["compound_alpha"] = df["compound"].apply(lambda x: 'pos' if x > 0 else 'neg')
df.groupby("label").describe().T

Unnamed: 0,label,neg,pos
compound,count,1000.0,1000.0
compound,mean,0.10711,0.646057
compound,std,0.90413,0.702314
compound,min,-0.9997,-0.9996
compound,25%,-0.95455,0.9029
compound,50%,0.68205,0.9909
compound,75%,0.982025,0.997225
compound,max,0.9996,0.9999


In [566]:
df[(df["compound"] < -0.9) & (df["label"] == 'pos')]

Unnamed: 0,label,fileid,compound,compound_alpha
1009,pos,pos/cv009_29592.txt,-0.9813,neg
1013,pos,pos/cv013_10159.txt,-0.9159,neg
1028,pos,pos/cv028_26746.txt,-0.9916,neg
1046,pos,pos/cv046_10188.txt,-0.9936,neg
1050,pos,pos/cv050_11175.txt,-0.9796,neg
...,...,...,...,...
1936,pos,pos/cv936_15954.txt,-0.9897,neg
1939,pos,pos/cv939_10583.txt,-0.9897,neg
1942,pos,pos/cv942_17082.txt,-0.9808,neg
1960,pos,pos/cv960_29007.txt,-0.9816,neg


In [567]:
movie_reviews.raw("pos/cv009_29592.txt")

"the american action film has been slowly drowning to death in a sea of asian wire-fu copycats . \nit's not a pretty death , and it's leaving the likes of schwartznager , stallone , and van damme wearing cement galoshes at the bottom of a kung fu sea . \nsometimes , the mix results in a mind-blowing spectacle unlike any other . \nquality action with amazing and exciting stunt work , as in 1999's the matrix , can be a real gem . \nbut too often hollywood gets it wrong , even when they pay off chinese directors . \nflying ninjas and floating karate masters have been replaced by soaring bronx detectives and slow motion kicking scientists . \nmostly it's laughable . \nin hollywood's rush to emulate the success of the matrix , trademark asian stunt choreography has become more of a joke than an art form . \nbut iron monkey , the latest asian import , shows us how to get it right . \niron monkey ( actually a reissue of a 1993 film ) is the story of a 19th chinese vigilante ( rongguang yu ) ,

In [568]:
sia

<nltk.sentiment.vader.SentimentIntensityAnalyzer at 0x3e32ae50>

In [542]:
tesla['compound_score'] = tesla.article.apply(lambda x: sia.polarity_scores(preprocess(x, string=True))['compound'])

In [547]:
tesla.head(20)[['title', 'label', 'compound_score']]

Unnamed: 0,title,label,compound_score
0,Fisker to sell electric SUV in India with view...,neg,0.98
1,"GM, Hertz make deal to deploy up to 175,000 EVs",neg,0.967
2,Tesla countersues California agency behind rac...,pos,-0.8807
3,"LG Energy inks cobalt, lithium supply deals wi...",neg,0.9538
4,Wall Street ends down for third day as growth ...,pos,0.9678
5,Tesla recalls nearly 1.1 million U.S. vehicles...,neg,0.91
6,Renault ahead of schedule on EV partnerships -...,neg,0.9666
7,California regulator sees 2035 EV mandate as '...,neg,0.9834
8,China's Xpeng says CATL is no longer its large...,neg,0.9633
9,Analysis: Rally in U.S. consumer stocks teeter...,pos,0.9789


In [540]:
tesla.article.apply(lambda x: sia.polarity_scores(preprocess(x, string=True))['compound'])

0     0.9800
1     0.9670
2    -0.8807
3     0.9538
4     0.9678
5     0.9100
6     0.9666
7     0.9834
8     0.9633
9     0.9789
10    0.3612
11    0.9936
12    0.5574
13    0.9304
14    0.8979
15   -0.8402
16    0.9981
17    0.9382
18    0.4019
Name: article, dtype: float64

In [230]:
from sklearn.metrics import classification_report
print (classification_report(df['label'], df["compound_alpha"]))

              precision    recall  f1-score   support

         neg       0.72      0.44      0.55      1000
         pos       0.60      0.83      0.69      1000

    accuracy                           0.64      2000
   macro avg       0.66      0.64      0.62      2000
weighted avg       0.66      0.64      0.62      2000

