<a href="https://colab.research.google.com/github/TheNizzo/SentimentAnalysis/blob/main/Sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

In [None]:
!pip install datasets
from datasets import load_dataset
dataset_train = load_dataset('imdb', split='train')
dataset_test = load_dataset('imdb', split='test')



Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)
Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


In [None]:
# Slice each split once and reuse the result for both the texts and the
# labels, instead of evaluating `dataset[:]` separately for every column.
train_columns = dataset_train[:]
test_columns = dataset_test[:]
x_train, y_train = train_columns['text'], train_columns['label']
x_test, y_test = test_columns['text'], test_columns['label']
len(x_train)

25000

In [None]:
from tqdm import tqdm
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import re
nltk.download('punkt')
# Recent NLTK releases look up the Punkt tokenizer models under the
# 'punkt_tab' resource; fetch it too so word_tokenize works on a fresh
# environment regardless of the installed NLTK version.
nltk.download('punkt_tab')

def stem(l):
  """Return one stemmed string per text in `l`, in the same order.

  Each text is lower-cased and tokenized with word_tokenize; only tokens made
  up entirely of word characters are kept. The kept tokens are stemmed with
  the English Snowball stemmer and joined back with single spaces. Progress
  is reported through tqdm.
  """
  word_pattern = re.compile(r"^\w+$")
  snowball = SnowballStemmer("english")

  def _stem_text(text):
    # Lower-case first so the tokenizer and the stemmer see the same casing.
    tokens = word_tokenize(text.lower())
    return " ".join([snowball.stem(tok) for tok in tokens if word_pattern.match(tok)])

  return [_stem_text(text) for text in tqdm(l, total=len(l))]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Stem the raw review texts of both splits. Each call took roughly two
# minutes in the recorded run, so the results are kept in separate variables.
stemmed_train = stem(x_train)
stemmed_test = stem(x_test)

100%|██████████| 25000/25000 [02:08<00:00, 194.46it/s]
100%|██████████| 25000/25000 [02:05<00:00, 198.81it/s]


In [None]:
# Download the en_core_web_sm spaCy model so that spacy.load("en_core_web_sm")
# can find it.
!python3 -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 2.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
import spacy

# loading the small English model
# NOTE(review): "lemmatizer" is in the disable list, yet lemm() reads
# token.lemma_ from this pipeline. The recorded run installed
# en_core_web_sm 2.2.5; on newer spaCy releases a disabled lemmatizer
# presumably leaves lemma_ empty - confirm before upgrading spaCy.
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "lemmatizer", "ner"])

In [None]:
def lemm(l):
  """Return one lemmatized string per text in `l`, in the same order.

  Each text is lower-cased and run through the global spaCy pipeline `nlp`.
  For every token whose text consists only of word characters, the token's
  `lemma_` is kept; the kept lemmas are joined with single spaces. Progress
  is reported through tqdm.
  """
  word_pattern = re.compile(r"^\w+$")

  def _lemmatize(text):
    doc = nlp(text.lower())
    # The filter looks at the surface text, the output uses the lemma.
    return ' '.join([tok.lemma_ for tok in doc if word_pattern.match(tok.text)])

  return [_lemmatize(text) for text in tqdm(l, total=len(l))]

In [None]:
# Lemmatize the raw review texts of both splits. Each call took about one
# minute in the recorded run.
lemmas_train = lemm(x_train)
lemmas_test = lemm(x_test)

100%|██████████| 25000/25000 [00:59<00:00, 420.48it/s]
100%|██████████| 25000/25000 [00:57<00:00, 435.30it/s]


# Logistic Regression

In [None]:
import pandas as pd

In [None]:
!wget https://raw.githubusercontent.com/cjhutto/vaderSentiment/master/vaderSentiment/vader_lexicon.txt
lexicon = pd.read_csv("vader_lexicon.txt", sep="\t", names=['word', 'MEAN-SENTIMENT-RATING', 'a', 'b']).drop(['a', 'b'], axis = 'columns')
d = {}
for w, v in lexicon.iterrows():
    d[v[0]] = v[1]

--2021-10-01 07:04:13--  https://raw.githubusercontent.com/cjhutto/vaderSentiment/master/vaderSentiment/vader_lexicon.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 426786 (417K) [text/plain]
Saving to: ‘vader_lexicon.txt.1’


2021-10-01 07:04:14 (5.24 MB/s) - ‘vader_lexicon.txt.1’ saved [426786/426786]



In [None]:
def _labelled_frame(texts, labels):
  """Pair each text with its label in a two-column ('val', 'label') DataFrame."""
  return pd.DataFrame(list(zip(texts, labels)), columns=['val', 'label'])

# Lemmatized texts
df_train_lemma = _labelled_frame(lemmas_train, y_train)
df_test_lemma = _labelled_frame(lemmas_test, y_test)

# Stemmed texts
df_train_stem = _labelled_frame(stemmed_train, y_train)
df_test_stem = _labelled_frame(stemmed_test, y_test)

# Raw texts
df_train = _labelled_frame(x_train, y_train)
df_test = _labelled_frame(x_test, y_test)
df_train.head(3), df_test.head(3)

(                                                 val  label
 0  Bromwell High is a cartoon comedy. It ran at t...      1
 1  Homelessness (or Houselessness as George Carli...      1
 2  Brilliant over-acting by Lesley Ann Warren. Be...      1,
                                                  val  label
 0  I went and saw this movie last night after bei...      1
 1  Actor turned director Bill Paxton follows up h...      1
 2  As a recreational golfer with some knowledge o...      1)

In [None]:
import numpy as np
def pos(l):
  """Count the tokens of `l` whose score in the global lexicon `d` is above 0.5.

  Tokens that are not keys of `d` are ignored; a token that appears several
  times is counted every time it appears.
  """
  return sum(1 for token in l if token in d and d[token] > 0.5)

def neg(l):
  """Count the tokens of `l` whose score in the global lexicon `d` is below -0.5.

  Mirrors `pos`: a word only counts as negative when its rating is clearly
  below zero. The threshold is -0.5, not 0.5; comparing against +0.5 would
  also count neutral and mildly positive words as negative.

  Tokens that are not keys of `d` are ignored; a token that appears several
  times is counted every time it appears.
  """
  neg = 0
  for w in l:
    if w in d and d[w] < -0.5:
      neg += 1
  return neg

def contains_no(l):
  """Return 1 when some token of `l` equals "no" ignoring case, otherwise 0."""
  lowered = [token.lower() for token in l]
  if "no" in lowered:
    return 1
  return 0

def first_second_pro(l):
  """Count the first- and second-person pronouns among the tokens of `l`.

  Matching ignores case, and every occurrence is counted. The tokens are
  lower-cased in a single pass and tested against a set, rather than
  rebuilding the lower-cased token list once for each pronoun.
  """
  pronouns = {"i", "me", "my", "mine", "we", "us", "our", "ours", "you", "your",
              "yours"}
  return sum(1 for token in l if token.lower() in pronouns)

def get_features(df):
  """Add the hand-crafted feature columns to `df` in place.

  `df` must have a string column 'val'. The text is split on spaces, periods,
  commas and double quotes, and the following columns are written:

  - containsNO: result of contains_no on the split tokens
  - containsExclamation: 1 if the raw text contains "!", else 0
  - count_pronouns: result of first_second_pro on the split tokens
  - logNOfWords: natural log of the number of spaces in the raw text
  - pos_count / neg_count: results of pos / neg on the split tokens

  Nothing is returned; the frame passed in is modified.
  """
  split_df = df['val'].str.split("[ .,\"]")
  df['containsNO'] = split_df.apply(contains_no)
  df['containsExclamation'] = df['val'].apply(lambda x: 1 if "!" in x  else 0)
  df['count_pronouns'] = split_df.apply(first_second_pro)
  # A text without any space would give log(0) = -inf, which a downstream
  # estimator cannot consume. Clamp the count to 1 so the feature stays
  # finite (log(1) = 0); texts with at least one space are unaffected.
  space_count = df["val"].str.count(" ")
  df["logNOfWords"] = np.log(space_count.clip(lower=1))
  df["pos_count"] = split_df.apply(pos)
  df["neg_count"] = split_df.apply(neg)

In [None]:
# Add the feature columns to every frame, one after another: raw, lemmatized,
# then stemmed, with the train frame before the test frame in each pair.
for _frame in (
    df_train, df_test,
    df_train_lemma, df_test_lemma,
    df_train_stem, df_test_stem,
):
  get_features(_frame)
df_train

Unnamed: 0,val,label,containsNO,containsExclamation,count_pronouns,logNOfWords,pos_count,neg_count
0,Bromwell High is a cartoon comedy. It ran at t...,1,0,1,9,4.934474,1,2
1,Homelessness (or Houselessness as George Carli...,1,0,0,5,6.056784,22,14
2,Brilliant over-acting by Lesley Ann Warren. Be...,1,0,0,2,4.983607,7,5
3,This is easily the most underrated film inn th...,1,0,0,2,4.812184,7,3
4,This is not the typical Mel Brooks film. It wa...,1,0,0,0,4.779123,7,1
...,...,...,...,...,...,...,...,...
24995,"Towards the end of the movie, I felt it was to...",0,0,0,13,5.631212,13,9
24996,This is the kind of movie that my enemies cont...,0,1,1,5,5.017280,10,8
24997,I saw 'Descent' last night at the Stockholm Fi...,0,1,1,13,5.662960,18,11
24998,Some films that you pick up for a pound turn o...,0,0,0,6,5.442418,10,8


In [None]:
from sklearn.linear_model import LogisticRegression

def _fit_logreg(df):
  """Fit a LogisticRegression on every column of `df` except 'val' and 'label'.

  The target is passed as a 1-D array. Handing scikit-learn an (n, 1) column
  vector makes it emit a DataConversionWarning for each fit.
  """
  features = df.drop(columns=['val', 'label'])
  return LogisticRegression(random_state=0).fit(features, df['label'].values)

clf = _fit_logreg(df_train)

clf_lemma = _fit_logreg(df_train_lemma)

clf_stem = _fit_logreg(df_train_stem)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
# Print the score of each classifier on its matching test frame
# (raw, lemmatized, stemmed), using every column except 'val' and 'label'.
for _caption, _clf, _frame in (
    ("x_test: ", clf, df_test),
    ("x_test_lemma: ", clf_lemma, df_test_lemma),
    ("x_test_stem: ", clf_stem, df_test_stem),
):
  _feature_cols = _frame.columns[~_frame.columns.isin(['val', 'label'])]
  print(_caption, _clf.score(_frame[_feature_cols], _frame['label'].values.reshape(-1, 1)))

x_test:  0.70704
x_test_lemma:  0.70524
x_test_stem:  0.67804


In [None]:
# Predict labels for the raw-text test frame from its feature columns only.
y_pred = clf.predict(df_test.drop(columns=['val', 'label']))

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Compare the true test labels with y_pred. The recorded output is a tuple of
# four arrays with one entry per class; the last one holds the class counts
# (12500 each).
precision_recall_fscore_support(df_test['label'].values.reshape(-1, 1), y_pred)

(array([0.70928352, 0.70484407]),
 array([0.70168, 0.7124 ]),
 array([0.70546127, 0.70860189]),
 array([12500, 12500]))