## Unsupervised Learning and predicting Sentiment Analyses for Reviews of IMDB Dataset

Importing Library & EDA

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

In [2]:
from nltk.corpus import stopwords
import textblob
stop = stopwords.words('english')
from nltk.corpus import opinion_lexicon
pos_list=set(opinion_lexicon.positive())
neg_list=set(opinion_lexicon.negative())
from nltk.tokenize import treebank
tokenizer = treebank.TreebankWordTokenizer()
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import sentiwordnet as swn
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from afinn import Afinn

In [3]:
import re
from bs4 import BeautifulSoup
import unicodedata
import spacy
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import LancasterStemmer
ps = nltk.porter.PorterStemmer()
ls =nltk.stem.LancasterStemmer()
import requests 


contractions_dict = {
    'didn\'t': 'did not',
    'don\'t': 'do not',
    "aren't": "are not",
    "can't": "cannot",
    "cant": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "didnt": "did not",
    "doesn't": "does not",
    "doesnt": "does not",
    "don't": "do not",
    "dont" : "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i had",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'm": "i am",
    "im": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had",
    "she'd've": "she would have",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they had",
    "they'd've": "they would have",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who's": "who is",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
    }

contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))

def expand_contractions(s, contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, s)




def strip_html_tags():
    soup = BeautifulSoup(content,"html.parser")
    [s.extract for s in soup(['iframe','script'])]
    stripped_text = soup.get_text() 
    stripped_text=re.sub('[\r|\n|\r\n]+','\n',stripped_text)
    return stripped_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD',text).encode('ascii','ignore').decode('utf-8','ignore')
    return text

def remove_special_characters(text, remove_digits = False):
    patterns = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern,"",text)
    return text

def simple_stemmers(text,stemmer = ps):
    text = " ".join([stemmer.stem(word)for word in text.split()])
    return text

def expand_contraction(text):
    return contraction.fix(text)

def spacy_lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ !='-PRON-' else word.text for word in text])

def remove_stopwords(text, is_lower_case = False, stopwords = None):
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    tokens = nltk.word_tokenize(text)
    tokens=[token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [tokens for token in tokens if token not in stopwords]
    else:
         filtered_tokens = [tokens for token in tokens if token.lower() not in stopwords]
    filtered_tokens = ' '.join(filtered_tokens)
    return filtered_tokens    



In [4]:
import tqdm

def text_pre_processor(text,html_strip=True, accented_char=True,contraction_expansion=True,text_lower_case=True,
                       text_stemming=False, text_lemmatization=True,special_char_removal=True, remove_digits=True,
                       stopword_removal=True, stopword_list=None): 
    #strip HTML
    if html_strip:
        text=strip_html_tags(text)
    
    #remove extra newlines(often might be present in really noisy text)
    text = text.translate(text.maketrans("\n\t\r"," "))

    #remove accented character
    if accented_char_removal:
        text = removal_accented_chars(text)
    
   #expand contraction
    if contraction_expansion:
         text = spacy_lemmatize_text(text)
    
   #Lemmatize text
    if text_lemmatization:
         text = spacy_lemmatize_text(text)
    
   #remove special characters and \or digits
    if special_char_removal:
   #insert space between special characters to isolate them
        special_char_pattern = re.compile(r'([{.(-)!}])')
        text = special_char_pattern.sub("\\1 ", text)
        text = remove_special_characters(text, remove_digits = remove_digits)
    
   #stem text
    if text_stemming and not text_lemmatization:
         text = simple_stemming(text)
    
   #lowercase the text
    if text_lower_case:
         text = text.lower()

   #remove stopwords
    if stopword_removal:
         text = remove_stopwords(text,is_lower_case = text_lower_case,stopwords=stopword_list)
    
   #remove extra whitespace
    text = re.sub(' +', ' ',text)
    text = text.strip()

    return text

def corpus_pre_processor(corpus):
    norm_corpus = []
    for doc in tqdm.tqdm(corpus):
        norm_corpus.append(text_pre_processor(doc))
    return norm_corpus

In [5]:

from bs4 import BeautifulSoup
import numpy as np
import re
import tqdm
import unicodedata


def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    [s.extract() for s in soup(['iframe', 'script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
    return stripped_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
import re
contractions_dict = {
    'didn\'t': 'did not',
    'don\'t': 'do not',
    "aren't": "are not",
    "can't": "cannot",
    "cant": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "didnt": "did not",
    "doesn't": "does not",
    "doesnt": "does not",
    "don't": "do not",
    "dont" : "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i had",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'm": "i am",
    "im": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had",
    "she'd've": "she would have",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they had",
    "they'd've": "they would have",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who's": "who is",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
    }

contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))

def expand_contractions(s, contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, s)


def pre_process_corpus(docs):
    norm_docs = []
    for doc in tqdm.tqdm(docs):
        doc = strip_html_tags(doc)
        doc = doc.translate(doc.maketrans("\n\t\r", "   "))
        doc = doc.lower()
        doc = remove_accented_chars(doc)
        doc = expand_contractions(doc)
        # lower case and remove special characters\whitespaces
        doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
        doc = re.sub(' +', ' ', doc)
        doc = doc.strip()  
        norm_docs.append(doc)
  
    return norm_docs


import tqdm
def corpus_pre_processor(corpus):
    norm_corpus = []
    for doc in tqdm.tqdm(corpus):
        norm_corpus.append(normalize_document(doc))
    return norm_corpus
def normalize_corpus(doc):
    #Lowercase, remove special char \whitespace
    #remove stopwords
    #expand contraction
    words= word_tokenize(doc)
    doc = " ".join([word.lower() for word in words if word not in stop])
    doc = re.sub(r'[^a-zA-Z0-9\s]','',doc,re.I|re.A)
    doc =doc.strip()
    doc = expand_contractions(doc)
    return doc


In [6]:
import tqdm
def corpus_pre_processor(corpus):
    norm_corpus = []
    for doc in tqdm.tqdm(corpus):
        norm_corpus.append(normalize_document(doc))
    return norm_corpus
def normalize_corpus(doc):
    #Lowercase, remove special char \whitespace
    #remove stopwords
    #expand contraction
    words= word_tokenize(doc)
    doc = " ".join([word.lower() for word in words if word not in stop])
    doc = re.sub(r'[^a-zA-Z0-9\s]','',doc,re.I|re.A)
    doc =doc.strip()
    doc = expand_contractions(doc)
    return doc

Importing CSV file

In [None]:
dataset = pd.read_csv("D:\\sasi\\study\\ML\\ML project\\imdb-dataset-of-50k-movie-reviews\\IMDB Dataset.csv")

In [8]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
dataset['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [9]:
def normalize_document(doc):
    #Lowercase, remove special char \whitespace
    #remove stopwords
    #expand contraction
    words= word_tokenize(doc)
    doc = " ".join([word.lower() for word in words if word not in stop])
    doc = re.sub(r'[^a-zA-Z0-9\s]','',doc,re.I|re.A)
    doc =doc.strip()
    doc = expand_contractions(doc)
    return doc

In [10]:
dataset.reset_index(inplace = True)

In [11]:
#split the review into sentence
data=dataset[['review','index']]
data.rename(columns ={'index':'INDEX'},inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [12]:
data.head()

Unnamed: 0,review,INDEX
0,One of the other reviewers has mentioned that ...,0
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,2
3,Basically there's a family where a little boy ...,3
4,"Petter Mattei's ""Love in the Time of Money"" is...",4


In [12]:
from nltk.tokenize import sent_tokenize
data['split'] = data['review'].apply(sent_tokenize)

In [14]:
data.head()

Unnamed: 0,review,INDEX,split
0,One of the other reviewers has mentioned that ...,0,[One of the other reviewers has mentioned that...
1,A wonderful little production. <br /><br />The...,1,"[A wonderful little production., <br /><br />T..."
2,I thought this was a wonderful way to spend ti...,2,[I thought this was a wonderful way to spend t...
3,Basically there's a family where a little boy ...,3,[Basically there's a family where a little boy...
4,"Petter Mattei's ""Love in the Time of Money"" is...",4,"[Petter Mattei's ""Love in the Time of Money"" i..."


In [13]:
data_split=data.set_index('INDEX').split.apply(pd.Series).stack()

In [14]:
data_split=data.set_index('INDEX').split.apply(pd.Series).stack().reset_index(level=0).rename(columns={0:'Reviews'})
data_split.reset_index(level=0,inplace=True)
data_split.rename(columns={'INDEX':'review_no','index':'sentence'},inplace=True)

In [17]:
data_split.head(15)

Unnamed: 0,sentence,review_no,Reviews
0,0,0,One of the other reviewers has mentioned that ...
1,1,0,"They are right, as this is exactly what happen..."
2,2,0,"Trust me, this is not a show for the faint hea..."
3,3,0,This show pulls no punches with regards to dru...
4,4,0,"Its is hardcore, in the classic use of the wor..."
5,5,0,"It focuses mainly on Emerald City, an experime..."
6,6,0,"Em City is home to many..Aryans, Muslims, gang..."
7,7,0,Forget pretty pictures painted for mainstream ...
8,8,0,The first episode I ever saw struck me as so n...
9,9,0,"Not just violence, but injustice (crooked guar..."


In [15]:
#then normalizing the data
data_split['Reviews']=data_split['Reviews'].apply(normalize_document)

In [19]:
data_split.head()

Unnamed: 0,sentence,review_no,Reviews
0,0,0,one reviewers mentioned watching 1 oz episode ...
1,1,0,they right exactly happened me br br th...
2,2,0,trust show faint hearted ti amid
3,3,0,this show pulls punches regards drugs sex vio...
4,4,0,its hardcore classic use word br br it ...


## Sentiment Analysis using TextBlob

In [15]:
import textblob
textblob.TextBlob("I hate this film its not good").sentiment

Sentiment(polarity=-0.575, subjectivity=0.75)

In [16]:
def convert(data):
    if data == 2:
        return 'positive'
    if data == 1:
        return 'negative'
dataset['sentiment'] = dataset['sentiment'].apply(convert)

In [17]:
dataset['sentiment'].value_counts()

Series([], Name: sentiment, dtype: int64)

In [36]:
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])
sample = [353,699,900]

In [27]:
for review, sentiment in zip(reviews[sample],sentiments[sample]):
    print("REVIEW:",review)
    print("SENTIMENT:",sentiment)
    print('Predicted Sentiment polarity:',textblob.TextBlob(review).sentiment.polarity)
    print('-'*60)

REVIEW: This film held my interest enough to watch it several times. The plot has holes, but the lead performers make it work.<br /><br />Catherine Mary Stewart (Julia Kerbridge), does a great job as a woman of 37 who has sacrificed everything else to become a physician. She worked years to earn the money to go to medical school. She is performing brilliantly in her residency and is just about to take her board exam and realize her dream.<br /><br />Meanwhile, Julia's sister and brother-in-law are murdered and as the nearest living relative she is compelled to take in her niece Amanda (Arlen Aguayo-Stewart) to avoid having her become a ward of the state. Amanda is about 7 years old from her appearance. Amanda is so traumatized from her parent's murder that she has become mute. Needless to say, Julia's 16-hour days get longer caring for Amanda.<br /><br />Rob Lowe plays Kevin Finney, a charming neighbor man in their apartment building who works his way into the lives of Julia and Amanda

In [28]:
sentiment_polarity = [textblob.TextBlob(review).sentiment.polarity for review in reviews]
predicted_sentiments = ['positive' if score >= 0.1 else 'negative' for score in sentiment_polarity]

## Sentiment Analysis using AFINN

In [36]:
afn = Afinn(emoticons = True)

In [37]:
for review, sentiment in zip(reviews[sample],sentiments[sample]):
    print("REVIEW:",review)
    print("SENTIMENT:",sentiment)
    print('Predicted Sentiment polarity:',afn.score(review))
    print('-'*60)

REVIEW: This film held my interest enough to watch it several times. The plot has holes, but the lead performers make it work.<br /><br />Catherine Mary Stewart (Julia Kerbridge), does a great job as a woman of 37 who has sacrificed everything else to become a physician. She worked years to earn the money to go to medical school. She is performing brilliantly in her residency and is just about to take her board exam and realize her dream.<br /><br />Meanwhile, Julia's sister and brother-in-law are murdered and as the nearest living relative she is compelled to take in her niece Amanda (Arlen Aguayo-Stewart) to avoid having her become a ward of the state. Amanda is about 7 years old from her appearance. Amanda is so traumatized from her parent's murder that she has become mute. Needless to say, Julia's 16-hour days get longer caring for Amanda.<br /><br />Rob Lowe plays Kevin Finney, a charming neighbor man in their apartment building who works his way into the lives of Julia and Amanda

In [38]:
sentiment_polarity = [textblob.TextBlob(review).sentiment.polarity for review in reviews]
predicted_sentiments = ['positive' if score >= 0.2 else 'negative' for score in sentiment_polarity]

## Supervised Learning 

Predicting with Logistic Regression and Decision Tree models

In [14]:
import pandas as pd

dataset = pd.read_csv("D:\\sasi\\study\\ML\\ML project\\imdb-dataset-of-50k-movie-reviews\\IMDB Dataset.csv")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
review       50000 non-null object
sentiment    50000 non-null object
dtypes: object(2)
memory usage: 781.4+ KB


In [11]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [15]:
data_split=data.set_index('INDEX').split.apply(pd.Series).stack().reset_index(level=0).rename(columns={0:'Reviews'})
data_split.reset_index(level=0,inplace=True)
data_split.rename(columns={'INDEX':'review_no','index':'sentence'},inplace=True)

In [15]:
corpus = data_split['Reviews'].tolist()

In [16]:
#then normalizing the data
normalize_corpus=[]
normalize_corpus.append(corpus_pre_processor(corpus))

100%|████████████████████████████████████████████████████████████████████████| 536641/536641 [04:57<00:00, 1803.54it/s]


In [19]:
from sklearn.model_selection import train_test_split
train_reviews, test_reviews, train_sentiments, test_sentiments = train_test_split(
    reviews,sentiments,test_size=1/3,random_state=0)

In [18]:
#dividing ito train and test
reviews= dataset['review'].values
sentiments = dataset['sentiment'].values

In [20]:
%%time
norm_train_reviews = pre_process_corpus(train_reviews)
norm_test_reviews = pre_process_corpus(test_reviews)

100%|███████████████████████████████████████████████████████████████████████████| 33333/33333 [00:52<00:00, 634.21it/s]
100%|███████████████████████████████████████████████████████████████████████████| 16667/16667 [00:27<00:00, 596.53it/s]

Wall time: 1min 20s





In [21]:
%%time
from sklearn.feature_extraction.text import CountVectorizer

#biuld BOW
cv = CountVectorizer(binary = False, min_df = 5, max_df = 1.0 , ngram_range=(1,2))
cv_train_features = cv.fit_transform(norm_train_reviews)

Wall time: 41.8 s


In [22]:
cv_test_features = cv.transform(norm_test_reviews)

### Logistic Regression

In [32]:
%%time

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l2',max_iter=500,C=1,solver='lbfgs')
lr.fit (cv_train_features,train_sentiments)

Wall time: 1min 23s




LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
lr_predictions = lr.predict(cv_test_features)

In [34]:
from sklearn.metrics import accuracy_score
accuracy_score(test_sentiments,lr_predictions)

0.9042

In [35]:
from sklearn.metrics import confusion_matrix,classification_report
labels = ['negative','positive']
print(classification_report(test_sentiments,lr_predictions))

              precision    recall  f1-score   support

    negative       0.90      0.90      0.90      7490
    positive       0.90      0.90      0.90      7510

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000



In [36]:
labels = ['negative','positive']
pd.DataFrame(confusion_matrix(test_sentiments,lr_predictions),index = labels,columns = labels)


Unnamed: 0,negative,positive
negative,6769,721
positive,716,6794


For TFID

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
tv = TfidfVectorizer(use_idf =True, min_df=5,max_df=1.0,ngram_range=(1,2))

In [39]:
%%time
norm_train_reviews = pre_process_corpus(train_reviews)
norm_test_reviews = pre_process_corpus(test_reviews)

100%|███████████████████████████████████████████████████████████████████████████| 35000/35000 [00:53<00:00, 654.14it/s]
100%|███████████████████████████████████████████████████████████████████████████| 15000/15000 [00:22<00:00, 659.82it/s]

Wall time: 1min 16s





In [40]:
tv_train_features = tv.fit_transform(norm_train_reviews)
tv_test_features = tv.transform(norm_test_reviews)

In [41]:
lr.fit(tv_train_features,train_sentiments)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [42]:
lr_predictions = lr.predict(cv_test_features)

In [43]:
from sklearn.metrics import confusion_matrix, classification_report

labels = ['negative','positive']
print(classification_report(test_sentiments,lr_predictions))

              precision    recall  f1-score   support

    negative       0.91      0.76      0.83      7490
    positive       0.79      0.92      0.85      7510

    accuracy                           0.84     15000
   macro avg       0.85      0.84      0.84     15000
weighted avg       0.85      0.84      0.84     15000



In [44]:
labels = ['negative','positive']
pd.DataFrame(confusion_matrix(test_sentiments,lr_predictions),index = labels,columns = labels)

Unnamed: 0,negative,positive
negative,5684,1806
positive,581,6929


In [45]:
from sklearn.metrics import accuracy_score
accuracy_score(test_sentiments,lr_predictions)

0.8408666666666667

### Decision Tree

In [17]:
#dividing ito train and test
reviews= dataset['review'].values
sentiments = dataset['sentiment'].values

In [18]:
from sklearn.model_selection import train_test_split
train_reviewsDT, test_reviewsDT, train_sentimentsDT, test_sentimentsDT = train_test_split(
    reviews,sentiments,test_size=1/3,random_state=0)

In [None]:
train_reviewsDT=reviews[:35000]
train_sentimentsDT = sentiments[:35000]

test_reviewsDT = reviews[35000:]
test_sentimentsDT = sentiments[35000:]

In [19]:
%%time
norm_train_reviewsDT = pre_process_corpus(train_reviewsDT)
norm_test_reviewsDT = pre_process_corpus(test_reviewsDT)

100%|███████████████████████████████████████████████████████████████████████████| 33333/33333 [00:50<00:00, 660.97it/s]
100%|███████████████████████████████████████████████████████████████████████████| 16667/16667 [00:25<00:00, 655.01it/s]

Wall time: 1min 15s





In [20]:
%%time
from sklearn.feature_extraction.text import CountVectorizer

#biuld BOW
cv = CountVectorizer(binary = False, min_df = 5, max_df = 1.0 , ngram_range=(1,2))
cv_train_featuresDT = cv.fit_transform(norm_train_reviewsDT)

Wall time: 44.9 s


In [21]:
cv_test_featuresDT = cv.transform(norm_test_reviewsDT)

In [22]:
from sklearn.tree import DecisionTreeClassifier
lr_D=DecisionTreeClassifier(criterion='entropy')

In [24]:
lr_D.fit(cv_train_featuresDT,train_sentimentsDT)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [25]:
lr_predictionsDT = lr_D.predict(cv_test_featuresDT)

In [27]:
from sklearn.metrics import confusion_matrix, classification_report

labels = ['negative','positive']
print(classification_report(test_sentimentsDT,lr_predictionsDT))

              precision    recall  f1-score   support

    negative       0.72      0.72      0.72      8367
    positive       0.71      0.71      0.71      8300

    accuracy                           0.72     16667
   macro avg       0.72      0.72      0.72     16667
weighted avg       0.72      0.72      0.72     16667



In [28]:
labels = ['negative','positive']
pd.DataFrame(confusion_matrix(test_sentimentsDT,lr_predictionsDT),index = labels,columns = labels)

Unnamed: 0,negative,positive
negative,5995,2372
positive,2370,5930


In [29]:
from sklearn.metrics import accuracy_score
accuracy_score(test_sentimentsDT,lr_predictionsDT)

0.7154856902861942

for TFIFD

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(use_idf =True, min_df=5,max_df=1.0,ngram_range=(1,2))

In [31]:
%%time
norm_train_reviewsDT_T = pre_process_corpus(train_reviewsDT)
norm_test_reviewsDT_T = pre_process_corpus(test_reviewsDT)

100%|███████████████████████████████████████████████████████████████████████████| 33333/33333 [00:48<00:00, 681.43it/s]
100%|███████████████████████████████████████████████████████████████████████████| 16667/16667 [00:25<00:00, 649.89it/s]

Wall time: 1min 14s





In [32]:
tv_train_featuresDT = tv.fit_transform(norm_train_reviewsDT_T)
tv_test_featuresDT = tv.transform(norm_test_reviewsDT_T)

In [33]:
lr_D.fit(tv_train_featuresDT,train_sentimentsDT)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [35]:
lr_predictionsDT = lr_D.predict(tv_test_featuresDT)

In [36]:
from sklearn.metrics import confusion_matrix, classification_report

labels = ['negative','positive']
print(classification_report(test_sentimentsDT,lr_predictionsDT))

              precision    recall  f1-score   support

    negative       0.72      0.71      0.71      8367
    positive       0.71      0.72      0.71      8300

    accuracy                           0.71     16667
   macro avg       0.71      0.71      0.71     16667
weighted avg       0.71      0.71      0.71     16667



In [37]:
labels = ['negative','positive']
pd.DataFrame(confusion_matrix(test_sentimentsDT,lr_predictionsDT),index = labels,columns = labels)

Unnamed: 0,negative,positive
negative,5904,2463
positive,2323,5977


In [38]:
from sklearn.metrics import accuracy_score
accuracy_score(test_sentimentsDT,lr_predictionsDT)

0.7128457430851383