# Predict sentiment score of moview review with Logistic Regression

## 1.Import data & Basic library

In [212]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set()

In [0]:
df = pd.read_csv('/content/drive/My Drive/FTMLE - Tonga/Data/movie_review.csv',encoding='utf-8', sep='\t')
ev = pd.read_csv('/content/drive/My Drive/FTMLE - Tonga/Data/movie_review_evaluation.csv',encoding='utf-8', sep='\t')

In [215]:
# Check sample of the dataset
df.sample(5)

Unnamed: 0,id,review,sentiment
9374,6265_7,Nick and Kelly are ready to be married but Tra...,1
1366,7727_3,I am commenting on this miniseries from the pe...,0
19843,10429_10,I'm not usually a fan of strictly romantic mov...,1
3186,12433_8,"This is a good movie, although people unfamili...",1
2766,5774_1,This movie is a modest effort by Spike Lee. He...,0


## 2.Overview of the data





In [216]:
# Check content of 1 comment for the meaning of sentiment
# 0 means negative ; 1 means positive
df.loc[2196,'review']

"This was awful. Andie Macdowell is a terrible actress. So wooden she makes a rocking horse look like it could do a better job. But then remember that turn in Four Weddings, equally as excruciating. Another film that portrays England as full of Chocolate box cottages, and village greens. I mean that school, how many schools apart from maybe Hogwarts look like that? The twee police station looked like the set from Heartbeat ( a nauseating British series set in the 60s).This film just couldn't make its mind up what it wanted to be- a comedy or a serious examination of the undercurrents in women's friendships. If it had stuck to the former then the graveyard sex scenes and the highly stupid storming of the wedding might just have worked( i say just). But those scenes just didn't work with the tragedy in the second half. I also find it implausible that Kate would ever speak to Molly again after her terrible behaviour. A final note- what is a decent actress like Staunton doing in this pile 

In [217]:
df.info()
# There is no missing data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22500 entries, 0 to 22499
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         22500 non-null  object
 1   review     22500 non-null  object
 2   sentiment  22500 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 527.5+ KB


In [218]:
df['sentiment'].value_counts(normalize= True)

# There is only 2 tye of sentiment, 0 and 1. 
# 0 means negative ; 1 means positive

1    0.501244
0    0.498756
Name: sentiment, dtype: float64

In [0]:
# Check for duplicate id ->  No duplicate found
df['id'].count() == df['id'].nunique()

# It does not support our prediction model so we can remove it 
df.drop(labels='id',axis=1,inplace = True)

In [220]:
df

Unnamed: 0,review,sentiment
0,With all this stuff going down at the moment w...,1
1,"\The Classic War of the Worlds\"" by Timothy Hi...",1
2,The film starts with a manager (Nicholas Bell)...,0
3,It must be assumed that those who praised this...,0
4,Superbly trashy and wondrously unpretentious 8...,1
...,...,...
22495,It seems like more consideration has gone into...,0
22496,I don't believe they made this film. Completel...,0
22497,"Guy is a loser. Can't get girls, needs to buil...",0
22498,This 30 minute documentary BuÃ±uel made in the...,0


In [221]:
# Check for duplicate review -> there are duplicated review (could be fake reviews)
# We will check some sample of duplicate review
df['review'].count() == df['review'].nunique()


False

In [222]:
# Sort the df base Columns review & Separate the duplicated review to check
df.sort_values('review', inplace = True)
df

Unnamed: 0,review,sentiment
7381,A Turkish Bath sequence in a film noir loc...,1
318,!!!! POSSIBLE MILD SPOILER !!!!!<br /><br />As...,0
5741,!!!!! OF COURSE THERE'S SPOILERS !!!!! I'm sur...,0
7836,!!!!! POSSIBLE SPOILER !!!!!<br /><br />You`d ...,0
378,### Spoilers! ### <br /><br />What is this mov...,0
...,...,...
3403,you must be seeing my comments over many films...,0
19068,zero day is based of columbine high school mas...,1
17180,{Possible spoilers coming up... you've been fo...,1
10232,{rant start} I didn't want to believe them at ...,0


In [0]:
check = df[df['review'].duplicated()]

In [224]:
check
# Look like therer is no duplicate review

Unnamed: 0,review,sentiment
13252,'Dead Letter Office' is a low-budget film abou...,0
6311,".......Playing Kaddiddlehopper, Col San Fernan...",1
19066,"<br /><br />Back in his youth, the old man had...",0
6658,A have a female friend who is currently being ...,1
4441,"A longtime fan of Bette Midler, I must say her...",1
...,...,...
2783,"Wow, here it finally is; the action \movie\"" w...",0
20989,You do realize that you've been watching the E...,0
2415,"in this movie, joe pesci slams dunks a basketb...",0
20020,it's amazing that so many people that i know h...,1


# Prediction model

## Step 1: Data Cleaning

In [225]:
# The special character could be recognize as part of the word & confuse our model
# So we need to clean these special characters (astrophoe,the comma,..)
df.sample(5)

Unnamed: 0,review,sentiment
8550,"Acidic, unremitting, and beautiful, John Schle...",1
14802,Tony Hawk Underground came at a point where th...,1
12637,"This movie is fun to watch , doesnt have much ...",1
1731,This is the Neil Simon piece of work that got ...,1
5686,When I was kid back in the 1970s a local theat...,0


In [0]:
# Overall look at the most common words
from collections import Counter
vocab = Counter()

for document in df['review']:
  for word in document.split(' '):
    vocab[word] += 1

In [227]:
vocab.most_common(20)

[('the', 258519),
 ('a', 139707),
 ('and', 137397),
 ('of', 128750),
 ('to', 119278),
 ('is', 92935),
 ('in', 77245),
 ('I', 59255),
 ('that', 57991),
 ('this', 51379),
 ('it', 48865),
 ('/><br', 45851),
 ('was', 42004),
 ('as', 38288),
 ('with', 37496),
 ('for', 36919),
 ('The', 30399),
 ('but', 30350),
 ('on', 27738),
 ('movie', 27342)]

In [228]:
# Initiate stop word on current vocab so check for error
from nltk.corpus import stopwords

stop = stopwords.words('english')
vocab_reduced = Counter()

for word, count in vocab.items():
  if not word in stop:
    vocab_reduced[word] = count

vocab_reduced.most_common(10)

[('I', 59255),
 ('/><br', 45851),
 ('The', 30399),
 ('movie', 27342),
 ('film', 24768),
 ('one', 18704),
 ('like', 16278),
 ('This', 11074),
 ('would', 10720),
 ('good', 10243)]

In [229]:
# download stop word 
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# Create tokennizer function
from nltk.stem import PorterStemmer

porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]


## Step 2 : Representation

In [0]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer


# #Tokenize the stop word
# stop = tokenizer_stop(stop)

def preprocessor(text):
    """ Return a cleaned version of text
    """
    # Remove HTML markup
    text = re.sub('<[^>]*>', '', text)
    # Save emoticons for later appending
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Remove any non-word character and append the emoticons,
    # removing the nose character for standarization. Convert to lower case
    text = (re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))
    
    return text

tfidf = TfidfVectorizer(stop_words=stop,
                        tokenizer=tokenizer_porter,
                        preprocessor=preprocessor)

In [232]:
# Test our processing funtion on 1 random data
print(df.loc[8333,'review'])

preprocessor(df.loc[8333,'review'])

To \Bend It Like Beckham\" may not mean much to us Americans who know very little about the other football (soccer), but to English sports fans, it is equivalent to \"Hit it Like Bonds\" or \"Dunk it Like Jordan.\" Any young soccer player dreams of bending a soccer ball around one player and into the net for a goal, much like star player David Beckham does, much like the young Indian girl, Jess (Parminder Nagra), does in the film Bend It Like Beckham. Jess loves to play pick up soccer games, the kind forbidden by her traditionalist mother. However, while playing one day, a passing friend named Jules (Keira Knightley) sees her play and invites her to try out for a traveling, all girls soccer team. After satisfying the coach Joe (Jonathan Rhys-Meyers), she makes the team, something she knows her mother would not approve of. The movie is not about disobeying parents, but rather a girl doing what she wants to do, even if that goes against culture, not just the parents. There is humor throw

'to bend it like beckham may not mean much to us americans who know very little about the other football soccer but to english sports fans it is equivalent to hit it like bonds or dunk it like jordan any young soccer player dreams of bending a soccer ball around one player and into the net for a goal much like star player david beckham does much like the young indian girl jess parminder nagra does in the film bend it like beckham jess loves to play pick up soccer games the kind forbidden by her traditionalist mother however while playing one day a passing friend named jules keira knightley sees her play and invites her to try out for a traveling all girls soccer team after satisfying the coach joe jonathan rhys meyers she makes the team something she knows her mother would not approve of the movie is not about disobeying parents but rather a girl doing what she wants to do even if that goes against culture not just the parents there is humor thrown throughout the movie especially when 

## Step 3: Classification

In [0]:
# split the dataset in train and test

from sklearn.model_selection import train_test_split

X = df['review']

y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.2)

In [234]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# ut the process into the pipeline

clf = Pipeline([('vect', tfidf),
                ('clf', LogisticRegression(random_state=0))])

clf.fit(X_train, y_train)

  'stop_words.' % sorted(inconsistent))


Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=<function preprocessor at 0x7fe12e8d79d8>,
                                 smooth_idf=True,
                                 stop_words=['i', 'me', 'my', 'myself', '...
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenizer_porter at 0x7fe12e7e0d90>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
         

In [253]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Now apply those above metrics to evaluate your model

predictions = clf.predict(X_test)
accuracy_score(y_test, predictions)

0.0

In [236]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.91      0.88      0.89      2242
           1       0.88      0.91      0.90      2258

    accuracy                           0.89      4500
   macro avg       0.89      0.89      0.89      4500
weighted avg       0.89      0.89      0.89      4500



In [0]:
# Predict the sentiment of evaluation set
tuananh_pred = clf.predict(ev['review'])

In [0]:
# Export the result
pred_result = pd.DataFrame(data=tuananh_pred)
pred_result.to_csv('tuananh.csv')

## Model testing

In [240]:
review_test = [
    "Not the worst spoof movie that's been made, but it is a big disappointment.",
    "I love this but would not recommend the movie for children",
    ":) this movie sucks so badly",
]

preds = clf.predict_proba(review_test)

for i in range(len(review_test)):
    print(f'{i} --> Negative, Positive = {preds[i]}')

0 --> Negative, Positive = [0.99111058 0.00888942]
1 --> Negative, Positive = [0.16310394 0.83689606]
2 --> Negative, Positive = [0.82644859 0.17355141]


# Logistic Regression with Spacy

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
import string
import re
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()

In [0]:
STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS))
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”"]

class CleanTextTransformer(TransformerMixin):
   def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]
   def fit(self, X, y=None, **fit_params):
        return self
def get_params(self, deep=True):
        return {}
    
def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = text.lower()
    return text
def tokenizeText(sample):
    tokens = parser(sample)
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas
    tokens = [tok for tok in tokens if tok not in STOPLIST]
    tokens = [tok for tok in tokens if tok not in SYMBOLS]
    return tokens

In [0]:
vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1))
clf = LinearSVC()

pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])

In [268]:
# train
pipe.fit(X_train, y_train)

# test
preds = pipe.predict(X_test)

print("accuracy:", accuracy_score(y_test, preds))



accuracy: 0.8617777777777778


## Reference:
[Towards Data Science: Logistic Regression with Spacy](https://towardsdatascience.com/machine-learning-for-text-classification-using-spacy-in-python-b276b4051a49)



