## Movie Rating Prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [50]:
dfx=pd.read_csv("./Train.csv")

In [3]:
dfx.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [3]:
dfx.shape

(40000, 2)

In [4]:
df_test=pd.read_csv("./Test.csv")

In [5]:
X_test=df_test.values[:,0]

In [8]:
df_test.head()

Unnamed: 0,review
0,Remember those old kung fu movies we used to w...
1,This movie is another one on my List of Movies...
2,How in the world does a thing like this get in...
3,"""Queen of the Damned"" is one of the best vampi..."
4,The Caprica episode (S01E01) is well done as a...


In [9]:
df_test.shape

(10000, 1)

In [10]:
np.unique(dfx.label)

array(['neg', 'pos'], dtype=object)

In [11]:
dfx.groupby("label").size()

label
neg    19989
pos    20011
dtype: int64

In [6]:
X_train=dfx.values[:,0]
Y_train=dfx.values[:,1]

In [18]:
print(X_train[1])

http://video.google.com/videoplay?docid=211772166650071408&hl=en Distribution was tried.<br /><br />We opted for mass appeal.<br /><br />We want the best possible viewing range so, we forgo profit and continue our manual labor jobs gladly to entertain you for working yours.<br /><br />View Texas tale, please write about it... If you like it or not, if you like Alex or not, if you like Stuie, Texas or Texas tale... Just write about it.<br /><br />Your opinion rules.


In [54]:
print(X_train.shape)
print(Y_train.shape)

(40000,)
(40000,)


### 1. Converting String into numbers of Y_train

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
le=LabelEncoder()

In [9]:
Y_train=le.fit_transform(np.array(Y_train))
print(Y_train)

[1 1 1 ... 0 1 1]


In [10]:
print(le.inverse_transform(Y_train))

['pos' 'pos' 'pos' ... 'neg' 'pos' 'pos']


### 2. Cleaning the data

In [61]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [68]:
tokenizer = RegexpTokenizer(r'\w+')
en_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

In [69]:
print(en_stopwords)

{'have', 'in', 'your', 'what', 'who', 'we', 'this', 'd', 'over', 'other', "you'll", 'themselves', 'just', 'and', 'how', 'itself', 'been', "it's", 'of', 'shan', 'here', 'mightn', 'there', 'ours', "wasn't", 'ma', 'when', 'few', 'to', 'they', 'only', 'ourselves', 'i', 'won', 'it', 've', 'hadn', 'against', 'yours', 'are', 'down', 'or', 'having', 'being', 'until', 'on', 'doing', 'should', "hasn't", "won't", 'yourselves', 'him', 'herself', 'were', 'again', "isn't", 'haven', 'off', 'where', 'wouldn', 'myself', "that'll", 'above', 'his', 'own', 'with', 'o', 'from', "mightn't", 'hers', "needn't", 'our', 'for', "she's", 'up', 'all', 'why', 'needn', 'each', 'them', 'both', 'be', "didn't", 'by', 'ain', 'during', "couldn't", 'he', "you're", 'do', "doesn't", 'as', "haven't", 'their', 'don', 'himself', 'a', 'but', 'about', 'an', 'had', 'no', "you'd", 'so', 'isn', 'can', 'under', 'aren', 'she', "mustn't", 'if', "you've", 'her', 'does', 'these', 'am', 'doesn', 'into', "wouldn't", 'than', 'not', 'should

In [70]:
# Removing not from stopwords so that it can be used for generating better review
# Ex: Not good - will be treated as false
# Ex: Not bad - will be treated as true
# so bigram will be used for this purpose

en_stopwords.remove("no")
en_stopwords.remove("not")
print(en_stopwords)

{'have', 'in', 'your', 'what', 'who', 'we', 'this', 'd', 'over', 'other', "you'll", 'themselves', 'just', 'and', 'how', 'itself', 'been', "it's", 'of', 'shan', 'here', 'mightn', 'there', 'ours', "wasn't", 'ma', 'when', 'few', 'to', 'they', 'only', 'ourselves', 'i', 'won', 'it', 've', 'hadn', 'against', 'yours', 'are', 'down', 'or', 'having', 'being', 'until', 'on', 'doing', 'should', "hasn't", "won't", 'yourselves', 'him', 'herself', 'were', 'again', "isn't", 'haven', 'off', 'where', 'wouldn', 'myself', "that'll", 'above', 'his', 'own', 'with', 'o', 'from', "mightn't", 'hers', "needn't", 'our', 'for', "she's", 'up', 'all', 'why', 'needn', 'each', 'them', 'both', 'be', "didn't", 'by', 'ain', 'during', "couldn't", 'he', "you're", 'do', "doesn't", 'as', "haven't", 'their', 'don', 'himself', 'a', 'but', 'about', 'an', 'had', "you'd", 'so', 'isn', 'can', 'under', 'aren', 'she', "mustn't", 'if', "you've", 'her', 'does', 'these', 'am', 'doesn', 'into', "wouldn't", 'than', 'shouldn', 'between'

In [71]:
import re   # used for regular expression or you can use nltk RegexpTokenizer
from bs4 import BeautifulSoup as bs     # used to remove HTML tags

def clean(text):
    no_html = bs(text).get_text()
    clean = re.sub("[^a-z\s]+", " ", no_html, flags=re.IGNORECASE)
    return re.sub("(\s+)", " ", clean)

In [72]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def tokenize(text):
    clean_ = clean(text).lower()
    stemmed_tokens = [ps.stem(token) for token in clean_.split()]
    cl=[w for w in stemmed_tokens if not w in en_stopwords]
    return (" ".join(cl))

In [15]:
# def cleaning_reviews(review):
    
#     review=review.lower()
    
#     review = review.replace("<br /><br />"," ")
    
#     tokens=tokenizer.tokenize(review)
    
#     new_tokens = [token for token in tokens if token not in en_stopwords]
    
#     stemmed_tokens = [ps.stem(token) for token in new_tokens]
    
#     cleaned_review = ' '.join(stemmed_tokens)
    
#     return cleaned_review

In [74]:
print(X_train[0])
tokenize(X_train[0])

mature intelligent and highly charged melodrama unbelivebly filmed in China in 1948. wei wei's stunning performance as the catylast in a love triangle is simply stunning if you have the oppurunity to see this magnificent film take it


'matur intellig highli charg melodrama unbelivebl film china wei wei stun perform catylast love triangl simpli stun oppurun see thi magnific film take'

In [75]:
All_cleaned_reviews=[tokenize(X_train[i]) for i in range(X_train.shape[0])]

In [78]:
print(X_train[125])
All_cleaned_reviews[125]

First of all i want to say Ang Lee Did a very good job on this one! I watched it yesterday and i was presently surprised. The story is very good, but all the ignorant people would say "This sucks people cant fly!" to them i say IT'S FICTION and that it is. This is not to be taken as a film about reality you could say this is a "fairytale". And a very pleasant to watch Asian fairytale. The image's can actually blow your mind. Because there so artistically filmed , Ang Lee has a very (unapreciated u might say) big talent. The fight scene's are very cool and beautifully brought to the viewer. But it's sad but this film didn't get the appreciation it should have gotten. But Ang Lee did fortunately get the attention he deserved with his blockbuster broke back mountain. So even for viewers who are not interested in the story the images are entertaining enough!


'first want say ang lee veri good job thi one watch yesterday wa present surpris stori veri good ignor peopl would say thi suck peopl cant fli say fiction thi not taken film realiti could say thi fairytal veri pleasant watch asian fairytal imag actual blow mind becaus artist film ang lee ha veri unapreci u might say big talent fight scene veri cool beauti brought viewer sad thi film get appreci gotten ang lee fortun get attent deserv hi blockbust broke back mountain even viewer not interest stori imag entertain enough'

### 3. Vectorizing 

In [79]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [80]:
cv=CountVectorizer(ngram_range=(1,2))
tfidf=TfidfVectorizer(ngram_range=(1,2))

In [94]:
X_vector=cv.fit_transform(All_cleaned_reviews) ### has given 87% accuracy
print(X_vector.shape)

(40000, 2207176)


In [95]:
X_vector=tfidf.fit_transform(All_cleaned_reviews)

### 4. cleaning Test data

In [83]:
cleaned_test_reviews=[tokenize(X_test[i]) for i in range(X_test.shape[0])]

### 5. Vectorization on test case

In [96]:
X_test_vector=cv.transform(cleaned_test_reviews)
X_test_vector = tfidf.transform(cleaned_test_reviews)
print(X_test_vector.shape)

(10000, 2207176)


### 6. Building training model 

In [97]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB, GaussianNB

In [98]:
mnb = MultinomialNB()
print(mnb)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


### 7.Training

In [99]:
mnb.fit(tf_vector,Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### 8. Predictions

In [100]:
#Predictions
predict=mnb.predict(X_test_vector)

In [101]:
predict

array([0, 0, 0, ..., 1, 1, 0])

In [102]:
prediction=le.inverse_transform(predict)

In [103]:
prediction

array(['neg', 'neg', 'neg', ..., 'pos', 'pos', 'neg'], dtype=object)

In [104]:
Id=np.arange(0,X_test.shape[0])
Id

array([   0,    1,    2, ..., 9997, 9998, 9999])

### 9. Converting to CSV file

In [105]:
dataframe=pd.DataFrame({"Id":Id,"label":prediction})
dataframe.to_csv("submission.csv",index=False)

### Getting 87% Accuracy With Countvectorizer
### Getting 88% Accuracy with tfiDfVectorizer