#**Sentiment Analysis based on Movie Review**

In [24]:
import numpy as np
import pandas as pd 
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import zipfile

In [5]:
zip = zipfile.ZipFile('train.tsv.zip')
zip.extractall()

In [6]:
zip = zipfile.ZipFile('test.tsv.zip')
zip.extractall()

In [8]:
df_train=pd.read_csv('train.tsv',sep='\t')
df_train=df_train.dropna(how='any')
df_train=df_train.drop(columns=['PhraseId','SentenceId'])
df_train

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
1,A series of escapades demonstrating the adage ...,2
2,A series,2
3,A,2
4,series,2
...,...,...
156055,Hearst 's,2
156056,forced avuncular chortles,1
156057,avuncular chortles,3
156058,avuncular,2


In [14]:
df_train['Sentiment'].unique()

array([1, 2, 3, 4, 0])

In [12]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
corpus1=[]
corpus2=[]
for i in range(0, 156060):
  review = re.sub('[^a-zA-Z]', ' ', df_train['Phrase'][i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y=df_train.iloc[:,-1].values

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [29]:
model=MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, learning_rate='adaptive', max_iter=100)
model.fit(X_train,y_train)



MLPClassifier(activation='relu', alpha=0.01, batch_size=256, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='adaptive',
              learning_rate_init=0.001, max_fun=15000, max_iter=100,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [30]:
res = model.predict(X_test)
res=np.round(res)
np.set_printoptions(precision=2)
print(res)

[2 4 2 ... 1 2 2]


In [31]:
from sklearn.metrics import accuracy_score
print("Accuracy Score for the algorithm=>{}%".format(round(accuracy_score(y_test,res)*100),2))

Accuracy Score for the algorithm=>63%


#**Testing the Model**

In [32]:
df_test=pd.read_csv('test.tsv',sep='\t')
df_test=df_test.dropna(how='any')
df_test=df_test.drop(columns=['PhraseId','SentenceId'])
df_test

Unnamed: 0,Phrase
0,An intermittently pleasing but mostly routine ...
1,An intermittently pleasing but mostly routine ...
2,An
3,intermittently pleasing but mostly routine effort
4,intermittently pleasing but mostly routine
...,...
66287,"A long-winded , predictable scenario ."
66288,"A long-winded , predictable scenario"
66289,"A long-winded ,"
66290,A long-winded


In [34]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
corpus1=[]
corpus2=[]
for i in range(0, 66292):
  review = re.sub('[^a-zA-Z]', ' ', df_train['Phrase'][i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()

In [36]:
res = model.predict(X_test)
res=np.round(res)
np.set_printoptions(precision=2)
print(res)

[2 4 2 ... 1 2 2]


In [None]:
for i in range(len(df_test)):
  for j in range(len(res)):
    if i==j:
      print("The catgory for this phrase{0} is{1}" .format(df_test['Phrase'][i],res[i]))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
The catgory for this phraseWith a cast that includes some of the top actors working in independent film is3
The catgory for this phrasea cast that includes some of the top actors working in independent film is2
The catgory for this phrasea cast is3
The catgory for this phrasecast is2
The catgory for this phrasethat includes some of the top actors working in independent film is2
The catgory for this phraseincludes some of the top actors working in independent film is1
The catgory for this phraseincludes is3
The catgory for this phrasesome of the top actors working in independent film is1
The catgory for this phraseof the top actors working in independent film is3
The catgory for this phrasethe top actors working in independent film is2
The catgory for this phrasethe top actors is2
The catgory for this phrasetop actors is2
The catgory for this phrasetop is2
The catgory for this phraseworking in independent film is2
The catg