In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [3]:
train= pd.read_csv('train.tsv',sep="\t")
test= pd.read_csv('test.tsv',sep="\t")



In [4]:
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [6]:
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.stem import SnowballStemmer,WordNetLemmatizer
from string import punctuation
import re

stemmer= SnowballStemmer('english')
lemma= WordNetLemmatizer()

In [7]:
def text_prep(review_col):
    review_corpus=[]
    for i in range(0,len(review_col)):
        review= str(review_col[i])
        review= re.sub('[^a-zA-Z]'," ",review)
        
        review=[lemma.lemmatize(w) for  w in word_tokenize(str(review).lower())]
        
        review= " ".join(review)
        review_corpus.append(review)
    return review_corpus


In [8]:
train['clean_review']=text_prep(train.Phrase.values)

In [9]:
train["Sentiment"].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

## balencing the data

In [10]:
from sklearn.utils import resample
train_2= train[train["Sentiment"]==2]
train_1= train[train["Sentiment"]==1]
train_3= train[train["Sentiment"]==3]
train_4= train[train["Sentiment"]==4]
train_5= train[train['Sentiment']==0]

train_2_sample= resample(train_2,replace=True,n_samples=75000,random_state=123)
train_1_sample= resample(train_1,replace=True,n_samples=75000,random_state=123)
train_3_sample= resample(train_3,replace=True,n_samples=75000,random_state=123)
train_4_sample= resample(train_4,replace=True,n_samples=75000,random_state=123)
train_5_sample= resample(train_5,replace=True,n_samples=75000,random_state=123)


df_upsampled= pd.concat([train_2,train_1_sample,train_3_sample,train_4_sample,train_5_sample])
df_upsampled.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,clean_review
1,2,1,A series of escapades demonstrating the adage ...,2,a series of escapade demonstrating the adage t...
2,3,1,A series,2,a series
3,4,1,A,2,a
4,5,1,series,2,series
5,6,1,of escapades demonstrating the adage that what...,2,of escapade demonstrating the adage that what ...


In [11]:
train.shape

(156060, 5)

In [12]:
df_upsampled.shape

(379582, 5)

In [13]:
test["clean_review"]= text_prep(test["Phrase"].values)
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,clean_review
0,156061,8545,An intermittently pleasing but mostly routine ...,an intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...,an intermittently pleasing but mostly routine ...
2,156063,8545,An,an
3,156064,8545,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


In [14]:
text= " ".join(df_upsampled.loc[df_upsampled.Sentiment ==4, 'Phrase'].values)

In [15]:
from nltk.util import ngrams
text_trigram= [i for i in ngrams(text.split(),3)]

In [16]:
from collections import Counter
Counter(text_trigram).most_common(30)

[(('one', 'of', 'the'), 1644),
 (('of', 'the', 'year'), 832),
 (('of', 'the', 'best'), 677),
 (('of', 'the', 'most'), 612),
 (('is', 'one', 'of'), 407),
 (('One', 'of', 'the'), 370),
 ((',', 'and', 'the'), 333),
 (('the', 'year', "'s"), 326),
 (('It', "'s", 'a'), 323),
 (('the', 'edge', 'of'), 300),
 (('it', "'s", 'a'), 299),
 (('a', 'movie', 'that'), 297),
 (('of', 'your', 'seat'), 273),
 (('the', 'film', 'is'), 267),
 (('the', 'kind', 'of'), 267),
 (('.', 'is', 'a'), 264),
 (('the', 'film', "'s"), 264),
 (('as', 'one', 'of'), 254),
 ((',', 'the', 'film'), 253),
 (('edge', 'of', 'your'), 249),
 ((',', 'this', 'is'), 236),
 (('as', 'well', 'as'), 231),
 ((',', 'it', "'s"), 226),
 (('film', 'that', 'is'), 223),
 (('.', 'It', "'s"), 218),
 (('a', 'film', 'that'), 211),
 ((',', 'funny', ','), 208),
 (('some', 'of', 'the'), 206),
 (('year', "'s", 'best'), 188),
 (('a', 'solid', 'cast'), 178)]

In [17]:
from nltk.tokenize import TweetTokenizer
tokenizer= TweetTokenizer()
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer= TfidfVectorizer(ngram_range=(1,2), tokenizer= tokenizer.tokenize)
full_text = list(df_upsampled['clean_review'].values) + list(test['clean_review'].values)

vectorizer.fit(full_text)

df_upsampled_vectorize= vectorizer.transform(df_upsampled["clean_review"].values)
test_vectorized= vectorizer.transform(test["clean_review"])
test1 = test["clean_review"]



In [18]:
y= df_upsampled["Sentiment"]

## ML model

### Logistic regression

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier 

In [20]:
logreg=LogisticRegression()
ovr= OneVsRestClassifier(logreg)

In [21]:
%%time
ovr.fit(df_upsampled_vectorize,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Wall time: 37 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


OneVsRestClassifier(estimator=LogisticRegression())

In [22]:
from sklearn.model_selection import cross_val_score
score= cross_val_score(ovr,df_upsampled_vectorize,y,scoring='accuracy',cv=5,n_jobs=-1)



In [23]:
print("mean: {}, std :{}".format(round(np.mean(score),2),round(np.std(score),2)))

mean: 0.73, std :0.0


In [24]:
score

array([0.73721301, 0.7293755 , 0.73167712, 0.72573634, 0.72457716])

### Linear SVC

In [25]:
from sklearn.svm import LinearSVC
svc= LinearSVC(dual=False)
scores = cross_val_score(svc, df_upsampled_vectorize, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 77.44%, std 0.46.


### Naive bayes

In [26]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
scores= cross_val_score(model,df_upsampled_vectorize,y,cv=3,scoring="accuracy",n_jobs=-1)
print("mean  :",round(np.mean(scores),2))
print("std :",round(np.std(scores),2))

mean  : 0.59
std : 0.0


In [27]:
from tensorflow import keras

from tensorflow.keras.utils import to_categorical
X= df_upsampled["clean_review"]

Y= to_categorical(df_upsampled["Sentiment"].values)
Y

array([[0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.]], dtype=float32)

In [28]:
## spliting training and testing

from sklearn.model_selection import train_test_split
x_train,x_val,y_train,y_val = train_test_split(X,Y, test_size=.25,random_state=42)

print(x_train.shape, y_train.shape)
print(x_val.shape,y_val.shape)

(284686,) (284686, 5)
(94896,) (94896, 5)


In [29]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [30]:
#total number of words/features

In [31]:
all_words= ' '.join(x_train)
all_words= word_tokenize(all_words)


In [32]:
from nltk import FreqDist

dist= FreqDist(all_words)
num_unique_word= len(dist)
num_unique_word

13738

In [33]:
#max_word_lenght_of_reviews
lst_of_length= []
for text in x_train:
    word= word_tokenize(text)
    length= len(word)
    lst_of_length.append(length)
    
max_length= np.max(lst_of_length)
max_length
    

48

In [34]:
max_features= num_unique_word
max_word= max_length
batch_size=128
epoch= 5
num_classes= 5

In [42]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [36]:
#tokenizing of words

tokenizer = Tokenizer(num_words= max_features)
tokenizer.fit_on_texts(x_train)
x_train= tokenizer.texts_to_sequences(x_train)
x_val= tokenizer.texts_to_sequences(x_val)

x_test= tokenizer.texts_to_sequences(test1)

In [38]:
# padding
x_train= pad_sequences(x_train, maxlen= max_word)
x_val= pad_sequences(x_val, maxlen= max_word)
x_test= pad_sequences(x_test,maxlen=max_word)



In [55]:
x_val.shape

(94896, 48)

In [49]:
x_train.shape

(284686, 48)

## LSTM

In [44]:
model1= Sequential()
model1.add(Embedding(max_features,100,mask_zero=True))

model1.add(LSTM(64,dropout=0.4,recurrent_dropout=0.4,return_sequences=True))
model1.add(LSTM(32,dropout=0.5, recurrent_dropout=0.5,return_sequences=False))
model1.add(Dense(num_classes,activation='softmax'))





In [45]:
model1.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.001),metrics=['accuracy'])
model1.summary()

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 100)         1373800   
_________________________________________________________________
lstm (LSTM)                  (None, None, 64)          42240     
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense (Dense)                (None, 5)                 165       
Total params: 1,428,621
Trainable params: 1,428,621
Non-trainable params: 0
_________________________________________________________________


In [47]:
model1.fit(x_train, y_train, validation_data=(x_val,y_val),epochs= 3,batch_size= batch_size,verbose=1)

Epoch 1/3

KeyboardInterrupt: 

### CNN+GRU

In [57]:
from tensorflow.keras.layers import Conv1D,MaxPooling1D,Dropout,Flatten,Dense,GRU
model2= Sequential()
model2.add(Embedding(max_features,100,input_length=max_word))
model2.add(Conv1D(64,kernel_size=3,padding='same',activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(0.25))
model2.add(GRU(128,return_sequences=True))
model2.add(Dropout(0.3))
model2.add(Flatten())
model2.add(Dense(128,activation='relu'))
model2.add(Dropout(0.5))
model2.add(Dense(5,activation='softmax'))
model2.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.001),metrics=['accuracy'])
model2.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 48, 100)           1373800   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 48, 64)            19264     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 24, 64)            0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 24, 64)            0         
_________________________________________________________________
gru_3 (GRU)                  (None, 24, 128)           74496     
_________________________________________________________________
dropout_9 (Dropout)          (None, 24, 128)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 3072)             

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [58]:
%%time
model2.fit(x_train,y_train,validation_data=(x_val,y_val),epochs= 30,batch_size=batch_size,verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Wall time: 13min 11s


<keras.callbacks.History at 0x28341ba9488>

In [71]:
result=model2.predict(x_test)


In [73]:
len(result)

66292

In [1]:
#testing with manual input

In [77]:
def get_class(result):
    output=[]
    for i in range(len(result)):
        r= np.argmax(result[i])
        output.append(r)
    return np.array(output)
    

In [64]:
text1 = "This movie is fantastic! I really like it because it is so good!"
text2 = "Good movie!"
text3 = "Maybe I like this movie."
text4 = "Meh ..."
text5 = "If I were a drunk teenager then this movie might be good."
text6 = "Bad movie!"
text7 = "Not a good movie!"
text8 = "This movie really sucks! Can I get my money back please?"
texts = [text1, text2, text3, text4, text5, text6, text7, text8]

In [66]:
tokens = tokenizer.texts_to_sequences(texts)
tokens_pad = pad_sequences(tokens, maxlen=max_word)
tokens_pad.shape


(8, 48)

In [78]:
pred= model2.predict(tokens_pad)
get_class(pred)

array([4, 3, 3, 2, 1, 0, 1, 0], dtype=int64)

In [59]:
model2.save("model.h5")

In [89]:
model2.save_weights('modelweights')

In [83]:
import pickle
pickle.dump(tokenizer,open("tokenizer.pkl","wb"))

In [85]:
from tensorflow.keras.models import load_model
loaded_model= load_model("model.h5")
loaded_tokenizer= pickle.load(open("tokenizer.pkl","rb"))

In [93]:


token= loaded_tokenizer.texts_to_sequences(texts)
token_pads= pad_sequences(token,maxlen= max_word)

pred2= loaded_model.predict(token_pads)#got an error here. failed to convert nparray to tensor
get_class(pred2)

array([4, 3, 3, 2, 1, 0, 1, 0], dtype=int64)

In [94]:
print(max_word)

48


In [95]:
"""thanks to joseph
   reference: [https://github.com/joseph10081987/Machine-Learning_new/blob/master/Movie%20Review_DL.ipynb]
"""

'thanks to joseph\n   reference: [https://github.com/joseph10081987/Machine-Learning_new/blob/master/Movie%20Review_DL.ipynb]\n'