In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer
import string
#for training
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.model_selection import cross_val_score
from keras.utils import to_categorical

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Using TensorFlow backend.


In [0]:
url= 'https://raw.githubusercontent.com/cacoderquan/Sentiment-Analysis-on-the-Rotten-Tomatoes-movie-review-dataset/master/train.tsv'
dataset = pd.read_csv(url, sep='\t')

In [4]:
dataset.shape

(156060, 4)

In [5]:
dataset.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [6]:
print(dataset.isnull().values.any())

False


In [7]:
dataset['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [0]:
porter = PorterStemmer()
lancaster=LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
stopwords_en = stopwords.words("english")
punctuations="?:!.,;'\"-()"
#parameters to adjust to see the impact on outcome
remove_stopwords = True
useStemming = False
useLemma = True
removePuncs = True

In [0]:
cleanReview=[]
for x in range(0,len(dataset['Phrase'].values)):
    tmpReview=[]
    for w in nltk.word_tokenize(dataset['Phrase'].values[x]):
        newWord = str(w).lower() #Set newWork to be the updated word
        if remove_stopwords and (w in stopwords_en):#if the word is a stopword & we want to remove stopwords
            continue #skip the word and don’t had it to the normalized review
        if removePuncs and (w in punctuations):#if the word is a punc. & we want to remove punctuations
            continue #skip the word and don’t had it to the normalized review
        if useStemming: #if useStemming is set to True
            #Keep one stemmer commented out
            #newWord = porter.stem(newWord) #User porter stemmer
            newWord = lancaster.stem(newWord) #Use Lancaster stemmer
        if useLemma:
            newWord = wordnet_lemmatizer.lemmatize(newWord)
        tmpReview.append(newWord) #Add normalized word to the tmp review
    cleanReview.append(' '.join(tmpReview))

In [10]:
len(cleanReview)

156060

In [0]:
dataset['UpdatedReview']=cleanReview

In [12]:
dataset.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,UpdatedReview
0,1,1,A series of escapades demonstrating the adage ...,1,a series escapade demonstrating adage good goo...
1,2,1,A series of escapades demonstrating the adage ...,2,a series escapade demonstrating adage good goose
2,3,1,A series,2,a series
3,4,1,A,2,a
4,5,1,series,2,series
5,6,1,of escapades demonstrating the adage that what...,2,escapade demonstrating adage good goose
6,7,1,of,2,
7,8,1,escapades demonstrating the adage that what is...,2,escapade demonstrating adage good goose
8,9,1,escapades,2,escapade
9,10,1,demonstrating the adage that what is good for ...,2,demonstrating adage good goose


In [0]:
X_train, X_test, Y_train, Y_test = train_test_split(dataset['UpdatedReview'],dataset['Sentiment'], test_size=0.3, random_state=2003)

In [14]:
#Here, X_train and X_test will have mutually exclusive sets of 109242 and 46818 samples. 
print(X_train.shape)
print(X_test.shape)

(109242,)
(46818,)


In [15]:
Y_train.value_counts()

2    55595
3    23055
1    19203
4     6468
0     4921
Name: Sentiment, dtype: int64

In [0]:
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer

# Transform each text into a vector of word counts
#vectorizer_tfid = CountVectorizer(stop_words="english",ngram_range=(1, 3), max_features=4000)
#vectorizer_tfid = TfidfVectorizer(stop_words="english",ngram_range=(1, 1))
vectorizer_tfid = TfidfVectorizer(ngram_range=(1, 3),max_features=5000)

X = vectorizer_tfid.fit_transform(dataset["UpdatedReview"])
Y = dataset['Sentiment']
x_train = vectorizer_tfid.transform(X_train)
y_train = Y_train
x_test = vectorizer_tfid.transform(X_test)
y_test = Y_test

In [17]:
x_train.shape,x_test.shape

((109242, 5000), (46818, 5000))

In [18]:
x_train

<109242x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 391930 stored elements in Compressed Sparse Row format>

In [19]:
x_train_np = x_train.toarray()
y_train_np = np.array(y_train)
# Convert the testing data
x_test_np = x_test.toarray()
y_test_np = np.array(y_test)
print(x_train_np.shape)
print(y_train_np.shape)

(109242, 5000)
(109242,)


In [20]:
x_train_np=x_train_np.reshape(x_train_np.shape[0],x_train_np.shape[1],1)
x_test_np=x_test_np.reshape(x_test_np.shape[0],x_test_np.shape[1],1)
print(x_train_np.shape)
print(x_test_np.shape)

y_train_np = to_categorical(y_train_np)
print(y_train_np.shape)
y_test_np = to_categorical(y_test_np)
print(y_test_np.shape)


(109242, 5000, 1)
(46818, 5000, 1)
(109242, 5)
(46818, 5)


In [0]:
#bow_feature= pd.DataFrame(x_train_np, columns=vectorizer.get_feature_names())
#bow_feature.head(30)

In [0]:
from keras import backend as K
def recall_m(y_true, y_pred):
  true_positives = K.sum(K.round(K.clip(y_true*y_pred, 0, 1)))
  possible_positives = K.sum(K.round(K.clip(y_true, 0 ,1)))
  recall = true_positives / (possible_positives + K.epsilon())
  return recall

def precision_m(y_true, y_pred):
  true_positives = K.sum(K.round(K.clip(y_true*y_pred, 0, 1)))
  predicted_positives = K.sum(K.round(K.clip(y_pred, 0 ,1)))
  precision = true_positives / (predicted_positives + K.epsilon())
  return precision

def f1_m(y_true, y_pred):
  precision = precision_m(y_true, y_pred)
  recall = recall_m(y_true, y_pred)
  return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [0]:
from keras.models import Sequential
from keras import layers
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout
from keras.layers import Activation
from keras.layers import AveragePooling1D
from keras import optimizers

In [24]:
cnn_model = Sequential()
#cnn_model.add(layers.Embedding(1715, 100))
cnn_model.add(Conv1D(filters=128, kernel_size=1, activation='tanh',input_shape=(x_train_np.shape[1],x_train_np.shape[2])))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(AveragePooling1D(pool_size=2))
cnn_model.add(Dropout(0.2))
cnn_model.add(Conv1D(filters=64, kernel_size=1, activation='relu'))
cnn_model.add(Flatten())
cnn_model.add(layers.Dense(5, activation='softmax'))







Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [0]:
#ada = optimizers.Adamax(lr=0.00001, decay=1e-6)

In [26]:
cnn_model.compile(optimizer="Adadelta",loss='categorical_crossentropy', metrics=['accuracy',f1_m,precision_m,recall_m])
cnn_model.summary()



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 5000, 128)         256       
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 2500, 128)         0         
_________________________________________________________________
average_pooling1d_1 (Average (None, 1250, 128)         0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1250, 128)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1250, 64)          8256      
_________________________________________________________________
flatten_1 (Flatten)          (None, 80000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)              

In [27]:
#testing results
print(cnn_model.metrics_names)
cnn_model.evaluate(x_test_np, y_test_np)

['loss', 'acc', 'f1_m', 'precision_m', 'recall_m']








[1.6094513139179227, 0.1738006749540775, 0.0, 0.0, 0.0]

In [0]:
output= cnn_model.fit(x_train_np, y_train_np, epochs=100, verbose=2, validation_split=0.2, batch_size = 512)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 87393 samples, validate on 21849 samples
Epoch 1/100
 - 27s - loss: 1.3083 - acc: 0.5054 - f1_m: 0.2741 - precision_m: 0.2771 - recall_m: 0.2739 - val_loss: 1.2918 - val_acc: 0.5083 - val_f1_m: 0.5083 - val_precision_m: 0.5083 - val_recall_m: 0.5083
Epoch 2/100
 - 27s - loss: 1.2829 - acc: 0.5091 - f1_m: 0.2868 - precision_m: 0.3718 - recall_m: 0.2738 - val_loss: 1.2659 - val_acc: 0.5083 - val_f1_m: 0.5099 - val_precision_m: 0.5160 - val_recall_m: 0.5040
Epoch 3/100
 - 27s - loss: 1.1690 - acc: 0.5387 - f1_m: 0.4586 - precision_m: 0.6562 - recall_m: 0.3732 - val_loss: 1.1211 - val_acc: 0.5520 - val_f1_m: 0.5316 - val_precision_m: 0.6161 - val_recall_m: 0.4676
Epoch 4/100
 - 27s - loss: 1.0772 - acc: 0.5705 - f1_m: 0.5199 - precision_m: 0.6580 - recall_m: 0.4312 - val_loss: 1.0981 - val_acc: 0.5599 - val_f1_m: 0.5363 - val_precision_m: 0.6197 - val_recall_m: 0.4728
Epoch 5/100
 - 27s

In [0]:
import matplotlib.pyplot as plt

# Plot training & validation accuracy values
plt.plot(output.history['acc'])
plt.plot(output.history['val_acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.plot(output.history['loss'])
plt.plot(output.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [0]:
#to save the model
!apt-get install libhdf5-serial-dev
import h5py
cnnmodel.save('/content/drive/My Drive/NLP_Assignment_2/0876833_1dconv_reg.h5')

In [0]:
#loading the model
from keras.models import load_model
!apt-get install libhdf5-serial-dev
import h5py
cnnmodel = load_model('/content/drive/My Drive/NLP_Assignment_2/0876888_1dconv_reg.h5', custom_objects={'f1_m': f1_m,'precision_m':precision_m,'recall_m':recall_m})