Exploring The Data


In [33]:
#importing libraries and downloading packages

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import nltk
nltk.download('punkt') 
nltk.download('stopwords') 
nltk.download('movie_reviews') 
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Mount Drive

In [34]:
#mounting drive 

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Reading Dataset


In [35]:
#Read database using pandas library

dataset =  pd.read_csv('/content/drive/My Drive/train.tsv', sep='\t')
dataset = dataset.dropna()
dataset.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [36]:
dataset.shape

(156060, 4)

In [37]:
#Value counts for each sentiment
dataset.Sentiment.value_counts()


2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

Adjustable Parameters

In [0]:
#Data cleaning

remove_fPunct = True
fTokenizaton = True
fStopwords = True
fStemming = False
fLemmatization = True


Data Cleaning | Punctuations

In [39]:
#punctuations
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [0]:
def remove_punctuation(text):
  txt_nonpunct = "".join([a for a in text if a not in string.punctuation])
  return txt_nonpunct 

In [0]:
if remove_fPunct:
  dataset['Phrase'] = dataset['Phrase'].apply(lambda x: remove_punctuation(x)) 

In [0]:
#Tokenization
import re

def tokenize(text):
  tokens = re.split('\W+', text)
  return tokens 

if fTokenizaton:
  dataset['Phrase'] = dataset['Phrase'].apply(lambda x: tokenize(x.lower()))

In [43]:
#Stopwords
import nltk 

stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [0]:
#remove stopwords
def remove_stopwords(txt_tokenized):
  txt_clean = [word for word in txt_tokenized if word not in stopwords]
  return txt_clean

if fStopwords:
  dataset['Phrase'] = dataset['Phrase'].apply(lambda x: remove_stopwords(x))

In [0]:
#Stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [0]:
def stemming(tokenized_text):
  text = [ps.stem(word) for word in tokenized_text]
  return text

In [0]:
if fStemming:
  dataset['Phrase'] = dataset['Phrase'].apply(lambda x: stemming(x))

In [0]:
#Lemmatization
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

def lemmatization(token_txt):
  text = [wn.lemmatize(word) for word in token_txt]
  return text


In [0]:
if fLemmatization:
  dataset['Phrase'] = dataset['Phrase'].apply(lambda x: lemmatization(x))

In [50]:
#Printing head of the dataset
dataset.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,"[series, escapade, demonstrating, adage, good,...",1
1,2,1,"[series, escapade, demonstrating, adage, good,...",2
2,3,1,[series],2
3,4,1,[],2
4,5,1,[series],2


In [51]:
#Split the database

X_train, X_test, Y_train, Y_test = train_test_split(dataset['Phrase'], dataset['Sentiment'], test_size=0.3, random_state=2003)
documents = []
X_train = np.array(X_train.values.tolist())
Y_train = np.array(Y_train.values.tolist())
for i in range(len(X_train)):
  documents.append([list(X_train[i]), Y_train[i]]) 

X_test = np.array(X_test.values.tolist())
Y_test = np.array(Y_test.values.tolist())
for i in range(len(X_test)):
  documents.append([list(X_test[i]), Y_test[i]]) 

print(documents[0][0])


dataset = pd.DataFrame(documents, columns=['text', 'sentiment']) 
dataset['join'] = dataset.text.apply(' '.join)

dataset.head()

['age']


Unnamed: 0,text,sentiment,join
0,[age],2,age
1,"[gorgeous, epic]",4,gorgeous epic
2,"[fan, grossout, comedy]",2,fan grossout comedy
3,"[filmmaker, ascends, literally, olympus, art, ...",4,filmmaker ascends literally olympus art world
4,"[twisting, mystery]",2,twisting mystery


In [0]:
X_train, X_test, Y_train, Y_test = train_test_split(dataset['join'],  dataset['sentiment'], test_size=0.3, random_state=2003)

In [0]:
#Vectorization 

from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from keras.utils import to_categorical

vectorizer = TfidfVectorizer(max_features = 2500)#, # ngram_range=(1, 1)) 
X = vectorizer.fit_transform(dataset["join"]) 
Y = dataset['sentiment'] 
 
X_train = vectorizer.transform(X_train).toarray()
Y_train = Y_train 
X_test = vectorizer.transform(X_test).toarray()
Y_test = Y_test



In [54]:
Y_test

13510     2
61932     0
82549     1
137718    3
121990    2
         ..
94224     2
135456    2
154729    1
23031     1
57870     2
Name: sentiment, Length: 46818, dtype: int64

In [0]:
#importing keras and thhen model parameters
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv1D, MaxPooling1D
from keras import backend as K

In [0]:
#defining batch size and number of epochs
batch_size = 64
num_classes = 5
epochs = 12

In [57]:
X_train.shape

(109242, 2500)

In [58]:
Y_train = keras.utils.to_categorical(Y_train, num_classes)
Y_test = keras.utils.to_categorical(Y_test, num_classes)
Y_test

array([[0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.]], dtype=float32)

In [0]:
#Code for recall, precision and F1 score
from keras import backend as K

def recall_measure(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_measure(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_measure(y_true, y_pred):
    precision = precision_measure(y_true, y_pred)
    recall = recall_measure(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [60]:
#Defining the model
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3,
                 activation='relu',
                 input_shape=(2500,1)))

model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))

model.add(MaxPooling1D(pool_size=1))
model.add(Dropout(rate = 0.25))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
# model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='sigmoid'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_3 (Conv1D)            (None, 2498, 64)          256       
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 2496, 64)          12352     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 2496, 64)          0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 2496, 64)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 159744)            0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                10223680  
_________________________________________________________________
dense_4 (Dense)              (None, 5)                

In [0]:
#Compile the model
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Nadam(),
              metrics=['accuracy',f1_measure,precision_measure,recall_measure])

In [0]:
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [63]:
#Train and test model to get results
model.fit(X_train, Y_train,
          batch_size=64,
          epochs=15)
score = model.evaluate(X_test, Y_test, verbose=0)




Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [64]:
print('Test loss:', score[0])
print('Test accuracy:', score[1])
print('F1 score:', score[2])
print('Precision:', score[3])
print('recall:', score[4])



Test loss: 1.1009235153021433
Test accuracy: 0.6270665128796616
F1 score: 0.6255403667799131
Precision: 0.5679140529912042
recall: 0.6993677645350079


In [0]:
from keras.models import load_model

model.save('1104360_1dconv_reg.h5')

In [70]:
from keras.models import load_model

model = load_model('/content/1104360_1dconv_reg.h5', 
                   custom_objects = {'f1:' f1_measure,  'precision:' precision_measure, 'recall :' recall_measure})

SyntaxError: ignored