In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so that the
# dataset archive stored on Drive is reachable from the cells below.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Path of the zipped training data on the mounted Drive.
train_dataset_zip = '/content/drive/My Drive/movie dataset/train.tsv.zip'

In [None]:
!unzip '/content/drive/My Drive/movie dataset/train.tsv.zip'

Archive:  /content/drive/My Drive/movie dataset/train.tsv.zip
  inflating: train.tsv               


# Step 1: Load the dataset

In [None]:
import pandas as pd


In [None]:
# Read the tab-separated training file into a DataFrame.
train_tsv_path = '/content/train.tsv'
train_dataset = pd.read_csv(train_tsv_path, sep='\t')

# Show the first five rows as a quick check of the columns.
train_dataset.head(5)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


# Clean the text, tokenize it, and lemmatize it

In [None]:
# Text-processing dependencies: regex cleaning, a progress bar, and NLTK's
# tokenizer, lemmatizer and stop-word corpus.
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# Fetch the NLTK resources, in the same order as before.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)

# One lemmatizer instance, shared by the preprocessing loop.
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Build `text`: one list of cleaned, lemmatized tokens per phrase.
text = []

# Load the English stop-word list once, as a set, instead of rebuilding and
# scanning it for every token. "not" and "no" are removed from the set so that
# they survive the filter, exactly as the original per-token condition kept them.
stop_words = set(stopwords.words('english')) - {"not", "no"}

for sent in tqdm(train_dataset['Phrase']):
    # replace every non-alphabetic character with a space
    clean_sent = re.sub('[^a-zA-Z]', " ", sent)

    # tokenize the lower-cased sentence
    token_sent = word_tokenize(clean_sent.lower())

    # drop stop words, then lemmatize what is left
    lemma = [lemmatizer.lemmatize(w) for w in token_sent if w not in stop_words]
    text.append(lemma)



100%|██████████| 156060/156060 [03:02<00:00, 857.10it/s]


# Convert the target into one-hot categories

In [None]:
from keras.utils import to_categorical

Using TensorFlow backend.


In [None]:
# One-hot encode the sentiment labels.
target = train_dataset['Sentiment']
y_target = to_categorical(target)

# The number of classes is the width of the one-hot matrix.
num_classes = y_target.shape[-1]


# Get the number of unique words and the maximum phrase length

In [None]:
# Collect the vocabulary and the length of the longest token list in `text`.
unique_word = set()
max_len = 0
for sent in tqdm(text):
    unique_word |= set(sent)
    max_len = max(max_len, len(sent))

print(f"maximum length {max_len}")
print(f"length of unique words {len(list(unique_word))}")




100%|██████████| 156060/156060 [00:00<00:00, 1143052.12it/s]

maximum length 30
length of unique words 13627





# Split train_dataset into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the phrases for validation, stratified on the one-hot labels,
# with a fixed random_state so the split is reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(
    text,
    y_target,
    test_size=.2,
    stratify=y_target,
    random_state=2,
)

# Convert the text into sequences and pad them

In [None]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer


In [None]:
# Fit the tokenizer on the training phrases only; the validation phrases are
# encoded with the same fitted vocabulary.
tokenizer = Tokenizer(num_words=len(unique_word))
tokenizer.fit_on_texts(X_train)

# Map tokens to integer ids, then pad every sequence to max_len.
# NOTE: X_train / X_val are overwritten with the padded arrays, so this cell
# must be run exactly once after the train/validation split cell.
X_train = sequence.pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_val = sequence.pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=max_len)


# Create the model

In [None]:
from keras.layers import Dense , Dropout , Embedding , LSTM

from keras.models import Sequential

from keras.optimizers import Adam

In [None]:
# Classifier: embedding -> two stacked LSTMs -> dense head with a softmax over
# the sentiment classes.
model = Sequential([
    # 300-dimensional embedding for each token id of the padded sequence
    Embedding(len(unique_word), 300, input_length=max_len),
    # first LSTM returns the full sequence so the second LSTM can consume it
    LSTM(128, dropout=.5, return_sequences=True),
    LSTM(64, dropout=.5, return_sequences=False),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

# NOTE(review): newer Keras releases name the Adam argument `learning_rate`
# rather than `lr` -- confirm against the installed Keras version.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(lr=.001),
    metrics=['accuracy'],
)

# Fit the model

In [None]:

# Train for 6 epochs in batches of 256, evaluating on the validation split
# after each epoch.
model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    epochs=6,
    batch_size=256,
    verbose=2,
)