In [39]:
import pandas as pd
import numpy as np
import nltk
import sklearn
import operator
import random
import math
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

pos_train = pd.read_csv('https://raw.githubusercontent.com/PuruTiwari/CMT307-C1978887/master/datasets_coursework1/IMDb/train/imdb_train_pos.txt', sep="\n", header=None)
neg_train = pd.read_csv('https://raw.githubusercontent.com/PuruTiwari/CMT307-C1978887/master/datasets_coursework1/IMDb/train/imdb_train_neg.txt', sep="\n", header=None)
pos_train = pos_train.iloc[:,0].as_matrix()
neg_train = neg_train.iloc[:,0].as_matrix()

pos_dev = pd.read_csv('https://raw.githubusercontent.com/PuruTiwari/CMT307-C1978887/master/datasets_coursework1/IMDb/dev/imdb_dev_pos.txt', sep="\n", header=None)
neg_dev = pd.read_csv('https://raw.githubusercontent.com/PuruTiwari/CMT307-C1978887/master/datasets_coursework1/IMDb/dev/imdb_dev_neg.txt', sep="\n", header=None)
pos_dev = pos_dev.iloc[:,0].as_matrix()
neg_dev = neg_dev.iloc[:,0].as_matrix()

pos_test = pd.read_csv('https://raw.githubusercontent.com/PuruTiwari/CMT307-C1978887/master/datasets_coursework1/IMDb/test/imdb_test_pos.txt', sep="\n", header=None)
neg_test = pd.read_csv('https://raw.githubusercontent.com/PuruTiwari/CMT307-C1978887/master/datasets_coursework1/IMDb/test/imdb_test_neg.txt', sep="\n", header=None)
pos_test = pos_test.iloc[:,0].as_matrix()
neg_test = neg_test.iloc[:,0].as_matrix()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


  
  from ipykernel import kernelapp as app


Preprocessing Data using lemmatizer and stopwords


In [40]:

lemmatizer = nltk.stem.WordNetLemmatizer()

def get_list_tokens(string):
  sentence_split=nltk.tokenize.sent_tokenize(string)
  list_tokens=[]
  for sentence in sentence_split:
    list_tokens_sentence=nltk.tokenize.word_tokenize(sentence)
    for token in list_tokens_sentence:
      list_tokens.append(lemmatizer.lemmatize(token).lower())
  return list_tokens

# get the english stopwords list from nltk
stopwords=set(nltk.corpus.stopwords.words('english'))
# add more words to the stopword list
stopwords.add("/")
stopwords.add(".")
stopwords.add(">")
stopwords.add("''")
stopwords.add(",")
stopwords.add("--")
stopwords.add("``")
stopwords.add("<")
stopwords.add("br")
stopwords.add("'s")
stopwords.add(")")
stopwords.add("(")

dict_word_frequency = {}

for pos_review in pos_train:
  sentence_tokens = get_list_tokens(pos_review)
  for word in sentence_tokens:
    if word in stopwords: continue
    if word not in dict_word_frequency: dict_word_frequency[word]=1
    else: dict_word_frequency[word]+=1

for neg_review in neg_train:
  sentence_tokens = get_list_tokens(neg_review)
  for word in sentence_tokens:
    if word in stopwords: continue
    if word not in dict_word_frequency: dict_word_frequency[word]=1
    else: dict_word_frequency[word]+=1

sorted_list = sorted(dict_word_frequency.items(), key=operator.itemgetter(1), reverse=True)[:1000]
i=0
for word, frequency in sorted_list[:15]:
  i+=1
  print (str(i)+". "+word+" - "+str(frequency))

vocabulary=[]
for word,frequency in sorted_list:
  vocabulary.append(word)
def get_vector_text(list_vocab,string):
  vector_text=np.zeros(len(list_vocab))
  list_tokens_string=get_list_tokens(string)
  for i, word in enumerate(list_vocab):
    if word in list_tokens_string:
      vector_text[i]=list_tokens_string.count(word)
  return vector_text


X_train=[]
Y_train=[]
for pos_review in pos_train:
  vector_pos_review=get_vector_text(vocabulary,pos_review)
  X_train.append(vector_pos_review)
  Y_train.append(1)
for neg_review in neg_train:
  vector_neg_review=get_vector_text(vocabulary,neg_review)
  X_train.append(vector_neg_review)
  Y_train.append(0)

X_dev=[]
Y_dev=[]
for pos_review in pos_dev:
  vector_pos_review=get_vector_text(vocabulary,pos_review)
  X_dev.append(vector_pos_review)
  Y_dev.append(1)
for neg_review in neg_dev:
  vector_neg_review=get_vector_text(vocabulary,neg_review)
  X_dev.append(vector_neg_review)
  Y_dev.append(0)

X_test=[]
Y_test=[]
for pos_review in pos_test:
  vector_pos_review=get_vector_text(vocabulary,pos_review)
  X_test.append(vector_pos_review)
  Y_test.append(1)
for neg_review in neg_test:
  vector_neg_review=get_vector_text(vocabulary,neg_review)
  X_test.append(vector_neg_review)
  Y_test.append(0)

TF_train = []
for f_vector in X_train:
  TF_train.append(f_vector/sum(f_vector))

X_train_temp = np.asarray(X_train)
IDF_vec = []
for i in range(X_train_temp.shape[1]):

  count_temp = 0
  for j in range(X_train_temp.shape[0]):
    if X_train_temp[j,i] != 0:
      count_temp += 1

  IDF_vec.append(math.log(X_train_temp.shape[0]/(count_temp+1)))

TF_IDF_train = []
for TF_vector in TF_train:
  TF_IDF_train.append(TF_vector * IDF_vec)

TF_dev = []
for f_vector in X_dev:
  TF_dev.append(f_vector/sum(f_vector))

TF_IDF_dev = []
for TF_vector in TF_dev:
  TF_IDF_dev.append(TF_vector * IDF_vec)

TF_test = []
for f_vector in X_test:
  TF_test.append(f_vector/sum(f_vector))

TF_IDF_test = []
for TF_vector in TF_test:
  TF_IDF_test.append(TF_vector * IDF_vec)  

1. movie - 29647
2. wa - 29577
3. film - 26929
4. n't - 19639
5. one - 15987
6. ! - 14847
7. like - 11876
8. ha - 9893
9. ? - 9593
10. time - 8589
11. good - 8376
12. character - 8318
13. would - 7867
14. ... - 7722
15. even - 7321


Feature selection


In [41]:
from sklearn.feature_extraction.text import CountVectorizer

train_drop_stopwords = []
for pos_review in pos_train:
  sentence_tokens = get_list_tokens(pos_review)
  new_sentence = ''
  for i, word in enumerate(sentence_tokens):
    if word in stopwords: continue
    if word not in vocabulary: continue
    if i == 0:
      new_sentence = new_sentence + word
    else:
      new_sentence = new_sentence + ' ' + word
  train_drop_stopwords.append(new_sentence) 

for neg_review in neg_train:
  sentence_tokens = get_list_tokens(neg_review)
  new_sentence = ''
  for i, word in enumerate(sentence_tokens):
    if word in stopwords: continue
    if word not in vocabulary: continue
    if i == 0:
      new_sentence = new_sentence + word
    else:
      new_sentence = new_sentence + ' ' + word
  train_drop_stopwords.append(new_sentence)

twoGram = CountVectorizer(min_df=1, ngram_range=(2,2))
twoGram_train = twoGram.fit_transform(train_drop_stopwords)
dev_drop_stopwords = []
for pos_review in pos_dev:
  sentence_tokens = get_list_tokens(pos_review)
  new_sentence = ''
  for i, word in enumerate(sentence_tokens):
    if word in stopwords: continue
    if word not in vocabulary: continue
    if i == 0:
      new_sentence = new_sentence + word
    else:
      new_sentence = new_sentence + ' ' + word
  dev_drop_stopwords.append(new_sentence)

for neg_review in neg_dev:
  sentence_tokens = get_list_tokens(neg_review)
  new_sentence = ''
  for i, word in enumerate(sentence_tokens):
    if word in stopwords: continue
    if word not in vocabulary: continue
    if i == 0:
      new_sentence = new_sentence + word
    else:
      new_sentence = new_sentence + ' ' + word
  dev_drop_stopwords.append(new_sentence)

twoGram_dev = twoGram.transform(dev_drop_stopwords)

test_drop_stopwords = []
for pos_review in pos_test:
  sentence_tokens = get_list_tokens(pos_review)
  new_sentence = ''
  for i, word in enumerate(sentence_tokens):
    if word in stopwords: continue
    if word not in vocabulary: continue
    if i == 0:
      new_sentence = new_sentence + word
    else:
      new_sentence = new_sentence + ' ' + word
  test_drop_stopwords.append(new_sentence)

for neg_review in neg_test:
  sentence_tokens = get_list_tokens(neg_review)
  new_sentence = ''
  for i, word in enumerate(sentence_tokens):
    if word in stopwords: continue
    if word not in vocabulary: continue
    if i == 0:
      new_sentence = new_sentence + word
    else:
      new_sentence = new_sentence + ' ' + word
  test_drop_stopwords.append(new_sentence)

twoGram_test = twoGram.transform(test_drop_stopwords)

from gensim.models import Word2Vec

W2V_base = Word2Vec(train_drop_stopwords, min_count=5, size=500, workers=4)
W2V_train = []
for sentence in train_drop_stopwords:
  temp_vector = np.zeros(500)
  count = 0
  for word in sentence:
    try:
      temp_vector += W2V_base[word]
      count += 1
    except:
      pass
  W2V_train.append(temp_vector/count)

W2V_dev = []
for sentence in dev_drop_stopwords:
  temp_vector = np.zeros(500)
  count = 0
  for word in sentence:
    try:
      temp_vector += W2V_base[word]
      count += 1
    except:
      pass
  W2V_dev.append(temp_vector/count)


W2V_test = []
for sentence in test_drop_stopwords:
  temp_vector = np.zeros(500)
  count = 0
  for word in sentence:
    try:
      temp_vector += W2V_base[word]
      count += 1
    except:
      pass
  W2V_test.append(temp_vector/count)

from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

TF_IDF_train_fix = np.asarray(TF_IDF_train)
TF_IDF_dev_fix = np.asarray(TF_IDF_dev)
TF_IDF_test_fix = np.asarray(TF_IDF_test)

Y_train_fix = np.asarray(Y_train)
Y_dev_fix = np.asarray(Y_dev)
Y_test_fix = np.asarray(Y_test)

TF_IDF_select = SelectKBest(chi2, k=500).fit(TF_IDF_train_fix, Y_train_fix)
TF_IDF_train_selected = TF_IDF_select.transform(TF_IDF_train_fix)
TF_IDF_dev_selected = TF_IDF_select.transform(TF_IDF_dev_fix)
TF_IDF_test_selected = TF_IDF_select.transform(TF_IDF_test_fix)

print ("Size original training matrix: "+str(TF_IDF_train_fix.shape))
print ("Size new training matrix: "+str(TF_IDF_train_selected.shape))

twoGram_select = SelectKBest(chi2, k=1000).fit(twoGram_train, Y_train_fix)
twoGram_train_selected = twoGram_select.transform(twoGram_train)
twoGram_dev_selected = twoGram_select.transform(twoGram_dev)
twoGram_test_selected = twoGram_select.transform(twoGram_test)

print ("Size original training matrix: "+str(twoGram_train.shape))
print ("Size new training matrix: "+str(twoGram_train_selected.shape))

W2V_train_fix = np.asarray(W2V_train)
W2V_dev_fix = np.asarray(W2V_dev)
W2V_test_fix = np.asarray(W2V_test)

W2V_select = SelectKBest(f_classif, k=300).fit(W2V_train_fix, Y_train_fix)
W2V_train_selected = W2V_select.transform(W2V_train_fix)
W2V_dev_selected = W2V_select.transform(W2V_dev_fix)
W2V_test_selected = W2V_select.transform(W2V_test_fix)

print ("Size original training matrix: "+str(W2V_train_fix.shape))
print ("Size new training matrix: "+str(W2V_train_selected.shape))

twoGram_train_selected = np.asarray(twoGram_train_selected.todense())
twoGram_dev_selected = np.asarray(twoGram_dev_selected.todense())
twoGram_test_selected = np.asarray(twoGram_test_selected.todense())

COM_train = np.column_stack((TF_IDF_train_selected, twoGram_train_selected, W2V_train_selected))
COM_dev = np.column_stack((TF_IDF_dev_selected, twoGram_dev_selected, W2V_dev_selected))
COM_test = np.column_stack((TF_IDF_test_selected, twoGram_test_selected, W2V_test_selected))

COM_select = SelectKBest(f_classif, k=1000).fit(COM_train, Y_train_fix)
COM_train_selected = COM_select.transform(COM_train)
COM_dev_selected = COM_select.transform(COM_dev)
COM_test_selected = COM_select.transform(COM_test)

print ("Size original training matrix: "+str(COM_train.shape))
print ("Size new training matrix: "+str(COM_train_selected.shape))



Size original training matrix: (15000, 1000)
Size new training matrix: (15000, 500)
Size original training matrix: (15000, 325515)
Size new training matrix: (15000, 1000)
Size original training matrix: (15000, 500)
Size new training matrix: (15000, 300)
Size original training matrix: (15000, 1800)
Size new training matrix: (15000, 1000)


Evaluating Performance

In [43]:

svm_1st = sklearn.svm.SVC(kernel="linear",gamma='auto')
svm_1st.fit(COM_train_selected, Y_train_fix)
Y_dev_pred = svm_1st.predict(COM_dev_selected)
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score

precision = precision_score(Y_dev_fix, Y_dev_pred, average='macro')
recall = recall_score(Y_dev_fix, Y_dev_pred, average='macro')
f1 = f1_score(Y_dev_fix, Y_dev_pred, average='macro')
accuracy = accuracy_score(Y_dev_fix, Y_dev_pred)

print(precision)
print(recall)
print(f1)
print(accuracy)

COM_train_new = np.column_stack((TF_IDF_train, twoGram_train_selected, W2V_train))
COM_dev_new = np.column_stack((TF_IDF_dev, twoGram_dev_selected, W2V_dev))
COM_test_new = np.column_stack((TF_IDF_test, twoGram_test_selected, W2V_test))

COM_select_new = SelectKBest(f_classif, k=1000).fit(COM_train_new, Y_train_fix)
COM_train_selected_new = COM_select_new.transform(COM_train_new)
COM_dev_selected_new = COM_select_new.transform(COM_dev_new)
COM_test_selected_new = COM_select_new.transform(COM_test_new)

print ("Size original training matrix: "+str(COM_train_new.shape))
print ("Size new training matrix: "+str(COM_test_selected_new.shape))
svm_2nd = sklearn.svm.SVC(kernel="linear",gamma='auto')
svm_2nd.fit(COM_train_selected_new, Y_train_fix)

Y_dev_pred = svm_2nd.predict(COM_dev_selected_new)

precision = precision_score(Y_dev_fix, Y_dev_pred, average='macro')
recall = recall_score(Y_dev_fix, Y_dev_pred, average='macro')
f1 = f1_score(Y_dev_fix, Y_dev_pred, average='macro')
accuracy = accuracy_score(Y_dev_fix, Y_dev_pred)

print(precision)
print(recall)
print(f1)
print(accuracy)

Y_test_pred = svm_1st.predict(COM_test_selected)

precision = precision_score(Y_test_fix, Y_test_pred, average='macro')
recall = recall_score(Y_test_fix, Y_test_pred, average='macro')
f1 = f1_score(Y_test_fix, Y_test_pred, average='macro')
accuracy = accuracy_score(Y_test_fix, Y_test_pred)

print('Precision - ',precision)
print('Recall - ',recall)
print('F1 - ',f1)
print('Accuracy ',accuracy)

Y_test_pred = svm_2nd.predict(COM_test_selected_new)

precision = precision_score(Y_test_fix, Y_test_pred, average='macro')
recall = recall_score(Y_test_fix, Y_test_pred, average='macro')
f1 = f1_score(Y_test_fix, Y_test_pred, average='macro')
accuracy = accuracy_score(Y_test_fix, Y_test_pred)

print('Precision - ',precision)
print('Recall - ',recall)
print('F1 - ',f1)
print('Accuracy ',accuracy)

0.8366259745608582
0.8358417300352851
0.835871323117324
0.836
Size original training matrix: (15000, 2500)
Size new training matrix: (5000, 1000)
0.8386029706945002
0.8374014908932879
0.8374164496750252
0.8376
Precision -  0.8392383302377096
Recall -  0.8376138940182231
F1 -  0.8374075946512065
Accuracy  0.8376
Precision -  0.835803264604811
Recall -  0.8342138134742101
F1 -  0.8340058465984154
Accuracy  0.8342
