In [14]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from scipy.sparse import coo_matrix
from collections import Counter

In [15]:
# read training data for IMDB and yelp
IT = pd.read_csv('IMDB-train-prep.txt',sep = '\t',header = None)
YT = pd.read_csv('yelp-train-prep.txt',sep = '\t',header = None)

# read validation data
IV = pd.read_csv('IMDB-valid-prep.txt',sep = '\t',header = None)
YV = pd.read_csv('yelp-valid-prep.txt',sep = '\t',header = None)

# read test data
ITe = pd.read_csv('IMDB-test-prep.txt',sep = '\t',header = None)
YTe = pd.read_csv('yelp-test-prep.txt',sep = '\t',header = None)

# read vocab
Iv = pd.read_csv('IMDB-vocab.txt',sep = '\t',header = None)
Yv = pd.read_csv('yelp-vocab.txt',sep = '\t',header = None)

In [6]:
# def remove_duplicates(l):
#     newlist = []
#     for el in l:
#        if el not in newlist:
#            newlist.append(el)
#     return newlist

# # bag of words
# def bow(df):
# #   extract all non-zero index
#     col = []
#     row = []
#     for i in range(0,len(df[0])):
#         try:
#             review = remove_duplicates(df[0][i].split(' '))
#             col = np.append(col,review)
#             wc = np.full(len(review),i)
#             row = np.append(row, wc)
#         except:
#             continue
    
#     col = list(map(int, col))
#     vMatrix = coo_matrix((np.ones(len(row)), (row, col)), shape=(len(df), 10000))
#     return vMatrix

# # bag of frequency
# def bof(df):
#     col = []
#     row = []
#     data = []
#     for i in range(0,len(df[0])):
#         try:
#             review = df[0][i].split(' ')
#             col = np.append(col,review)
#             wc = np.full(len(review),i)
#             row = np.append(row, wc)
#             data = np.append(data,np.full(len(review),1/len(review)))
#         except:
#             continue
    
#     col = list(map(int, col))
#     vMatrix = coo_matrix((data, (row, col)), shape=(len(df), 10000))
#     return vMatrix

In [16]:
# bag of words
def bow(df):
    vMatrix = np.zeros((len(df), 10000))
    for i in range(0,len(df[0])):
        try:
            review = df.iloc[i,0].split(' ')
            review = list(map(int, review))
            for j in range(0, len(review)):
                vMatrix[i][review[j]] = 1
        except:
            continue
    return vMatrix

# bag of frequency
def bof(df):
    vMatrix = np.zeros((len(df), 10000))
    for i in range(0,len(df[0])):
        try:
            comment = df.iloc[i,0].split(' ')
            comment = list(map(int, comment))
            for j in range(0, len(comment)):
                vMatrix[i][comment[j]] = vMatrix[i][comment[j]]+ 1/len(comment)
        except:
            continue
    return vMatrix

In [17]:
# vectorize yelp data
yTw = bow(YT)
yVw = bow(YV)
yTew = bow(YTe)
yTf = bof(YT)
yVf = bof(YV)
yTef = bof(YTe)

In [18]:
# vectorize IMDB data
iTw = bow(IT)
iVw = bow(IV)
iTew = bow(ITe)
iTf = bof(IT)
iVf = bof(IV)
iTef = bof(ITe)

In [49]:
# random classifier
def rclf(nClass,x):
    result = np.zeros(x.shape[0])
    for i in range(0,x.shape[0]):
        result[i] = np.floor(np.random.random() / (1/nClass)) + 1
    return result

# majority classifier
def mclf(x,y):
    data = Counter(y)
    return np.full(x.shape[0],data.most_common(1)[0][0])

In [20]:
# naive bayes
def nbclf(x,y,a,test):
    clf = BernoulliNB(alpha = a)
    clf.fit(x,y)
    return clf.predict(test)

def ngclf(x,y, test):
    clf = GaussianNB()
    clf.fit(x, y)
    return clf.predict(test)

# linear SVM
def sclf(x,y,c,test):
    clf = LinearSVC(C=c, random_state=0)
    clf.fit(x, y)
    return clf.predict(test)

# decision tree 
def dclf(x,y,md,test):
    clf = DecisionTreeClassifier(max_depth=md, random_state=0)
    clf.fit(x, y)
    return clf.predict(test)

In [50]:
# test random and majority classifier
ry = f1_score(list(YTe[1]), rclf(5,yTew), average='weighted')
my = f1_score(list(YTe[1]), mclf(yTew,YT[1]), average='weighted')

ri = f1_score(list(ITe[1]), rclf(2,iTew), average='weighted')
mi = f1_score(list(ITe[1]), mclf(iTew,IT[1]), average='weighted')

print("Random classifier f1-measure:", ry)
print("Majority classifier f1-measure:", my)
print("Random classifier f1-measure:", ri)
print("Majority classifier f1-measure:", mi)

Random classifier f1-measure: 0.23746167753661265
Majortity classifier f1-measure: 0.18238490007401925
Random classifier f1-measure: 0.2506872783776246
Majortity classifier f1-measure: 0.33333333333333326


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [22]:
# validation
# each parameter is tested with 10 values
numP = 10
nbb = np.zeros(numP)
nbg = np.zeros(numP)
sw = np.zeros(numP)
sf = np.zeros(numP)
dw = np.zeros(numP)
df = np.zeros(numP)

param = 0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000,10000

for i in range(0,numP):
    nbb[i] = f1_score(list(YV[1]), nbclf(yTw,list(YT[1]),param[i],yVw), average='weighted')
#     nbg[i] = f1_score(list(YV[1]), ngclf(yTf,list(YT[1]),param[i],yVf), average='weighted')
    sw[i] = f1_score(list(YV[1]), sclf(yTw,list(YT[1]),param[i],yVw), average='weighted')
    sf[i] = f1_score(list(YV[1]), sclf(yTf,list(YT[1]),param[i],yVf), average='weighted')
    dw[i] = f1_score(list(YV[1]), dclf(yTw,list(YT[1]),param[i],yVw), average='weighted')
    df[i] = f1_score(list(YV[1]), dclf(yTf,list(YT[1]),param[i],yVf), average='weighted')
    
bnbb = np.amax(nbb)
# bnbg = np.amax(nbg)
bnbg = f1_score(list(YV[1]), ngclf(yTf,list(YT[1]),yVf), average='weighted')
bsw = np.amax(sw)
bsf = np.amax(sf)
bdw = np.amax(dw)
bdf = np.amax(df)

  'precision', 'predicted', average, warn_for)


In [26]:
nbb = list(nbb)
sw = list(sw)
sf = list(sf)
dw = list(dw)
df = list(df)

In [28]:
print("Validation f1-measure")
print("Best Bernoulli Naive Bayes validation f1-measure:", bnbb, " Obtained at alpha = ", param[nbb.index(bnbb)])
print("Best Gaussian Naive Bayes validation f1-measure:", bnbg)
print("Best SVM Bag of words validation f1-measure:", bsw, " Obtained at C = ", param[sw.index(bsw)])
print("Best SVM Bag of frequency validation f1-measure:", bsf, " Obtained at C = ", param[sf.index(bsf)])
print("Best Decision Tree Bag of words validation f1-measure:", bdw, " Obtained at max depth of ", param[dw.index(bdw)])
print("Best Decision Tree Bag of frequency validation f1-measure:", bdf, " Obtained at max depth of ", param[df.index(bdf)])

print("Training f1-measure")
print("Bernoulli Naive Bayes training f1-measure at best alpha:",f1_score(list(YT[1]), nbclf(yTw,list(YT[1]),param[nbb.index(bnbb)],yTw), average='weighted') )
print("Gaussian Naive Bayes training f1-measure:",f1_score(list(YT[1]), ngclf(yTf,list(YT[1]),yTf), average='weighted'))
print("SVM Bag of words training f1-measure at best C:",f1_score(list(YT[1]), sclf(yTw,list(YT[1]),param[sw.index(bsw)],yTw), average='weighted'))
print("SVM Bag of frequency training f1-measure at best C:",f1_score(list(YT[1]), sclf(yTf,list(YT[1]),param[sf.index(bsf)],yTf), average='weighted'))
print("Decision Tree Bag of words training f1-measure at best max depth:",f1_score(list(YT[1]), dclf(yTw,list(YT[1]),param[dw.index(bdw)],yTw), average='weighted'))
print("Decision Tree Bag of frequency training f1-measure at best max depth:", f1_score(list(YT[1]), dclf(yTf,list(YT[1]),param[df.index(bdf)],yTf), average='weighted'))

print("Test f1-measure")
print("Bernoulli Naive Bayes test f1-measure at best alpha:",f1_score(list(YTe[1]), nbclf(yTw,list(YT[1]),param[nbb.index(bnbb)],yTew), average='weighted') )
print("Gaussian Naive Bayes test f1-measure:",f1_score(list(YTe[1]), ngclf(yTf,list(YT[1]),yTef), average='weighted'))
print("SVM Bag of words test f1-measure at best C:",f1_score(list(YTe[1]), sclf(yTw,list(YT[1]),param[sw.index(bsw)],yTew), average='weighted'))
print("SVM Bag of frequency test f1-measure at best C:",f1_score(list(YTe[1]), sclf(yTf,list(YT[1]),param[sf.index(bsf)],yTef), average='weighted'))
print("Decision Tree Bag of words test f1-measure at best max depth:",f1_score(list(YTe[1]), dclf(yTw,list(YT[1]),param[dw.index(bdw)],yTew), average='weighted'))
print("Decision Tree Bag of frequency test f1-measure at best max depth:", f1_score(list(YTe[1]), dclf(yTf,list(YT[1]),param[df.index(bdf)],yTef), average='weighted'))

Validation f1-measure
Best Bernoulli Naive Bayes validation f1-measure: 0.413109637653621  Obtained at alpha =  0.01
Best Gaussian Naive Bayes validation f1-measure: 0.29237439463776815
Best SVM Bag of words validation f1-measure: 0.48780633736872003  Obtained at C =  0.01
Best SVM Bag of frequency validation f1-measure: 0.4882976993661879  Obtained at C =  100
Best Decision Tree Bag of words validation f1-measure: 0.34872356922850845  Obtained at max depth of  10
Best Decision Tree Bag of frequency validation f1-measure: 0.3907368652210626  Obtained at max depth of  10
Training f1-measure
Bernoulli Naive Bayes training f1-measure at best alpha: 0.7502393423348799
Gaussian Naive Bayes training f1-measure: 0.8077916059661376
SVM Bag of words training f1-measure at best C: 0.8408856489275267
SVM Bag of frequency training f1-measure at best C: 0.8795874257831433
Decision Tree Bag of words training f1-measure at best max depth: 0.4981599437858736
Decision Tree Bag of frequency training f1-

In [29]:
# validation
# each parameter is tested with 100000 values
numP = 10
nbb = np.zeros(numP)
nbg = np.zeros(numP)
sw = np.zeros(numP)
sf = np.zeros(numP)
dw = np.zeros(numP)
df = np.zeros(numP)

param = 0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000,10000

# naive bayes
for i in range(0,numP):
    nbb[i] = f1_score(list(IV[1]), nbclf(iTw,list(IT[1]),param[i],iVw), average='weighted')
#     nbg[i] = f1_score(list(IV[1]), ngclf(iTf,list(IT[1]),param[i],iVf), average='weighted')
    sw[i] = f1_score(list(IV[1]), sclf(iTw,list(IT[1]),param[i],iVw), average='weighted')
    sf[i] = f1_score(list(IV[1]), sclf(iTf,list(IT[1]),param[i],iVf), average='weighted')
    dw[i] = f1_score(list(IV[1]), dclf(iTw,list(IT[1]),param[i],iVw), average='weighted')
    df[i] = f1_score(list(IV[1]), dclf(iTf,list(IT[1]),param[i],iVf), average='weighted')
    
bnbb = np.amax(nbb)
# bnbg = np.amax(nbg)
bnbg = f1_score(list(IV[1]), ngclf(iTf,list(IT[1]),iVf), average='weighted')
bsw = np.amax(sw)
bsf = np.amax(sf)
bdw = np.amax(dw)
bdf = np.amax(df)

  'precision', 'predicted', average, warn_for)


In [31]:
nbb = list(nbb)
sw = list(sw)
sf = list(sf)
dw = list(dw)
df = list(df)

In [32]:
print("Validation f-measure")
print("Best Bernoulli Naive Bayes validation f-measure:", bnbb, " Obtained at alpha = ", param[nbb.index(bnbb)])
print("Best Gaussian Naive Bayes validation f-measure:", bnbg)
print("Best SVM Bag of words validation f-measure:", bsw, " Obtained at C = ", param[sw.index(bsw)])
print("Best SVM Bag of frequency validation f-measure:", bsf, " Obtained at C = ", param[sf.index(bsf)])
print("Best Decision Tree Bag of words validation f-measure:", bdw, " Obtained at max depth of ", param[dw.index(bdw)])
print("Best Decision Tree Bag of frequency validation f-measure:", bdf, " Obtained at max depth of ", param[df.index(bdf)])

print("Training f-measure")
print("Bernoulli Naive Bayes training f-measure at best alpha:",f1_score(list(IT[1]), nbclf(iTw,list(IT[1]),param[nbb.index(bnbb)],iTw), average='weighted') )
print("Gaussian Naive Bayes training f-measure:",f1_score(list(IT[1]), ngclf(iTf,list(IT[1]),iTf), average='weighted'))
print("SVM Bag of words training f-measure at best C:",f1_score(list(IT[1]), sclf(iTw,list(IT[1]),param[sw.index(bsw)],iTw), average='weighted'))
print("SVM Bag of frequency training f-measure at best C:",f1_score(list(IT[1]), sclf(iTf,list(IT[1]),param[sf.index(bsf)],iTf), average='weighted'))
print("Decision Tree Bag of words training f-measure at best max depth:",f1_score(list(IT[1]), dclf(iTw,list(IT[1]),param[dw.index(bdw)],iTw), average='weighted'))
print("Decision Tree Bag of frequency training f-measure at best max depth:", f1_score(list(IT[1]), dclf(iTf,list(IT[1]),param[df.index(bdf)],iTf), average='weighted'))

print("Test f-measure")
print("Bernoulli Naive Bayes test f-measure at best alpha:",f1_score(list(ITe[1]), nbclf(iTw,list(IT[1]),param[i],iTew), average='weighted') )
print("Gaussian Naive Bayes test f-measure:",f1_score(list(ITe[1]), ngclf(iTf,list(IT[1]),iTef), average='weighted'))
print("SVM Bag of words test f-measure at best C:",f1_score(list(ITe[1]), sclf(iTw,list(IT[1]),param[sw.index(bsw)],iTew), average='weighted'))
print("SVM Bag of frequency test f-measure at best C:",f1_score(list(ITe[1]), sclf(iTf,list(IT[1]),param[sf.index(bsf)],iTef), average='weighted'))
print("Decision Tree Bag of words test f-measure at best max depth:",f1_score(list(ITe[1]), dclf(iTw,list(IT[1]),param[dw.index(bdw)],iTew), average='weighted'))
print("Decision Tree Bag of frequency test f-measure at best max depth:", f1_score(list(ITe[1]), dclf(iTf,list(IT[1]),param[df.index(bdf)],iTef), average='weighted'))

Validation f-measure
Best Bernoulli Naive Bayes validation f-measure: 0.8426650700722068  Obtained at alpha =  0.1
Best Gaussian Naive Bayes validation f-measure: 0.7513922734965905
Best SVM Bag of words validation f-measure: 0.8747957881303126  Obtained at C =  0.01
Best SVM Bag of frequency validation f-measure: 0.8785929875309597  Obtained at C =  100
Best Decision Tree Bag of words validation f-measure: 0.7105956664444624  Obtained at max depth of  10
Best Decision Tree Bag of frequency validation f-measure: 0.701489066096874  Obtained at max depth of  10
Training f-measure
Bernoulli Naive Bayes training f-measure at best alpha: 0.870294063564018
Gaussian Naive Bayes training f-measure: 0.8621070566038361
SVM Bag of words training f-measure at best C: 0.9632663360636912
SVM Bag of frequency training f-measure at best C: 0.9460661371559866
Decision Tree Bag of words training f-measure at best max depth: 0.7594146771119565
Decision Tree Bag of frequency training f-measure at best max