## 1 Set up Environment in Google Colab

Run the following cells to install/upgrade the required packages and check if the installed versions meet the requirements.

In [15]:
import pandas as pd
from sklearn import feature_selection
import copy as cp
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics

In [1]:
# make sure the required python packages are installed

# install nltk (we'll use 3.6.7 in Spring 2022)
!pip install nltk==3.6.7 --upgrade

# install spacy (we'll use 3.2.1 in Spring 2022)
!pip install spacy==3.2.1 --upgrade

# install scikit-learn (pinned to 0.24.2)
!pip install scikit-learn==0.24.2 --upgrade

# download the spacy en_core_web_sm model (3.2.0 version)
!python -m spacy download en_core_web_sm-3.2.0 --direct

Collecting scikit-learn==0.24.2
  Downloading scikit_learn-0.24.2-cp38-cp38-manylinux2010_x86_64.whl (24.9 MB)
[K     |████████████████████████████████| 24.9 MB 5.4 MB/s eta 0:00:01
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.1
    Uninstalling scikit-learn-0.24.1:
      Successfully uninstalled scikit-learn-0.24.1
Successfully installed scikit-learn-0.24.2
Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[K     |████████████████████████████████| 13.9 MB 4.4 MB/s eta 0:00:01    |███████████████▋                | 6.8 MB 1.5 MB/s eta 0:00:05


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## 2 Explore the Dataset by Pang et al. (2002)

Download the raw dataset at:
http://www.cs.cornell.edu/people/pabo/movie-review-data/mix20_rand700_tokens.zip

In [16]:
import spacy
import sklearn
import sklearn.metrics
from collections import Counter, OrderedDict
import numpy as np
from sklearn.feature_extraction import DictVectorizer

# Load the spaCy pipeline "en_core_web_sm" with the "parser" and "ner" components disabled.
# text2unigrams below only reads token.is_stop, token.is_punct and token.lemma_
# (presumably the two components are switched off for speed -- confirm).
nlp = spacy.load( "en_core_web_sm", disable=["parser", "ner"] )

# please use this function to get unigram features
def text2unigrams( rawtext, nlp ):
    """Turn raw text into a list of unigram features.

    ``nlp`` is called on ``rawtext`` and must yield tokens exposing
    ``is_stop``, ``is_punct`` and ``lemma_``.  Stop words and punctuation are
    replaced by the placeholder '[OOV]'; every other token contributes its
    lower-cased lemma.  The output has one entry per token.
    """
    unigrams = []
    for tok in nlp(rawtext):
        if tok.is_stop or tok.is_punct:
            unigrams.append('[OOV]')
        else:
            unigrams.append(tok.lemma_.lower())
    return unigrams

# please use this function to get bigram features
def unigrams2bigrams( unigrams ):
    """Build 'left_right' bigram features from consecutive unigrams.

    A pair is skipped when either member is the '[OOV]' placeholder.
    """
    bigrams = []
    for left, right in zip(unigrams, unigrams[1:]):
        if left == '[OOV]' or right == '[OOV]':
            continue
        bigrams.append(left + '_' + right)
    return bigrams

def unigramsplusbigrams( unigrams ):
    """Return the unigrams followed by their 'left_right' bigrams.

    The unigram list is copied unchanged (including '[OOV]' placeholders);
    bigrams are only formed from adjacent pairs in which neither member is
    '[OOV]'.  The input list is not modified.
    """
    combined = list(unigrams)
    for idx in range(1, len(unigrams)):
        prev, cur = unigrams[idx-1], unigrams[idx]
        if prev != '[OOV]' and cur != '[OOV]':
            combined.append(prev + '_' + cur)
    return combined



In [17]:
import pandas as pd

# Read the review table; index_col=0 makes the first CSV column the row index.
data = pd.read_csv( 'pang2002.csv', index_col=0 )

# Tokenise every review once, then derive both n-gram views from those token lists.
unigram_lists = [ text2unigrams(text, nlp) for text in data['text'] ]
data['unigrams'] = unigram_lists
data['bigrams'] = [ unigrams2bigrams(tokens) for tokens in unigram_lists ]
data['unibigrams'] = [ unigramsplusbigrams(tokens) for tokens in unigram_lists ]

data

Unnamed: 0,fold,label,text,unigrams,bigrams,unibigrams
cv004_tok-29856.txt,1,pos,"all great things come to an end , and the dot-...","[[OOV], great, thing, come, [OOV], [OOV], end,...","[great_thing, thing_come, com_era, era_embody,...","[[OOV], great, thing, come, [OOV], [OOV], end,..."
cv409_tok-11193.txt,2,pos,i'm not quite sure how best to go about writin...,"[[OOV], [OOV], [OOV], [OOV], sure, [OOV], good...","[little_disappointed, barry_levinson, politica...","[[OOV], [OOV], [OOV], [OOV], sure, [OOV], good..."
cv045_tok-29121.txt,1,pos,"the others ( 2001 ) nicole kidman , christophe...","[[OOV], [OOV], [OOV], 2001, [OOV], nicole, kid...","[nicole_kidman, christopher_eccleston, fionnul...","[[OOV], [OOV], [OOV], 2001, [OOV], nicole, kid..."
cv279_tok-15969.txt,2,pos,director : tony scott writer : david marconi s...,"[director, [OOV], tony, scott, writer, [OOV], ...","[tony_scott, scott_writer, david_marconi, marc...","[director, [OOV], tony, scott, writer, [OOV], ..."
cv387_tok-4672.txt,2,pos,one of the most entertaining james bond films ...,"[[OOV], [OOV], [OOV], [OOV], entertaining, jam...","[entertaining_james, james_bond, bond_film, ro...","[[OOV], [OOV], [OOV], [OOV], entertaining, jam..."
...,...,...,...,...,...,...
cv562_tok-26379.txt,3,neg,directed by : jan de bont written by : david s...,"[direct, [OOV], [OOV], jan, de, bont, write, [...","[jan_de, de_bont, bont_write, david_shelf, shi...","[direct, [OOV], [OOV], jan, de, bont, write, [..."
cv000_tok-9611.txt,1,neg,"tristar / 1 : 30 / 1997 / r ( language , viole...","[tristar, [OOV], 1, [OOV], 30, [OOV], 1997, [O...","[dennis_rodman, claude_van, van_damme, mickey_...","[tristar, [OOV], 1, [OOV], 30, [OOV], 1997, [O..."
cv571_tok-11568.txt,3,neg,director : michael caton-jones writer : chuck ...,"[director, [OOV], michael, caton, [OOV], jones...","[michael_caton, jones_writer, chuck_pfarrer, k...","[director, [OOV], michael, caton, [OOV], jones..."
cv210_tok-15092.txt,1,neg,wrongfully accused reviewed by jamie peck<hr>r...,"[wrongfully, accuse, review, [OOV], jamie, pec...","[wrongfully_accuse, accuse_review, jamie_peck,...","[wrongfully, accuse, review, [OOV], jamie, pec..."


In [18]:
#Settings 1
##split the data into its three folds (column `fold` holds 1, 2 or 3)
fold1, fold2, fold3 = ( data[data.fold == fold_id] for fold_id in (1, 2, 3) )
print(len(fold1)," ",len(fold2)," ",len(fold3))

def prepareDataset(data, dict, updateDict,TEXT):
    """Vectorise the token lists in column ``TEXT`` and return them with the labels.

    data:       table indexable by column name; must provide ``TEXT`` and 'label'.
    dict:       vectorizer exposing ``fit_transform`` / ``transform``
                (``accuracy`` passes a DictVectorizer).
    updateDict: truthy -> fit the vectorizer on this data (``fit_transform``);
                falsy  -> reuse its existing vocabulary (``transform``).
    TEXT:       name of the column holding one token list per row.

    Returns ``(X, Y)``: the vectorizer output for the per-row token counts and
    the labels as a numpy array.
    """
    # one bag-of-words Counter per row, produced lazily
    bags = ( Counter(tokens) for tokens in data[TEXT] )
    if updateDict:
        X = dict.fit_transform( bags )
    else:
        X = dict.transform( bags )
    Y = np.array(data['label'])
    return X, Y


def select_dict( dict, top, X, Y ):
    """Keep the ``top`` best features of ``X`` according to a chi-square test.

    dict: fitted vectorizer whose columns correspond to the columns of ``X``.
    top:  number of features to keep; capped at the number of columns of ``X``.
    X, Y: feature matrix and labels used to score the features.

    Returns ``(dict_selected, X_selected)``: a deep copy of ``dict`` restricted
    to the selected columns, and ``X`` reduced to those columns.  The vectorizer
    passed in is not modified, because ``restrict`` is applied to the copy.
    """
    n_keep = min( top, X.shape[1] )
    selector = feature_selection.SelectKBest( score_func = feature_selection.chi2, k = n_keep )
    X_selected = selector.fit_transform( X, Y )
    keep_mask = selector.get_support()
    vectorizer_copy = cp.deepcopy(dict)
    dict_selected = vectorizer_copy.restrict( keep_mask )
    return dict_selected, X_selected




def accuracy(trn,test,label,freq,n):
    """Train multinomial Naive Bayes on ``trn`` and return its accuracy on ``test``.

    trn, test: tables accepted by ``prepareDataset`` (need ``label`` and 'label' columns).
    label:     name of the token-list column used as features (passed on as ``TEXT``).
    freq:      0 -> counts are binarised to presence/absence before training and
               prediction; any other value keeps the raw counts.
    n:         number of chi-square-selected features, converted with ``int``.

    The vocabulary and the feature selection are fitted on ``trn`` only and then
    applied to ``test``.
    """
    vectorizer = DictVectorizer()
    train_X, train_Y = prepareDataset(trn, vectorizer, updateDict=True, TEXT=label)
    vectorizer_top, train_X_top = select_dict( vectorizer, int(n), train_X, train_Y )
    test_X_top, test_Y = prepareDataset( test, vectorizer_top, updateDict=False, TEXT=label )
    if freq == 0:
        # presence features: any non-zero count becomes 1
        train_X_top = train_X_top.astype(bool).astype(int)
        test_X_top = test_X_top.astype(bool).astype(int)
    model = sklearn.naive_bayes.MultinomialNB()
    model.fit( train_X_top, train_Y )
    predicted_Y = model.predict(test_X_top)
    return sklearn.metrics.accuracy_score( test_Y, predicted_Y )

##Settings 1: train on two folds, test on the remaining fold
# Each (column, freq) pair is one row of the result table: freq=1 keeps the raw
# counts, freq=0 makes accuracy() binarise them to presence/absence.
configs = [('unigrams',1), ('unigrams',0), ('bigrams',1), ('bigrams',0), ('unibigrams',1), ('unibigrams',0)]
top_k = 16165  # number of chi-square-selected features requested from accuracy()

# (folds to train on, fold to test on)
splits = [
    ((fold2, fold3), fold1),  ##train fold(2,3), test fold(1)
    ((fold1, fold3), fold2),  ##train fold(1,3), test fold(2)
    ((fold1, fold2), fold3),  ##train fold(1,2), test fold(3)
]

# Sum the accuracy of every configuration over the three splits.
totals = [0] * len(configs)
for train_folds, ts in splits:
    # pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
    # and removed in pandas 2.0; the resulting frame is the same.
    tr = pd.concat(list(train_folds), ignore_index=True)
    for row, (column, freq) in enumerate(configs):
        totals[row] += accuracy(tr, ts, column, freq, top_k)
uf, up, bf, bp, ubf, ubp = totals
print(uf/3," ",up/3," ",bf/3," ",bp/3," ",ubf/3," ",ubp/3)

# Result table: the hard-coded 'Accuracy' column (presumably the reference values
# to reproduce -- confirm) next to the fold-averaged accuracies computed above.
feature = ["Unigrams", "Unigrams", "Bigrams", "Bigrams", "Unigrams+Bigrams", "Unigrams+Bigrams"]
fp = ["Fres", "Pres", "Fres", "Pres", "Fres", "Pres"]
accu = [0.7728, 0.7893, 0.7036, 0.6885, 0.7742, 0.7971]
myacc = [uf/3, up/3, bf/3, bp/3, ubf/3, ubp/3]

final = pd.DataFrame({
    'Features': feature,
    'Fres./Pres.': fp,
    'Accuracy': accu,
    'My Accuracy': myacc,
})
print(final)

466   466   468
0.7685399166085861   0.7956788085543449   0.7035508602032207   0.6971130919628773   0.7756838218211609   0.797833901911155
           Features Fres./Pres.  Accuracy  My Accuracy
0          Unigrams        Fres    0.7728     0.768540
1          Unigrams        Pres    0.7893     0.795679
2           Bigrams        Fres    0.7036     0.703551
3           Bigrams        Pres    0.6885     0.697113
4  Unigrams+Bigrams        Fres    0.7742     0.775684
5  Unigrams+Bigrams        Pres    0.7971     0.797834


In [19]:
#Settings 2
##split the data into its three folds (column `fold` holds 1, 2 or 3)
fold1, fold2, fold3 = ( data[data.fold == fold_id] for fold_id in (1, 2, 3) )
print(len(fold1)," ",len(fold2)," ",len(fold3))

def prepareDataset(data, dict, updateDict,TEXT):
    """Vectorise the token lists in column ``TEXT`` and return them with the labels.

    data:       table indexable by column name; must provide ``TEXT`` and 'label'.
    dict:       vectorizer exposing ``fit_transform`` / ``transform``
                (``accuracy`` passes a DictVectorizer).
    updateDict: truthy -> fit the vectorizer on this data (``fit_transform``);
                falsy  -> reuse its existing vocabulary (``transform``).
    TEXT:       name of the column holding one token list per row.

    Returns ``(X, Y)``: the vectorizer output for the per-row token counts and
    the labels as a numpy array.
    """
    # one bag-of-words Counter per row, produced lazily
    bags = ( Counter(tokens) for tokens in data[TEXT] )
    if updateDict:
        X = dict.fit_transform( bags )
    else:
        X = dict.transform( bags )
    Y = np.array(data['label'])
    return X, Y


def select_dict( dict, top, X, Y ):
    """Keep the ``top`` best features of ``X`` according to a chi-square test.

    dict: fitted vectorizer whose columns correspond to the columns of ``X``.
    top:  number of features to keep; capped at the number of columns of ``X``.
    X, Y: feature matrix and labels used to score the features.

    Returns ``(dict_selected, X_selected)``: a deep copy of ``dict`` restricted
    to the selected columns, and ``X`` reduced to those columns.  The vectorizer
    passed in is not modified, because ``restrict`` is applied to the copy.
    """
    n_keep = min( top, X.shape[1] )
    selector = feature_selection.SelectKBest( score_func = feature_selection.chi2, k = n_keep )
    X_selected = selector.fit_transform( X, Y )
    keep_mask = selector.get_support()
    vectorizer_copy = cp.deepcopy(dict)
    dict_selected = vectorizer_copy.restrict( keep_mask )
    return dict_selected, X_selected



def accuracy(trn,test,label,freq,n):
    """Train multinomial Naive Bayes on ``trn`` and return its accuracy on ``test``.

    trn, test: tables accepted by ``prepareDataset`` (need ``label`` and 'label' columns).
    label:     name of the token-list column used as features (passed on as ``TEXT``).
    freq:      0 -> counts are binarised to presence/absence before training and
               prediction; any other value keeps the raw counts.
    n:         number of chi-square-selected features, converted with ``int``.

    The vocabulary and the feature selection are fitted on ``trn`` only and then
    applied to ``test``.
    """
    vectorizer = DictVectorizer()
    train_X, train_Y = prepareDataset(trn, vectorizer, updateDict=True, TEXT=label)
    vectorizer_top, train_X_top = select_dict( vectorizer, int(n), train_X, train_Y )
    test_X_top, test_Y = prepareDataset( test, vectorizer_top, updateDict=False, TEXT=label )
    if freq == 0:
        # presence features: any non-zero count becomes 1
        train_X_top = train_X_top.astype(bool).astype(int)
        test_X_top = test_X_top.astype(bool).astype(int)
    model = sklearn.naive_bayes.MultinomialNB()
    model.fit( train_X_top, train_Y )
    predicted_Y = model.predict(test_X_top)
    return sklearn.metrics.accuracy_score( test_Y, predicted_Y )

##Settings 2: train on two folds, test on one of the folds that was trained on
# Each (column, freq) pair is one row of the result table: freq=1 keeps the raw
# counts, freq=0 makes accuracy() binarise them to presence/absence.
configs = [('unigrams',1), ('unigrams',0), ('bigrams',1), ('bigrams',0), ('unibigrams',1), ('unibigrams',0)]
top_k = 16165  # number of chi-square-selected features requested from accuracy()

# (folds to train on, fold to test on)
splits = [
    ((fold1, fold2), fold1),  ##train fold(1,2), test fold(1)
    ((fold2, fold3), fold2),  ##train fold(2,3), test fold(2)
    ((fold1, fold3), fold3),  ##train fold(1,3), test fold(3)
]

# Sum the accuracy of every configuration over the three splits.
totals = [0] * len(configs)
for train_folds, ts in splits:
    # pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
    # and removed in pandas 2.0; the resulting frame is the same.
    tr = pd.concat(list(train_folds), ignore_index=True)
    for row, (column, freq) in enumerate(configs):
        totals[row] += accuracy(tr, ts, column, freq, top_k)
uf, up, bf, bp, ubf, ubp = totals

# Result table: the hard-coded 'Accuracy' column (presumably the reference values
# to reproduce -- confirm) next to the fold-averaged accuracies computed above.
feature = ["Unigrams", "Unigrams", "Bigrams", "Bigrams", "Unigrams+Bigrams", "Unigrams+Bigrams"]
fp = ["Fres", "Pres", "Fres", "Pres", "Fres", "Pres"]
accu = [0.9900, 0.9971, 0.9979, 0.9986, 0.9964, 0.9993]
myacc = [uf/3, up/3, bf/3, bp/3, ubf/3, ubp/3]

final = pd.DataFrame({
    'Features': feature,
    'Fres./Pres.': fp,
    'Accuracy': accu,
    'My Accuracy': myacc,
})
print(final)

466   466   468
           Features Fres./Pres.  Accuracy  My Accuracy
0          Unigrams        Fres    0.9900     0.989995
1          Unigrams        Pres    0.9971     0.997142
2           Bigrams        Fres    0.9979     0.997857
3           Bigrams        Pres    0.9986     0.998569
4  Unigrams+Bigrams        Fres    0.9964     0.992135
5  Unigrams+Bigrams        Pres    0.9993     0.996427


In [20]:
#Settings 3
##split the data into its three folds (column `fold` holds 1, 2 or 3)
fold1, fold2, fold3 = ( data[data.fold == fold_id] for fold_id in (1, 2, 3) )
print(len(fold1)," ",len(fold2)," ",len(fold3))

def prepareDataset(data, dict, updateDict,TEXT):
    """Vectorise the token lists in column ``TEXT`` and return them with the labels.

    data:       table indexable by column name; must provide ``TEXT`` and 'label'.
    dict:       vectorizer exposing ``fit_transform`` / ``transform``
                (``accuracy`` passes a DictVectorizer).
    updateDict: truthy -> fit the vectorizer on this data (``fit_transform``);
                falsy  -> reuse its existing vocabulary (``transform``).
    TEXT:       name of the column holding one token list per row.

    Returns ``(X, Y)``: the vectorizer output for the per-row token counts and
    the labels as a numpy array.
    """
    # one bag-of-words Counter per row, produced lazily
    bags = ( Counter(tokens) for tokens in data[TEXT] )
    if updateDict:
        X = dict.fit_transform( bags )
    else:
        X = dict.transform( bags )
    Y = np.array(data['label'])
    return X, Y


def select_dict( dict, top, X, Y ):
    """Keep the ``top`` best features of ``X`` according to a chi-square test.

    dict: fitted vectorizer whose columns correspond to the columns of ``X``.
    top:  number of features to keep; capped at the number of columns of ``X``.
    X, Y: feature matrix and labels used to score the features.

    Returns ``(dict_selected, X_selected)``: a deep copy of ``dict`` restricted
    to the selected columns, and ``X`` reduced to those columns.  The vectorizer
    passed in is not modified, because ``restrict`` is applied to the copy.
    """
    n_keep = min( top, X.shape[1] )
    selector = feature_selection.SelectKBest( score_func = feature_selection.chi2, k = n_keep )
    X_selected = selector.fit_transform( X, Y )
    keep_mask = selector.get_support()
    vectorizer_copy = cp.deepcopy(dict)
    dict_selected = vectorizer_copy.restrict( keep_mask )
    return dict_selected, X_selected



def accuracy(trn,test,label,freq,n):
    """Train multinomial Naive Bayes on ``trn`` and return its accuracy on ``test``.

    trn, test: tables accepted by ``prepareDataset`` (need ``label`` and 'label' columns).
    label:     name of the token-list column used as features (passed on as ``TEXT``).
    freq:      0 -> counts are binarised to presence/absence before training and
               prediction; any other value keeps the raw counts.
    n:         number of chi-square-selected features, converted with ``int``.

    The vocabulary and the feature selection are fitted on ``trn`` only and then
    applied to ``test``.
    """
    vectorizer = DictVectorizer()
    train_X, train_Y = prepareDataset(trn, vectorizer, updateDict=True, TEXT=label)
    vectorizer_top, train_X_top = select_dict( vectorizer, int(n), train_X, train_Y )
    test_X_top, test_Y = prepareDataset( test, vectorizer_top, updateDict=False, TEXT=label )
    if freq == 0:
        # presence features: any non-zero count becomes 1
        train_X_top = train_X_top.astype(bool).astype(int)
        test_X_top = test_X_top.astype(bool).astype(int)
    model = sklearn.naive_bayes.MultinomialNB()
    model.fit( train_X_top, train_Y )
    predicted_Y = model.predict(test_X_top)
    return sklearn.metrics.accuracy_score( test_Y, predicted_Y )

##Settings 3: train on one fold and test on that same fold
# Each (column, freq) pair is one row of the result table: freq=1 keeps the raw
# counts, freq=0 makes accuracy() binarise them to presence/absence.
configs = [('unigrams',1), ('unigrams',0), ('bigrams',1), ('bigrams',0), ('unibigrams',1), ('unibigrams',0)]
top_k = 16165  # number of chi-square-selected features requested from accuracy()

# (train fold, test fold)
splits = [
    (fold1, fold1),  ##train fold(1), test fold(1)
    (fold2, fold2),  ##train fold(2), test fold(2)
    (fold3, fold3),  ##train fold(3), test fold(3)
]

# Sum the accuracy of every configuration over the three splits.
totals = [0] * len(configs)
for tr, ts in splits:
    for row, (column, freq) in enumerate(configs):
        totals[row] += accuracy(tr, ts, column, freq, top_k)
uf, up, bf, bp, ubf, ubp = totals

# Result table: the hard-coded 'Accuracy' column (presumably the reference values
# to reproduce -- confirm) next to the fold-averaged accuracies computed above.
feature = ["Unigrams", "Unigrams", "Bigrams", "Bigrams", "Unigrams+Bigrams", "Unigrams+Bigrams"]
fp = ["Fres", "Pres", "Fres", "Pres", "Fres", "Pres"]
accu = [0.9979, 0.9993, 1.0000, 1.0000, 0.9993, 1.0000]
myacc = [uf/3, up/3, bf/3, bp/3, ubf/3, ubp/3]

final = pd.DataFrame({
    'Features': feature,
    'Fres./Pres.': fp,
    'Accuracy': accu,
    'My Accuracy': myacc,
})
print(final)

466   466   468
           Features Fres./Pres.  Accuracy  My Accuracy
0          Unigrams        Fres    0.9979     0.997142
1          Unigrams        Pres    0.9993     0.999288
2           Bigrams        Fres    1.0000     1.000000
3           Bigrams        Pres    1.0000     1.000000
4  Unigrams+Bigrams        Fres    0.9993     0.998569
5  Unigrams+Bigrams        Pres    1.0000     0.999285


In [21]:
#Settings 4
##split the data into its three folds (column `fold` holds 1, 2 or 3)
fold1, fold2, fold3 = ( data[data.fold == fold_id] for fold_id in (1, 2, 3) )
print(len(fold1)," ",len(fold2)," ",len(fold3))

def prepareDataset(data, dict, updateDict,TEXT):
    """Vectorise the token lists in column ``TEXT`` and return them with the labels.

    data:       table indexable by column name; must provide ``TEXT`` and 'label'.
    dict:       vectorizer exposing ``fit_transform`` / ``transform``
                (``accuracy`` passes a DictVectorizer).
    updateDict: truthy -> fit the vectorizer on this data (``fit_transform``);
                falsy  -> reuse its existing vocabulary (``transform``).
    TEXT:       name of the column holding one token list per row.

    Returns ``(X, Y)``: the vectorizer output for the per-row token counts and
    the labels as a numpy array.
    """
    # one bag-of-words Counter per row, produced lazily
    bags = ( Counter(tokens) for tokens in data[TEXT] )
    if updateDict:
        X = dict.fit_transform( bags )
    else:
        X = dict.transform( bags )
    Y = np.array(data['label'])
    return X, Y


def select_dict( dict, top, X, Y ):
    """Keep the ``top`` best features of ``X`` according to a chi-square test.

    dict: fitted vectorizer whose columns correspond to the columns of ``X``.
    top:  number of features to keep; capped at the number of columns of ``X``.
    X, Y: feature matrix and labels used to score the features.

    Returns ``(dict_selected, X_selected)``: a deep copy of ``dict`` restricted
    to the selected columns, and ``X`` reduced to those columns.  The vectorizer
    passed in is not modified, because ``restrict`` is applied to the copy.
    """
    n_keep = min( top, X.shape[1] )
    selector = feature_selection.SelectKBest( score_func = feature_selection.chi2, k = n_keep )
    X_selected = selector.fit_transform( X, Y )
    keep_mask = selector.get_support()
    vectorizer_copy = cp.deepcopy(dict)
    dict_selected = vectorizer_copy.restrict( keep_mask )
    return dict_selected, X_selected



def accuracy(trn,test,label,freq,n):
    """Train multinomial Naive Bayes on ``trn`` and return its accuracy on ``test``.

    trn, test: tables accepted by ``prepareDataset`` (need ``label`` and 'label' columns).
    label:     name of the token-list column used as features (passed on as ``TEXT``).
    freq:      0 -> counts are binarised to presence/absence before training and
               prediction; any other value keeps the raw counts.
    n:         number of chi-square-selected features, converted with ``int``.

    The vocabulary and the feature selection are fitted on ``trn`` only and then
    applied to ``test``.
    """
    vectorizer = DictVectorizer()
    train_X, train_Y = prepareDataset(trn, vectorizer, updateDict=True, TEXT=label)
    vectorizer_top, train_X_top = select_dict( vectorizer, int(n), train_X, train_Y )
    test_X_top, test_Y = prepareDataset( test, vectorizer_top, updateDict=False, TEXT=label )
    if freq == 0:
        # presence features: any non-zero count becomes 1
        train_X_top = train_X_top.astype(bool).astype(int)
        test_X_top = test_X_top.astype(bool).astype(int)
    model = sklearn.naive_bayes.MultinomialNB()
    model.fit( train_X_top, train_Y )
    predicted_Y = model.predict(test_X_top)
    return sklearn.metrics.accuracy_score( test_Y, predicted_Y )

##Settings 4: train on a single fold, test on a different fold
# Each (column, freq) pair is one row of the result table: freq=1 keeps the raw
# counts, freq=0 makes accuracy() binarise them to presence/absence.
configs = [('unigrams',1), ('unigrams',0), ('bigrams',1), ('bigrams',0), ('unibigrams',1), ('unibigrams',0)]
top_k = 16165  # number of chi-square-selected features requested from accuracy()

# (train fold, test fold)
splits = [
    (fold2, fold1),  ##train fold(2), test fold(1)
    (fold3, fold2),  ##train fold(3), test fold(2)
    (fold1, fold3),  ##train fold(1), test fold(3)
]

# Sum the accuracy of every configuration over the three splits.
totals = [0] * len(configs)
for tr, ts in splits:
    for row, (column, freq) in enumerate(configs):
        totals[row] += accuracy(tr, ts, column, freq, top_k)
uf, up, bf, bp, ubf, ubp = totals
print(uf/3," ",up/3," ",bf/3," ",bp/3," ",ubf/3," ",ubp/3)

# Result table: the hard-coded 'Accuracy' column (presumably the reference values
# to reproduce -- confirm) next to the fold-averaged accuracies computed above.
feature = ["Unigrams", "Unigrams", "Bigrams", "Bigrams", "Unigrams+Bigrams", "Unigrams+Bigrams"]
fp = ["Fres", "Pres", "Fres", "Pres", "Fres", "Pres"]
accu = [0.7307, 0.7714, 0.6485, 0.6393, 0.7321, 0.7557]
myacc = [uf/3, up/3, bf/3, bp/3, ubf/3, ubp/3]

final = pd.DataFrame({
    'Features': feature,
    'Fres./Pres.': fp,
    'Accuracy': accu,
    'My Accuracy': myacc,
})
print(final)

466   466   468
0.736409155937053   0.7692766222809141   0.6485485981194136   0.6499547583238571   0.73140505973124   0.7471326559309391
           Features Fres./Pres.  Accuracy  My Accuracy
0          Unigrams        Fres    0.7307     0.736409
1          Unigrams        Pres    0.7714     0.769277
2           Bigrams        Fres    0.6485     0.648549
3           Bigrams        Pres    0.6393     0.649955
4  Unigrams+Bigrams        Fres    0.7321     0.731405
5  Unigrams+Bigrams        Pres    0.7557     0.747133


## 4 Implement P2

In [22]:
#Settings 5
# split the data into its three folds (column `fold` holds 1, 2 or 3)
fold1, fold2, fold3 = ( data[data.fold == fold_id] for fold_id in (1, 2, 3) )

def prepareDataset(data, dict, updateDict,TEXT):
    """Vectorise the token lists in column ``TEXT`` and return them with the labels.

    data:       table indexable by column name; must provide ``TEXT`` and 'label'.
    dict:       vectorizer exposing ``fit_transform`` / ``transform``
                (``accuracy`` passes a DictVectorizer).
    updateDict: truthy -> fit the vectorizer on this data (``fit_transform``);
                falsy  -> reuse its existing vocabulary (``transform``).
    TEXT:       name of the column holding one token list per row.

    Returns ``(X, Y)``: the vectorizer output for the per-row token counts and
    the labels as a numpy array.
    """
    # one bag-of-words Counter per row, produced lazily
    bags = ( Counter(tokens) for tokens in data[TEXT] )
    if updateDict:
        X = dict.fit_transform( bags )
    else:
        X = dict.transform( bags )
    Y = np.array(data['label'])
    return X, Y


def select_dict( dict, top, X, Y ):
    """Keep the ``top`` best features of ``X`` according to a chi-square test.

    dict: fitted vectorizer whose columns correspond to the columns of ``X``.
    top:  number of features to keep; capped at the number of columns of ``X``.
    X, Y: feature matrix and labels used to score the features.

    Returns ``(dict_selected, X_selected)``: a deep copy of ``dict`` restricted
    to the selected columns, and ``X`` reduced to those columns.  The vectorizer
    passed in is not modified, because ``restrict`` is applied to the copy.
    """
    n_keep = min( top, X.shape[1] )
    selector = feature_selection.SelectKBest( score_func = feature_selection.chi2, k = n_keep )
    X_selected = selector.fit_transform( X, Y )
    keep_mask = selector.get_support()
    vectorizer_copy = cp.deepcopy(dict)
    dict_selected = vectorizer_copy.restrict( keep_mask )
    return dict_selected, X_selected


def accuracy(trn,test,label,freq,n):
    """Train multinomial Naive Bayes on ``trn`` and return its accuracy on ``test``.

    trn, test: tables accepted by ``prepareDataset`` (need ``label`` and 'label' columns).
    label:     name of the token-list column used as features (passed on as ``TEXT``).
    freq:      0 -> counts are binarised to presence/absence before training and
               prediction; any other value keeps the raw counts.
    n:         number of chi-square-selected features, converted with ``int``.

    The vocabulary and the feature selection are fitted on ``trn`` only and then
    applied to ``test``.
    """
    vectorizer = DictVectorizer()
    train_X, train_Y = prepareDataset(trn, vectorizer, updateDict=True, TEXT=label)
    vectorizer_top, train_X_top = select_dict( vectorizer, int(n), train_X, train_Y )
    test_X_top, test_Y = prepareDataset( test, vectorizer_top, updateDict=False, TEXT=label )
    if freq == 0:
        # presence features: any non-zero count becomes 1
        train_X_top = train_X_top.astype(bool).astype(int)
        test_X_top = test_X_top.astype(bool).astype(int)
    model = sklearn.naive_bayes.MultinomialNB()
    model.fit( train_X_top, train_Y )
    predicted_Y = model.predict(test_X_top)
    return sklearn.metrics.accuracy_score( test_Y, predicted_Y )

def findk(tr,vl,label,fres,k_min=2000,k_max=20000,k_step=2000):
    """Pick the feature count ``k`` with the best validation accuracy.

    For every ``k`` in ``k_min, k_min + k_step, ...`` up to and including
    ``k_max``, a model is trained on ``tr`` and scored on ``vl`` through
    ``accuracy(tr, vl, label, fres, k)``.  The defaults reproduce the grid that
    used to be hard-coded (2000 to 20000 in steps of 2000).

    tr, vl: training and validation tables.
    label:  name of the feature column, forwarded to ``accuracy``.
    fres:   frequency/presence flag, forwarded to ``accuracy`` as ``freq``.
    k_min, k_max, k_step: integer bounds and step of the search grid.

    Returns the best ``k``.  Ties go to the largest ``k`` because a later
    candidate replaces the current best when its accuracy is equal (``>=``).
    If the grid is empty (``k_min > k_max``) ``k_min`` is returned.

    Raises ValueError if ``k_step`` is not positive (the search would never end).
    """
    if k_step <= 0:
        raise ValueError("k_step must be positive")
    best_k, best_acc = k_min, 0.00
    for k in range(k_min, k_max + 1, k_step):
        acc = accuracy(tr,vl,label,fres,k)
        if acc >= best_acc:
            best_k, best_acc = k, acc
    return best_k


# Settings 5: one fold each for training, testing and validation; k is tuned on
# the validation fold with findk() and the model is then scored on the test fold.
# Each (column, fres) pair is one row of the result table: fres=1 keeps the raw
# counts, fres=0 makes accuracy() binarise them to presence/absence.
configs = [("unigrams",1), ("unigrams",0), ("bigrams",1), ("bigrams",0), ("unibigrams",1), ("unibigrams",0)]

# (train fold, test fold, validation fold) for each of the three rounds
rounds = [
    (fold2, fold1, fold3),
    (fold3, fold2, fold1),
    (fold1, fold3, fold2),
]

# Sum the test accuracy of every configuration over the three rounds.
totals = [0] * len(configs)
for tr, ts, vl in rounds:
    for row, (column, fres) in enumerate(configs):
        k = findk(tr, vl, column, fres)
        totals[row] += accuracy(tr, ts, column, fres, k)
uf, up, bf, bp, ubf, ubp = totals

# Result table: the hard-coded 'Accuracy' column (presumably the reference values
# to reproduce -- confirm) next to the round-averaged accuracies computed above.
feature = ["Unigrams", "Unigrams", "Bigrams", "Bigrams", "Unigrams+Bigrams", "Unigrams+Bigrams"]
fp = ["Fres", "Pres", "Fres", "Pres", "Fres", "Pres"]
accu = [0.7343, 0.7671, 0.6414, 0.6443, 0.7371, 0.7572]
myacc = [uf/3, up/3, bf/3, bp/3, ubf/3, ubp/3]

final = pd.DataFrame({
    'Features': feature,
    'Fres./Pres.': fp,
    'Accuracy': accu,
    'My Accuracy': myacc,
})
print(final)

           Features Fres./Pres.  Accuracy  My Accuracy
0          Unigrams        Fres    0.7343     0.737828
1          Unigrams        Pres    0.7671     0.754992
2           Bigrams        Fres    0.6414     0.648549
3           Bigrams        Pres    0.6443     0.645675
4  Unigrams+Bigrams        Fres    0.7371     0.735697
5  Unigrams+Bigrams        Pres    0.7572     0.765003


In [23]:
#Settings 6
# split the data into its three folds (column `fold` holds 1, 2 or 3)
fold1, fold2, fold3 = ( data[data.fold == fold_id] for fold_id in (1, 2, 3) )

def prepareDataset(data, dict, updateDict,TEXT):
    """Vectorise the token lists in column ``TEXT`` and return them with the labels.

    data:       table indexable by column name; must provide ``TEXT`` and 'label'.
    dict:       vectorizer exposing ``fit_transform`` / ``transform``
                (``accuracy`` passes a DictVectorizer).
    updateDict: truthy -> fit the vectorizer on this data (``fit_transform``);
                falsy  -> reuse its existing vocabulary (``transform``).
    TEXT:       name of the column holding one token list per row.

    Returns ``(X, Y)``: the vectorizer output for the per-row token counts and
    the labels as a numpy array.
    """
    # one bag-of-words Counter per row, produced lazily
    bags = ( Counter(tokens) for tokens in data[TEXT] )
    if updateDict:
        X = dict.fit_transform( bags )
    else:
        X = dict.transform( bags )
    Y = np.array(data['label'])
    return X, Y


def select_dict( dict, top, X, Y ):
    """Keep the ``top`` best features of ``X`` according to a chi-square test.

    dict: fitted vectorizer whose columns correspond to the columns of ``X``.
    top:  number of features to keep; capped at the number of columns of ``X``.
    X, Y: feature matrix and labels used to score the features.

    Returns ``(dict_selected, X_selected)``: a deep copy of ``dict`` restricted
    to the selected columns, and ``X`` reduced to those columns.  The vectorizer
    passed in is not modified, because ``restrict`` is applied to the copy.
    """
    n_keep = min( top, X.shape[1] )
    selector = feature_selection.SelectKBest( score_func = feature_selection.chi2, k = n_keep )
    X_selected = selector.fit_transform( X, Y )
    keep_mask = selector.get_support()
    vectorizer_copy = cp.deepcopy(dict)
    dict_selected = vectorizer_copy.restrict( keep_mask )
    return dict_selected, X_selected


def accuracy(trn,test,label,freq,n):
    """Train multinomial Naive Bayes on ``trn`` and return its accuracy on ``test``.

    trn, test: tables accepted by ``prepareDataset`` (need ``label`` and 'label' columns).
    label:     name of the token-list column used as features (passed on as ``TEXT``).
    freq:      0 -> counts are binarised to presence/absence before training and
               prediction; any other value keeps the raw counts.
    n:         number of chi-square-selected features, converted with ``int``.

    The vocabulary and the feature selection are fitted on ``trn`` only and then
    applied to ``test``.
    """
    vectorizer = DictVectorizer()
    train_X, train_Y = prepareDataset(trn, vectorizer, updateDict=True, TEXT=label)
    vectorizer_top, train_X_top = select_dict( vectorizer, int(n), train_X, train_Y )
    test_X_top, test_Y = prepareDataset( test, vectorizer_top, updateDict=False, TEXT=label )
    if freq == 0:
        # presence features: any non-zero count becomes 1
        train_X_top = train_X_top.astype(bool).astype(int)
        test_X_top = test_X_top.astype(bool).astype(int)
    model = sklearn.naive_bayes.MultinomialNB()
    model.fit( train_X_top, train_Y )
    predicted_Y = model.predict(test_X_top)
    return sklearn.metrics.accuracy_score( test_Y, predicted_Y )

def findk(tr,vl,label,fres,k_min=2000,k_max=20000,k_step=2000):
    """Pick the feature count ``k`` with the best validation accuracy and print it.

    For every ``k`` in ``k_min, k_min + k_step, ...`` up to and including
    ``k_max``, a model is trained on ``tr`` and scored on ``vl`` through
    ``accuracy(tr, vl, label, fres, k)``.  The defaults reproduce the grid that
    used to be hard-coded (2000 to 20000 in steps of 2000).

    tr, vl: training and validation tables.
    label:  name of the feature column, forwarded to ``accuracy``.
    fres:   frequency/presence flag, forwarded to ``accuracy`` as ``freq``.
    k_min, k_max, k_step: integer bounds and step of the search grid.

    Returns the best ``k`` after printing it as "best k  <k>".  Ties go to the
    largest ``k`` because a later candidate replaces the current best when its
    accuracy is equal (``>=``).  If the grid is empty (``k_min > k_max``)
    ``k_min`` is returned.

    Raises ValueError if ``k_step`` is not positive (the search would never end).
    """
    if k_step <= 0:
        raise ValueError("k_step must be positive")
    best_k, best_acc = k_min, 0.00
    for k in range(k_min, k_max + 1, k_step):
        acc = accuracy(tr,vl,label,fres,k)
        if acc >= best_acc:
            best_k, best_acc = k, acc
    print("best k ",best_k)
    return best_k


# Settings 6: k is tuned with findk() on the same fold that is later used for
# testing (the validation fold equals the test fold in every round).
# Each (column, fres) pair is one row of the result table: fres=1 keeps the raw
# counts, fres=0 makes accuracy() binarise them to presence/absence.
configs = [("unigrams",1), ("unigrams",0), ("bigrams",1), ("bigrams",0), ("unibigrams",1), ("unibigrams",0)]

# (train fold, test fold, validation fold) for each of the three rounds
rounds = [
    (fold2, fold1, fold1),
    (fold3, fold2, fold2),
    (fold1, fold3, fold3),
]

# Sum the test accuracy of every configuration over the three rounds.
totals = [0] * len(configs)
for tr, ts, vl in rounds:
    for row, (column, fres) in enumerate(configs):
        k = findk(tr, vl, column, fres)
        totals[row] += accuracy(tr, ts, column, fres, k)
uf, up, bf, bp, ubf, ubp = totals

# Result table: the hard-coded 'Accuracy' column (presumably the reference values
# to reproduce -- confirm) next to the round-averaged accuracies computed above.
feature = ["Unigrams", "Unigrams", "Bigrams", "Bigrams", "Unigrams+Bigrams", "Unigrams+Bigrams"]
fp = ["Fres", "Pres", "Fres", "Pres", "Fres", "Pres"]
accu = [0.7428, 0.7764, 0.6521, 0.6471, 0.7400, 0.7700]
myacc = [uf/3, up/3, bf/3, bp/3, ubf/3, ubp/3]

final = pd.DataFrame({
    'Features': feature,
    'Fres./Pres.': fp,
    'Accuracy': accu,
    'My Accuracy': myacc,
})
print(final)

best k  10000
best k  20000
best k  20000
best k  20000
best k  4000
best k  6000
best k  20000
best k  10000
best k  16000
best k  20000
best k  20000
best k  12000
best k  4000
best k  8000
best k  18000
best k  16000
best k  8000
best k  8000
           Features Fres./Pres.  Accuracy  My Accuracy
0          Unigrams        Fres    0.7428     0.745696
1          Unigrams        Pres    0.7764     0.776421
2           Bigrams        Fres    0.6521     0.652125
3           Bigrams        Pres    0.6471     0.651388
4  Unigrams+Bigrams        Fres    0.7400     0.738546
5  Unigrams+Bigrams        Pres    0.7700     0.770713
