In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stpwords = stopwords.words('english')
import re
from gensim.models import Word2Vec
import os
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sandi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Reading all data and test ids
all_data = pd.read_csv('alldata.tsv', sep='\t')
testIDs = pd.read_csv('project3_splits.csv')

In [3]:
# Helper function for cleaning the text
def clean_corpus(text):
    '''
    INPUT
    text - string
    OUTPUT
    clean text
    This function processes the input using the following steps :
    1. Remove punctuation characters
    2. Remove stop words
    '''
    # Remove punctuation characters and numbers
    text = re.sub(r"[^a-zA-Z]", " ", text)
    
    # Tokenize text
    tokens = word_tokenize(text)
        
    clean_text = ''
    for word in tokens:
        clean_tok = word.lower().strip()
        if clean_tok not in stpwords:
            clean_text += f'{clean_tok} '

    return clean_text

In [4]:
# Applying the cleaning column on the dataset
all_data['clean_text'] = all_data['review'].apply(clean_corpus)

In [6]:
# # Creating test and train splits
# for j in range(5):
#     dir_str = "split_"
#     os.mkdir(dir_str+str(j+1))
    
#     train = all_data.loc[~all_data['id'].isin(list(testIDs.iloc[:,j])), ["id","sentiment","review"]]
#     test = all_data.loc[all_data['id'].isin(list(testIDs.iloc[:,j])), ["id","review"]]
#     test_y = all_data.loc[all_data['id'].isin(list(testIDs.iloc[:,j])), ["id","sentiment","score"]]
    
#     tmp_file_name1 = "split_" + str(j+1) +"/" + "train.csv"
#     train.to_csv(tmp_file_name1, index = False)
    
#     tmp_file_name2 = "split_" + str(j+1) +"/" + "test.csv"
#     test.to_csv(tmp_file_name2, index = False)
    
#     tmp_file_name3 = "split_" + str(j+1) +"/" + "test_y.csv"
#     test_y.to_csv(tmp_file_name3, index = False)

In [7]:
# Creating the Tfidf vectorizer
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

tfv.fit(list(all_data.review))

TfidfVectorizer(min_df=3, ngram_range=(1, 2), smooth_idf=1,
                stop_words='english', strip_accents='unicode', sublinear_tf=1,
                token_pattern='\\w{1,}', use_idf=1)

In [9]:
# Training a logistic model
for i in range(5):
    file_nm1 = 'split_'+str(i+1)+'/train.tsv'
    train = pd.read_csv(file_nm1, sep='\t')
    
    del train['id']
    train_y = train.sentiment
    xtrain = train.review
    
    file_nm2 = 'split_'+str(i+1)+'/test.tsv'
    test = pd.read_csv(file_nm2, sep='\t')
    del test['id']
    
    file_nm3 = 'split_'+str(i+1)+'/test_y.tsv'
    test_y = pd.read_csv(file_nm3, sep='\t')
    xtest = test.review
    
    # creating label encoder
    lbl_enc = preprocessing.LabelEncoder()
    y = lbl_enc.fit_transform(train.sentiment)

    xtrain_tfv =  tfv.transform(xtrain) 
    xtest_tfv = tfv.transform(xtest)

    clf = LogisticRegression(C=1.0)
    clf.fit(xtrain_tfv, y)
    predictions = clf.predict(xtest_tfv)

    print(str(i+1)," : ",roc_auc_score(predictions,test_y.sentiment))

1  :  0.8964805560894356
2  :  0.892965452098464
3  :  0.8950435582293231
4  :  0.8955396603688884
5  :  0.8950965361244289


In [73]:
# Training an xgboost model
for i in range(5):
    file_nm1 = 'split_'+str(i+1)+'/train.csv'
    train = pd.read_csv(file_nm1)
    
    del train['id']
    train_y = train.sentiment
    xtrain = train.review
    
    file_nm2 = 'split_'+str(i+1)+'/test.csv'
    test = pd.read_csv(file_nm2)
    del test['id']
    
    file_nm3 = 'split_'+str(i+1)+'/test_y.csv'
    test_y = pd.read_csv(file_nm3)
    xtest = test.review
    
    # creating label encoder
    lbl_enc = preprocessing.LabelEncoder()
    y = lbl_enc.fit_transform(train.sentiment)

    xtrain_tfv =  tfv.transform(xtrain) 
    xtest_tfv = tfv.transform(xtest)

    clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
    
    clf.fit(xtrain_tfv.tocsc(), y)
    predictions = clf.predict(xtest_tfv.tocsc())

    print(str(i+1)," : ",roc_auc_score(predictions,test_y.sentiment))



1  :  0.8597650975544413




2  :  0.8586560262223772




3  :  0.8631979561874471




4  :  0.8593713367427287




5  :  0.8606716078829169


In [24]:
clean_corpus(all_data['review'][0])

'naturally film main themes mortality nostalgia loss innocence perhaps surprising rated highly older viewers younger ones however craftsmanship completeness film anyone enjoy pace steady constant characters full engaging relationships interactions natural showing need floods tears show emotion screams show fear shouting show dispute violence show anger naturally joyce short story lends film ready made structure perfect polished diamond small changes huston makes inclusion poem fit neatly truly masterpiece tact subtlety overwhelming beauty '

In [27]:
cleaned_review = [clean_corpus(x) for x in all_data.review]

In [28]:
cleaned_review[0]

'naturally film main themes mortality nostalgia loss innocence perhaps surprising rated highly older viewers younger ones however craftsmanship completeness film anyone enjoy pace steady constant characters full engaging relationships interactions natural showing need floods tears show emotion screams show fear shouting show dispute violence show anger naturally joyce short story lends film ready made structure perfect polished diamond small changes huston makes inclusion poem fit neatly truly masterpiece tact subtlety overwhelming beauty '

In [29]:
all_data['cleaned_review'] = cleaned_review

In [36]:
all_words = [nltk.word_tokenize(x) for x in cleaned_review]

In [63]:
word2vec = Word2Vec(all_words, min_count=5, max_final_vocab=4000)

In [39]:
word2vec

<gensim.models.word2vec.Word2Vec at 0x19daed34dc0>

In [68]:
vocabulary = word2vec.wv.key_to_index

In [64]:
v1 = word2vec.wv['movie']
v1

array([ 2.58842558e-01, -8.44656348e-01, -4.87949133e-01, -9.40350354e-01,
        2.66864657e-01,  2.94531137e-02, -1.73610330e+00, -5.67825079e-01,
        8.08339417e-01, -1.69808459e+00,  1.03057063e+00,  9.62835923e-03,
       -2.43424967e-01, -4.38597918e-01, -6.84791803e-01,  1.76150572e+00,
        8.65396798e-01,  1.86191440e+00, -1.70171249e+00,  2.79220670e-01,
       -3.38357627e-01, -2.32526157e-02,  2.17913091e-01, -4.84722517e-02,
        1.33072913e+00, -9.22531426e-01,  9.30391371e-01,  1.24785292e+00,
        2.34947711e-01,  1.45575678e+00,  6.19320333e-01, -7.44338453e-01,
       -2.53127009e-01, -1.18179643e+00, -2.42393002e-01,  4.00296271e-01,
        1.19838119e+00,  7.86305845e-01,  1.48067081e+00,  2.49931979e+00,
        1.02869177e+00, -1.41337092e-04, -1.42204297e+00, -5.92005849e-01,
       -1.19590126e-01,  3.18989754e-01,  1.60031188e+00, -2.38928959e-01,
        4.66436893e-01,  4.89191979e-01,  7.68740416e-01,  6.45394504e-01,
        1.87487364e-01,  

In [65]:
v1.shape

(100,)

In [69]:
len(vocabulary)

3980

In [None]:
vocabulary