In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import joblib
import pickle
import time
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestClassifier

In [2]:
df=pd.read_pickle('Data/data_cleaned')

In [3]:
df

Unnamed: 0,score,text
0,3.0,"[good, helpfull, read, book, good, type, thats..."
1,1.0,"[sadly, overprice, irrelevant, spite, claim, i..."
2,2.0,"[endless, rant, howard, borrow, dennis, miller..."
3,1.0,"[not, quite, hip, really, shame, time, reserch..."
4,5.0,"[journey, centre, earth, hey, great, book, abs..."
...,...,...
49995,5.0,"[star, short, easy, explanation, follow, lot, ..."
49996,5.0,"[michael, hague, illustration, best, buy, love..."
49997,5.0,"[quotamerican, beautyquot, novel, dark, comedy..."
49998,4.0,"[funny, quirky, really, funny, witty, book, su..."


In [5]:
def split_train_test(df, test_size=0.2, shuffle_state=True):
    X_train, X_test, Y_train, Y_test = train_test_split(df['text'], 
                                                        df['score'], 
                                                        shuffle=shuffle_state,
                                                        test_size=test_size, 
                                                        random_state=15)
    print("Value counts for Train sentiments")
    print(Y_train.value_counts())
    print("Value counts for Test sentiments")
    print(Y_test.value_counts())
    print(type(X_train))
    print(type(Y_train))
    X_train = X_train.reset_index()
    X_test = X_test.reset_index()
    Y_train = Y_train.to_frame()
    Y_train = Y_train.reset_index()
    Y_test = Y_test.to_frame()
    Y_test = Y_test.reset_index()
    print(X_train.head())
    return X_train, X_test, Y_train, Y_test

In [6]:
X_train, X_test, y_train, y_test = split_train_test(df)

Value counts for Train sentiments
5.0    24226
4.0     7728
3.0     3422
1.0     2638
2.0     1986
Name: score, dtype: int64
Value counts for Test sentiments
5.0    6042
4.0    1963
3.0     854
1.0     639
2.0     502
Name: score, dtype: int64
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
   index                                               text
0  45570  [multidimensional, thought, book, good, job, p...
1  21855  [magnetic, influence, success, business, find,...
2  41127  [read, weep, world, full, bad, book, write, am...
3   4849  [clear, channel, watt, book, great, show, book...
4  20742  [page, miss, line, mass, market, paperback, ed...


In [7]:
X_train.to_pickle('Data/X_train')
X_test.to_pickle('Data/X_test')
y_train.to_pickle('Data/y_train')
y_test.to_pickle('Data/y_test')

In [6]:
X_train=pd.read_pickle('Data/X_train')
X_test=pd.read_pickle('Data/X_test')
y_train=pd.read_pickle('Data/y_train')
y_test=pd.read_pickle('Data/y_test')

In [9]:
vector_size=1000
#The number of dimensions of embeddings, the default is 100
window=3
#The maximum distance between a target word and its neighbours, default is 5
min_count=3
#The minimum counts of words in order for the word to be considered for the training of model, default is 5
workers=3
#The number of partitions during training, default is 3
sg=1
#The training algorithom, either CBOW(0) or skip gram(1), default is CBOW

#tokens = pd.Series(df['tokenized_text']).values
start_t = time.time()
w2vmodel=Word2Vec(X_train['text'],min_count=min_count,vector_size=vector_size,workers=workers,sg=sg)
print('Time taken to train word2vec model:' +str(time.time()-start_t))

Time taken to train word2vec model:116.67597889900208


In [10]:
filename='w2v_model.joblib'
joblib.dump(w2vmodel,filename)

['w2v_model.joblib']

In [7]:
w2vmodel=joblib.load('w2v_model.joblib')

In [8]:
word2vec_model_file = '/Users/xin/Library/CloudStorage/OneDrive-UniversityofBristol/DST/DST_Assessment2/Xin/Data/' + 'word2vec_' + '.model'
w2vmodel.save(word2vec_model_file)

In [9]:
w2v_model = Word2Vec.load(word2vec_model_file) #we have to save and load for some of the function below to work

In [10]:
w2v_model.wv.key_to_index

{'book': 0,
 'not': 1,
 'read': 2,
 'one': 3,
 'story': 4,
 'like': 5,
 'would': 6,
 'time': 7,
 'great': 8,
 'make': 9,
 'good': 10,
 'write': 11,
 'get': 12,
 'find': 13,
 'character': 14,
 'love': 15,
 'life': 16,
 'well': 17,
 'think': 18,
 'novel': 19,
 'first': 20,
 'many': 21,
 'much': 22,
 'know': 23,
 'work': 24,
 'people': 25,
 'take': 26,
 'also': 27,
 'way': 28,
 'give': 29,
 'even': 30,
 'really': 31,
 'could': 32,
 'year': 33,
 'say': 34,
 'author': 35,
 'want': 36,
 'see': 37,
 'world': 38,
 'end': 39,
 'reader': 40,
 'come': 41,
 'best': 42,
 'use': 43,
 'thing': 44,
 'go': 45,
 'new': 46,
 'look': 47,
 'little': 48,
 'page': 49,
 'never': 50,
 'seem': 51,
 'recommend': 52,
 'two': 53,
 'man': 54,
 'enjoy': 55,
 'still': 56,
 'tell': 57,
 'ever': 58,
 'child': 59,
 'part': 60,
 'feel': 61,
 'old': 62,
 'start': 63,
 'live': 64,
 'need': 65,
 'series': 66,
 'woman': 67,
 'every': 68,
 'try': 69,
 'must': 70,
 'put': 71,
 'back': 72,
 'history': 73,
 'help': 74,
 'point':

In [11]:
emb_df = (
    pd.DataFrame(
        [w2v_model.wv.get_vector(str(n)) for n in w2vmodel.wv.key_to_index],
        index = w2vmodel.wv.key_to_index
    )
)
print(emb_df.shape)

emb_df.head()

(41595, 1000)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
book,0.093191,0.017452,0.033683,0.163666,0.01923,-0.092321,0.107138,0.077283,0.01158,-0.05157,...,-0.10467,-0.083952,0.090864,-0.004854,0.125868,-0.046788,-0.04221,-0.113314,0.046125,0.017304
not,0.049468,0.058147,0.046993,0.039899,-0.090384,-0.056956,0.097288,0.028193,-0.0079,0.016054,...,0.028335,-0.059975,0.085465,0.098323,0.023309,-0.061541,-0.132283,-0.06079,-0.08497,-0.008647
read,0.062747,-0.082553,0.035017,0.248178,0.018828,-0.028716,0.053239,0.043143,-0.069352,0.036031,...,-0.068876,0.031968,0.130032,0.059487,0.113676,0.007758,-0.054621,-0.017854,0.044061,-0.073668
one,0.067632,0.057678,-0.024377,0.147654,-0.089571,-0.070021,0.079438,0.109028,-0.023408,0.111963,...,0.088833,-0.080763,0.078768,0.057274,0.041459,-0.0023,-0.085874,-0.046879,-0.0701,-0.023518
story,0.107059,-0.128003,-0.032419,0.046873,0.065347,-0.158554,0.062178,-0.002045,0.053513,-0.029064,...,-0.023479,0.018656,-0.031661,0.067833,0.147765,0.069098,-0.063918,-0.1787,-0.087061,0.004072


In [12]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index_to_key)
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model.wv[word])
    
    if nwords ==0:
        featureVec = np.zeros(1000)
        #avoiding divide by zero, define it as a all zero vector
    else:
        featureVec = np.divide(featureVec, nwords)
    # Dividing the result by number of words to get average

    return featureVec

In [13]:
# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Comment %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
            
        counter = counter+1
        
    return reviewFeatureVecs

In [15]:
trainDataVecs = getAvgFeatureVecs(X_train['text'], w2v_model, 1000)

Comment 0 of 40000
Comment 1000 of 40000
Comment 2000 of 40000
Comment 3000 of 40000
Comment 4000 of 40000
Comment 5000 of 40000
Comment 6000 of 40000
Comment 7000 of 40000
Comment 8000 of 40000
Comment 9000 of 40000
Comment 10000 of 40000
Comment 11000 of 40000
Comment 12000 of 40000
Comment 13000 of 40000
Comment 14000 of 40000
Comment 15000 of 40000
Comment 16000 of 40000
Comment 17000 of 40000
Comment 18000 of 40000
Comment 19000 of 40000
Comment 20000 of 40000
Comment 21000 of 40000
Comment 22000 of 40000
Comment 23000 of 40000
Comment 24000 of 40000
Comment 25000 of 40000
Comment 26000 of 40000
Comment 27000 of 40000
Comment 28000 of 40000
Comment 29000 of 40000
Comment 30000 of 40000
Comment 31000 of 40000
Comment 32000 of 40000
Comment 33000 of 40000
Comment 34000 of 40000
Comment 35000 of 40000
Comment 36000 of 40000
Comment 37000 of 40000
Comment 38000 of 40000
Comment 39000 of 40000


In [16]:
y_train.reset_index()

Unnamed: 0,level_0,index,score
0,0,45570,4.0
1,1,21855,5.0
2,2,41127,1.0
3,3,4849,5.0
4,4,20742,1.0
...,...,...,...
39995,39995,39296,5.0
39996,39996,49015,5.0
39997,39997,2693,5.0
39998,39998,8076,5.0


In [17]:
def remove_unuseful_rows(trainDataVecs,y_train):
    list=[]
    for i in range(trainDataVecs.shape[0]):
        if np.all(trainDataVecs[i,]==0):
            list.append(i)
    if len(list)!=0:
        trainDataVecs=np.delete(trainDataVecs,list,axis=0)
        y_train.drop(index=list,inplace=True)
    return trainDataVecs,y_train
# remove the sample with all zero vector and its corresponding y

In [18]:
trainDataVecs1,y_train1=remove_unuseful_rows(trainDataVecs,y_train)

In [28]:
y_train1.shape

(39999, 2)

In [30]:
trainDataVecs1.shape

(39999, 1000)

## Cross Validation

In [23]:
cv = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
forest1 = RandomForestClassifier(n_estimators = 10)
forest2 = RandomForestClassifier(n_estimators = 100)
forest3 = RandomForestClassifier(n_estimators = 500)
forest4 = RandomForestClassifier(n_estimators = 1000)

In [25]:
scores1 = cross_val_score(forest1, trainDataVecs1, y_train1['score'], scoring='roc_auc', cv=cv, n_jobs=-1)

Traceback (most recent call last):
  File "/Users/xin/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/xin/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/Users/xin/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 349, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported

Traceback (most recent call last):
  File "/Users/xin/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/xin/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/

In [26]:
scores1

array([nan, nan, nan, nan, nan])

In [None]:
start_time = time.time()
# Fit the model
forest2 = forest2.fit(trainDataVecs1, y_train1["score"])
print("Time taken to fit the random forest model 2 with word2vec vectors: " + str(time.time() - start_time))

## Test data set

In [None]:
vector_size=1000
window=3
min_count=3
workers=3
sg=1

start_t = time.time()
test_w2vmodel=Word2Vec(X_test['text'],min_count=min_count,vector_size=vector_size,workers=workers,sg=sg)
print('Time taken to test word2vec model:' +str(time.time()-start_t))

In [None]:
filename='test_w2v_model.joblib'
joblib.dump(test_w2vmodel,filename)

In [None]:
test_w2vmodel=joblib.load('test_w2v_model.joblib')

In [None]:
test_word2vec_model_file = '/Users/xin/Library/CloudStorage/OneDrive-UniversityofBristol/DST/DST_Assessment2/Xin/Data/' + 'test_word2vec_' + '.model'
test_w2vmodel.save(test_word2vec_model_file)

In [None]:
test_w2v_model = Word2Vec.load(test_word2vec_model_file)

In [None]:
test_emb_df = (
    pd.DataFrame(
        [test_w2v_model.wv.get_vector(str(n)) for n in test_w2vmodel.wv.key_to_index],
        index = test_w2vmodel.wv.key_to_index
    )
)
print(test_emb_df.shape)

test_emb_df.head()

In [None]:
testDataVecs = getAvgFeatureVecs(X_test['text'], test_w2v_model, 1000)

In [None]:
testDataVecs1,y_test1=remove_unuseful_rows(testDataVecs,y_test)

In [None]:
prediction=forest.predict(testDataVecs1)

In [None]:
len(prediction)

In [None]:
np.dot(prediction-y_test1['score'],prediction-y_test1['score'])

In [None]:
sum(prediction-y_test1['score']==0)