# Preprocessing Data

In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import re
import os
from sklearn.feature_extraction.text import TfidfVectorizer 

In [2]:
#Import training and test corpus
df_train = pd.read_csv('../kaggle-competition-2/train_data.csv')
df_test = pd.read_csv('../kaggle-competition-2/test_data.csv')

In [3]:
def preprocessing(df):
    sentences = df.copy()
    # Converting all the upper case to lower case to avoid the distinction between them
    sentences['text'] = df['text'].str.lower()
    # Putting the regex for removing the https and www URLs
    sentences['text'] = sentences['text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))
    sentences['text'] = sentences['text'].apply(lambda x: re.sub(r"www\.[a-z]?\.?(com)+|[a-z]+\.(com)", '', x))

    # Remove the video and links
    sentences['text'] = sentences['text'].apply(lambda x: re.sub(r'{link}', '', x))
    sentences['text'] = sentences['text'].apply(lambda x: re.sub(r"\[video\]", '', x))

    # Remove html reference characters
    sentences['text'] = sentences['text'].apply(lambda x: re.sub(r'&[a-z]+;', '', x))

    # Remove usernames
    sentences['text'] = sentences['text'].apply(lambda x: re.sub(r'@[^\s]+', '', x))

    # Removing numbers
    sentences['text'] = sentences['text'].apply(lambda x: re.sub(r'\d+', '', x))

    # Removing hashmarks, non-letter characters
    sentences['text'] = sentences['text'].apply(lambda x: re.sub(r"[^a-z\s\(\-:\)\\\/\];='#]", '', x))
        
    return sentences

In [4]:
#Preprocess the senteces 
train_proc = preprocessing(df_train)
test_proc = preprocessing(df_test)

In [5]:
#Create the encoder corpus
enc_corp = pd.concat([train_proc['text'],test_proc['text']])


# Word2Vec : CBOW and SkipGram

In [6]:
#Transform sentences to list of words
sentences_corp = enc_corp.apply(lambda x: x.split(' '))
sentences_corp.head()

0           [anyway, im, getting, of, for, a, while, ]
1    [my, red, apache, isn't, feelin, too, well, th...
2    [, you, should, be, , its, great, friday, will...
3    [its, :pm, and, i, dont, wanna, sleep;, so, i,...
4    [why, does, twitter, eat, my, dm's, , not, hap...
Name: text, dtype: object

In [7]:
#Train Word2Vec encoding with Skip-Gram and vector size of 100
word2vec_SkipGram_100d = Word2Vec(sentences=sentences_corp, sg=1, vector_size=100, workers=4)

In [8]:
#Train Word2Vec encoding with Skip-Gram and vector size of 300
word2vec_SkipGram_300d = Word2Vec(sentences=sentences_corp, sg=1, vector_size=300, workers=4)

In [9]:
#Train Word2Vec encoding with Skip-Gram and vector size of 100
word2vec_CBOW_100d = Word2Vec(sentences=sentences_corp, sg=0, vector_size=100, workers=4)

In [10]:
#Train Word2Vec encoding with Skip-Gram and vector size of 300
word2vec_CBOW_300d = Word2Vec(sentences=sentences_corp, sg=0, vector_size=300, workers=4)

In [11]:
#Test our encoding
print(len(list(word2vec_SkipGram_100d.wv.index_to_key)))
print(len(list(word2vec_CBOW_100d.wv.index_to_key)))
print(len(list(word2vec_SkipGram_300d.wv.index_to_key)))
print(len(list(word2vec_CBOW_300d.wv.index_to_key)))

59347
59347
59347
59347


In [17]:
#Find words similar to computer
#We can see that the 300d vector seems less accurate
print(word2vec_SkipGram_100d.wv.most_similar('computer'))
print(word2vec_SkipGram_300d.wv.most_similar('computer'))

[('laptop', 0.9002227783203125), ('pc', 0.8388065099716187), ('comp', 0.8185139298439026), ('hiptop', 0.7889953851699829), ('lappy', 0.7790974378585815), ('compy', 0.7786740660667419), ('crackberry', 0.7741041779518127), ('lappie', 0.7612731456756592), ('internet', 0.7598783373832703), ("phone's", 0.7594239115715027)]
[('laptop', 0.7479316592216492), ('comp', 0.7214232683181763), ('pc', 0.7158175110816956), ('compy', 0.6317026019096375), ('hiptop', 0.6210114359855652), ('lappy', 0.6167007684707642), ('puter', 0.612076461315155), ('lappie', 0.602472722530365), ("computer's", 0.6002691984176636), ('harddrive', 0.5867507457733154)]


In [18]:
#Find words similar to computer
#Once again the 300d vector seems less accurate
print(word2vec_CBOW_100d.wv.most_similar('computer'))
print(word2vec_CBOW_300d.wv.most_similar('computer'))

[('laptop', 0.9102173447608948), ('pc', 0.8555576205253601), ('comp', 0.8389325141906738), ('phone', 0.7491770386695862), ('lappy', 0.743495762348175), ('crackberry', 0.7390615344047546), ('internet', 0.7363808751106262), ('router', 0.7212945818901062), ('keyboard', 0.7173010110855103), ('blackberry', 0.7102299332618713)]
[('laptop', 0.8476789593696594), ('comp', 0.8177416324615479), ('pc', 0.8091326951980591), ('lappy', 0.692457914352417), ('crackberry', 0.6732901930809021), ('keyboard', 0.6669068932533264), ('internet', 0.6639253497123718), ('router', 0.6499288082122803), ('ipod', 0.6441949605941772), ('mbp', 0.6388487219810486)]


In [19]:
#Save the models
word2vec_SkipGram_100d.save('../Encoders/word2vec_SkipGram_100d')
word2vec_CBOW_100d.save('../Encoders/word2vec_CBOW_100d')

In [20]:
#Example of loading model
new_model = Word2Vec.load('../Encoders/word2vec_CBOW_100d')
new_model.wv.most_similar('computer')

[('laptop', 0.9102173447608948),
 ('pc', 0.8555576205253601),
 ('comp', 0.8389325141906738),
 ('phone', 0.7491770386695862),
 ('lappy', 0.743495762348175),
 ('crackberry', 0.7390615344047546),
 ('internet', 0.7363808751106262),
 ('router', 0.7212945818901062),
 ('keyboard', 0.7173010110855103),
 ('blackberry', 0.7102299332618713)]