# Neural Net

We will train a recurrent neural network to do sentiment analysis. We will test which hyper-parameters and which encoding give us the best results.

In [11]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

## Preprocessing Data

In [2]:
# Read the three competition CSV files: training texts, test texts and
# the labels that go with the training texts.
df_train, df_test, df_train_labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../kaggle-competition-2/train_data.csv',
        '../kaggle-competition-2/test_data.csv',
        '../kaggle-competition-2/train_results.csv',
    )
)

In [3]:
# Replace the textual sentiment labels in 'target' with integer codes.
# The replacements are applied in place, one label at a time, in this order.
label_codes = {'negative': 0, 'positive': 1, 'neutral': 2}
for label_name, label_code in label_codes.items():
    df_train_labels.loc[df_train_labels['target'] == label_name, 'target'] = label_code

In [4]:
def preprocessing(df):
    """Return a copy of ``df`` whose ``'text'`` column has been cleaned.

    The text is lower-cased and then run through a fixed sequence of regex
    substitutions. The order matters: each pattern sees the output of the
    previous one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column of strings.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which only ``'text'`` differs; ``df`` is not modified.
    """
    substitutions = (
        # http:// and https:// URLs
        (r'https?:\/\/\S+', ''),
        # "www.com"-style strings and "<letters>.com" fragments
        (r"www\.[a-z]?\.?(com)+|[a-z]+\.(com)", ''),
        # literal {link} and [video] placeholders
        (r'{link}', ''),
        (r"\[video\]", ''),
        # HTML character references such as &amp;
        (r'&[a-z]+;', ''),
        # @usernames (everything up to the next whitespace)
        (r'@[^\s]+', ''),
        # digits
        (r'\d+', ''),
        # anything that is not a lowercase letter, whitespace or one of
        # ( - : ) \ / ] ; = ' #   (note that '#' is kept)
        (r"[^a-z\s\(\-:\)\\\/\];='#]", ''),
        # runs of the same character (spaces included) are cut down to two,
        # e.g. daaaang => daang, nooooooo => noo
        (r"(.)\1+", r"\1\1"),
    )

    cleaned = df.copy()
    text = df['text'].str.lower()
    for pattern, replacement in substitutions:
        compiled = re.compile(pattern)
        text = text.apply(lambda x: compiled.sub(replacement, x))
    cleaned['text'] = text

    return cleaned

In [5]:
# Clean both datasets, then turn every text into a list of tokens by
# splitting on single spaces.
train_proc, test_proc = (
    preprocessing(raw_frame).assign(
        text=lambda frame: frame['text'].apply(lambda sentence: sentence.split(' '))
    )
    for raw_frame in (df_train, df_test)
)

In [6]:
# Print the maximum, then the minimum, number of tokens per text across
# the training and test sets together.
train_lengths = train_proc['text'].apply(len)
test_lengths = test_proc['text'].apply(len)
print(max(max(train_lengths), max(test_lengths)))
print(min(min(train_lengths), min(test_lengths)))

62
2


In [7]:
# Load the two saved Word2Vec models (CBOW and Skip-Gram) from disk.
word2vec_CBOW_100d, word2vec_SkipGram_100d = map(
    Word2Vec.load,
    ('../Encoders/word2vec_CBOW_100d', '../Encoders/word2vec_SkipGram_100d'),
)

In [8]:
#Functions to encode a list of words as the mean of its word2vec vectors
def _meanWordVector(listWords, keyedVectors):
    """Average the vectors of the words in ``listWords`` known to ``keyedVectors``.

    Words that are not in the vocabulary are skipped. When no word is known
    (including an empty list) a zero vector of length
    ``keyedVectors.vector_size`` is returned. ``np.mean`` of an empty list
    would instead yield a NaN scalar plus a RuntimeWarning, which breaks the
    "one fixed-length vector per text" shape that the rest of the notebook
    relies on.
    """
    knownVectors = [keyedVectors[word] for word in listWords if word in keyedVectors]
    if not knownVectors:
        # float32 is assumed to match the dtype of the stored vectors -- TODO confirm
        return np.zeros(keyedVectors.vector_size, dtype=np.float32)
    return np.mean(knownVectors, axis=0)


def applyWord2VecCBOW_AVG(listWords):
    """Return the mean CBOW word2vec vector of the words in ``listWords``.

    Uses the module-level ``word2vec_CBOW_100d`` model. Out-of-vocabulary
    words are ignored; if none of the words is known a zero vector is
    returned.
    """
    return _meanWordVector(listWords, word2vec_CBOW_100d.wv)


def applyWord2VecSG_AVG(listWords):
    """Return the mean Skip-Gram word2vec vector of the words in ``listWords``.

    Uses the module-level ``word2vec_SkipGram_100d`` model. Out-of-vocabulary
    words are ignored; if none of the words is known a zero vector is
    returned.
    """
    return _meanWordVector(listWords, word2vec_SkipGram_100d.wv)


In [9]:
# Encode every text of each dataset with both encoders (CBOW and Skip-Gram).
# Each result is a Series holding one averaged vector per text.
CBOW_Train_AVG, SG_Train_AVG = (
    train_proc['text'].apply(encoder)
    for encoder in (applyWord2VecCBOW_AVG, applyWord2VecSG_AVG)
)

CBOW_Test_AVG, SG_Test_AVG = (
    test_proc['text'].apply(encoder)
    for encoder in (applyWord2VecCBOW_AVG, applyWord2VecSG_AVG)
)

In [10]:
#Transform the data into a matrix: one row per text, one column per vector component.
# Each frame is indexed by its OWN source Series. Reusing the training index for
# the test frames raises a shape-mismatch error whenever the test set has a
# different number of rows than the training set, and mislabels rows otherwise.
CBOW_Train_AVG_X = pd.DataFrame(CBOW_Train_AVG.tolist(), index=CBOW_Train_AVG.index)
SG_Train_AVG_X = pd.DataFrame(SG_Train_AVG.tolist(), index=SG_Train_AVG.index)
CBOW_Test_AVG_X = pd.DataFrame(CBOW_Test_AVG.tolist(), index=CBOW_Test_AVG.index)
SG_Test_AVG_X = pd.DataFrame(SG_Test_AVG.tolist(), index=SG_Test_AVG.index)

In [None]:
# Hold out 30% of the training data for validation (70-30 split).
# Both encodings are split with the same random_state and the same labels frame.
(
    (CBOW_train_x, CBOW_valid_x, CBOW_train_y, CBOW_valid_y),
    (SG_train_x, SG_valid_x, SG_train_y, SG_valid_y),
) = (
    train_test_split(feature_matrix, df_train_labels, test_size=0.3, random_state=1)
    for feature_matrix in (CBOW_Train_AVG_X, SG_Train_AVG_X)
)