# Neural Net

We will train a recurrent neural network to do sentiment analysis. We will test which hyper-parameters and which encodings give us the best results.

In [17]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec

## Preprocessing Data

In [10]:
# Read the three competition CSV files (training texts, test texts and
# training labels) into DataFrames, in that order.
df_train, df_test, df_train_labels = map(
    pd.read_csv,
    (
        '../kaggle-competition-2/train_data.csv',
        '../kaggle-competition-2/test_data.csv',
        '../kaggle-competition-2/train_results.csv',
    ),
)

In [12]:
# Replace the string sentiment labels in the 'target' column with integer
# codes, in place. Labels that are not listed here are left untouched.
label_codes = {'negative': 0, 'positive': 1, 'neutral': 2}
for label_name, label_code in label_codes.items():
    is_label = df_train_labels['target'] == label_name
    df_train_labels.loc[is_label, 'target'] = label_code

In [13]:
def preprocessing(df):
    """Return a copy of ``df`` whose ``'text'`` column has been cleaned.

    Each text is lower-cased and then passed through an ordered list of
    regex substitutions that strip URLs, placeholders, HTML character
    references, @usernames, digits and most punctuation, and finally
    squeeze runs of a repeated character down to two.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the cleaned ``'text'`` column; every other
        column is carried over unchanged.

    Notes
    -----
    * Missing or non-string ``text`` values are returned as ``''`` instead
      of raising ``TypeError`` inside ``re.sub``.
    * ``www.``-prefixed URLs are removed as a whole. The earlier pattern
      only allowed a single letter after ``www.``, so ``www.google.com``
      lost just ``google.com`` and left a stray ``ww`` token behind.
    """
    # Ordered (pattern, replacement) rules. Order matters: URLs, mentions
    # and entities have to be removed before the generic character filter
    # strips the punctuation that identifies them.
    rules = (
        # http:// and https:// URLs
        (re.compile(r'https?:\/\/\S+'), ''),
        # www-prefixed URLs (up to the next whitespace) and bare "<name>.com"
        (re.compile(r"www\.\S+|[a-z]+\.com"), ''),
        # literal "{link}" and "[video]" placeholders
        (re.compile(r'\{link\}'), ''),
        (re.compile(r"\[video\]"), ''),
        # HTML character references such as "&amp;"
        (re.compile(r'&[a-z]+;'), ''),
        # @usernames
        (re.compile(r'@[^\s]+'), ''),
        # digits
        (re.compile(r'\d+'), ''),
        # Drop everything except letters, whitespace and the characters
        # ( ) - : \ / ] ; = ' # (so emoticons and '#' are kept).
        (re.compile(r"[^a-z\s\(\-:\)\\\/\];='#]"), ''),
        # Limit runs of the same character to two: daaaang => daang
        (re.compile(r"(.)\1+"), r"\1\1"),
    )

    def _clean(text):
        # Missing values (None / NaN) and other non-strings become ''.
        if not isinstance(text, str):
            return ''
        # Lower-case first so the [a-z]-based patterns see every letter.
        text = text.lower()
        for pattern, replacement in rules:
            text = pattern.sub(replacement, text)
        return text

    sentences = df.copy()
    # Single pass over the rows instead of one pass per rule.
    sentences['text'] = df['text'].apply(_clean)
    return sentences

In [16]:
# Preprocess the sentences: clean the training frame first, then the test frame.
train_proc, test_proc = [preprocessing(frame) for frame in (df_train, df_test)]

In [18]:
# Load the two saved Word2Vec encoders from disk. Judging by the file names
# these are the CBOW and Skip-gram variants with 100-dimensional vectors
# (TODO confirm against the notebook that trained them).
word2vec_CBOW_100d, word2vec_SkipGram_100d = (
    Word2Vec.load(encoder_path)
    for encoder_path in (
        '../Encoders/word2vec_CBOW_100d',
        '../Encoders/word2vec_SkipGram_100d',
    )
)