# Twitter Sentiment Analysis (Deep Learning)

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import time
import functools
import re

# Load the training and validation splits. header=None tells pandas the first
# row is data, not column names, so the columns come back labelled 0, 1, 2, ...
train_df = pd.read_csv('twitter_training.csv', header=None)
val_df = pd.read_csv('twitter_validation.csv', header=None)

def rename_columns(df):
    """
    Give the integer-labelled columns 0-3 descriptive names, in place.

    Args:
        df: DataFrame whose columns are labelled 0, 1, 2 and 3.

    Returns:
        None. ``df`` itself is modified, like the other cleanup helpers
        in this notebook.
    """
    # rename(inplace=True) always returns None, so nothing is returned here.
    df.rename(
        columns={
            0: 'tweet_id',
            1: 'subject',
            2: 'sentiment_class',
            3: 'text',
        },
        inplace=True,
    )

def drop_columns(df):
    """Remove the 'subject' and 'tweet_id' columns from df in place."""
    unused_columns = ['subject', "tweet_id"]
    df.drop(columns=unused_columns, inplace=True)

def convert_types(df):
    """
    Make every value in df['text'] a string, in place.

    Missing values become the empty string and everything else is
    converted with ``str``.

    Args:
        df: DataFrame with a 'text' column.

    Returns:
        None. ``df`` itself is modified.
    """
    # Fill BEFORE casting: astype(str) would turn NaN into the literal text
    # 'nan', leaving nothing for fillna to replace.
    df['text'] = df['text'].fillna('').astype(str)

# Run each cleanup step on the training frame and then the validation frame
# before moving on to the next step.
for cleanup_step in (rename_columns, drop_columns, convert_types):
    for frame in (train_df, val_df):
        cleanup_step(frame)

train_df.head(15)

# Preprocessing

### Description
In this step we will begin to clean the data so that we can make our model more effective in the future. 

The first thing we need to take care of is special characters. Our model can only understand its input as numbers (not text), so our vocabulary shouldn't treat a word like 'good!' as different from 'good'. They mean the same thing, and keeping the special character would only hurt our model, especially once we begin building word embeddings. 

The second thing we need to take care of is capitalization: everything should be in lower case for the same reason as above, so that 'Good' is not treated as different from 'good', for example. 

### Example Cleaning
*Before*:   If you don't know that I'm a huge @ Borderlands fan and Maya is one of my favorite characters,

*After*:    if you dont know that im a huge  borderlands fan and maya is one of my favorite characters 

In [None]:
def preprocessing(df):
    """
    Strip special characters from df['text'] and lowercase it, in place.

    Every character that is not an ASCII letter, a digit or a space is
    removed, then the result is lowercased. Values that are not strings
    are left untouched.

    Args:
        df: DataFrame with a 'text' column.

    Returns:
        None. ``df`` itself is modified.
    """
    special_chars = re.compile(r'[^A-Za-z0-9 ]+')

    def _clean(value):
        # Leave non-string values exactly as they are.
        if not isinstance(value, str):
            return value
        return special_chars.sub('', value).lower()

    # Map over the column rather than writing back with df.loc[i, ...]:
    # enumerate() yields positions, not index labels, so label-based writes
    # hit the wrong rows (or add new ones) when the index is not 0..n-1.
    df['text'] = df['text'].map(_clean)
# Clean the training frame first, then the validation frame.
for frame in (train_df, val_df):
    preprocessing(frame)

train_df

# Tokenization

### Description
This is the process of learning the vocabulary of words that exist in our text column, and then turning the string of words into a numeric representation

### Methods used
*fit_on_texts* is used to update the vocabulary of what exists inside of text

*texts_to_sequences* is used to transform each text into a sequence of integers for our model

*pad_sequences* is a method that I learned about online; it is used to make all sequences the same length. Most texts will have different lengths, so the shorter ones are padded with zeros. 

In [None]:
def tokenize(df, tokenizer=None, max_length=None):
    """
    Turn df['text'] into zero-padded sequences of integer token ids.

    Args:
        df: DataFrame with a 'text' column of strings.
        tokenizer: An already fitted Keras Tokenizer to reuse. When None
            (the default) a new tokenizer is created and fitted on
            df['text'].
        max_length: Value passed to pad_sequences as ``maxlen``. When None
            (the default) the length of the longest sequence in df is used.

    Returns:
        (tokenizer, padded_sequences): the tokenizer that was used and the
        post-padded sequences.
    """
    if tokenizer is None:
        tokenizer = tf.keras.preprocessing.text.Tokenizer()
        tokenizer.fit_on_texts(df['text'])

    sequences = tokenizer.texts_to_sequences(df['text'])
    if max_length is None:
        # default=0 keeps max() from raising ValueError on an empty frame.
        max_length = max((len(seq) for seq in sequences), default=0)

    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=max_length, padding='post'
    )
    return tokenizer, padded_sequences

train_tokenizer, train_sequences = tokenize(train_df)
# Encode the validation text with the vocabulary learned from the training
# text and pad it to the training width. A tokenizer fitted separately on the
# validation set would assign different integers to the same words.
val_tokenizer, val_sequences = tokenize(
    val_df,
    tokenizer=train_tokenizer,
    max_length=train_sequences.shape[1],
)

### Example

In the very first row of the training dataset, the word borderlands appears. The cell below shows how we can look up the integer that represents borderlands in the tokenized vocabulary. 

In the long list of words, borderlands is represented by the number 129, and that number is what appears in our padded training sequence. 

In [None]:
# Show the first training row, then the integer the tokenizer assigned to
# the word 'borderlands'.
first_row = train_df.head(1)
print(first_row, end='\n\n------------------------------\n')
borderlands_id = train_tokenizer.word_index['borderlands']
print("Index in train_sequence:", borderlands_id, end='\n------------------------------')

# Next Steps:

I believe it would be best practice to flatten this into a vector and use it as the input to the model. I will take the day to do some more research before coding along aimlessly.

In [None]:
# Peek at the first two padded training sequences.
first_two_sequences = train_sequences[:2]
print(first_two_sequences)