# Cleaning up data
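Combines the labelled training files into a single dataset and removes rows whose `Date` value is missing. Then adds the derived feature columns (comment-length and date-part features) to the combined training data and to both test sets, and saves the results.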

In [1]:
import pandas as pd

# Read the two labelled CSV files and stack them into one training frame.
c1 = pd.read_csv("../toxic_comment_data/train.csv")
c2 = pd.read_csv("../toxic_comment_data/test_with_solutions.csv")
comments = pd.concat([c1, c2])

# Keep only the rows that have a Date, then discard the 'Usage' column.
comments = comments[comments['Date'].notnull()].drop(columns=['Usage'])

# Two further files, loaded as separate test frames (judging by the file
# names: the verification set and its labels).
test = pd.read_csv("../toxic_comment_data/impermium_verification_set.csv")
test2 = pd.read_csv("../toxic_comment_data/impermium_verification_labels.csv")

In [2]:
def true_length(input):
    """Return the length of a comment after resolving backslash escapes.

    The text is treated as containing literal escape sequences (for example
    the six characters ``\\xa0`` or the two characters ``\\n``); each
    sequence is decoded to the single character it denotes before counting.

    Args:
        input: The raw comment. Anything that is not a ``str`` (e.g. a NaN
            from a missing cell) is not measured.

    Returns:
        The number of characters in the decoded text, or ``-1`` when
        ``input`` is not a string.

    Raises:
        UnicodeDecodeError: If the text contains a malformed escape sequence
            (for instance a trailing lone backslash or a truncated ``\\x``).
    """
    if not isinstance(input, str):
        return -1  # arbitrary, should not ever happen with this dataset

    # A strict ASCII encode raises UnicodeEncodeError on any non-ASCII
    # character. 'backslashreplace' instead rewrites such a character as its
    # own escape sequence, which the unicode-escape decode below turns back
    # into the same single character. Pure-ASCII input is unaffected.
    # NOTE: a literal backslash directly followed by a non-ASCII character
    # will pair up with the generated escape and be counted differently.
    escaped = input.encode('ascii', 'backslashreplace')
    return len(escaped.decode('unicode-escape'))
    
def add_feature_columns(dataframe):
    """Add comment-length and date-part feature columns to ``dataframe``.

    The frame is modified in place and also returned, so both
    ``df = add_feature_columns(df)`` and a bare call work.

    Columns read:
        ``Comment`` - the comment text.
        ``Date``    - timestamps formatted as ``%Y%m%d%H%M%SZ``.

    Columns written:
        ``Raw_Length``       - ``str.len()`` of the comment as stored.
        ``True_Length``      - length after escape decoding (``true_length``).
        ``Norm_True_Length`` - ``True_Length`` min-max scaled over this frame.
        ``Date``             - replaced by its parsed datetime values.
        ``Weekday``, ``Day``, ``Month``, ``Year``, ``Hour``, ``Minute``,
        ``Second``           - the corresponding parts of ``Date``.

    Args:
        dataframe: A pandas DataFrame with ``Comment`` and ``Date`` columns.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    # Add length features
    dataframe['Raw_Length'] = dataframe['Comment'].str.len()
    dataframe['True_Length'] = dataframe['Comment'].apply(true_length)

    # Min-max scale True_Length using this frame's own minimum and maximum.
    true_len = dataframe['True_Length']
    col_min = true_len.min()
    span = true_len.max() - col_min
    if span > 0:
        dataframe['Norm_True_Length'] = (true_len - col_min) / span
    else:
        # Every length is identical (or the frame is empty): the scaled value
        # is undefined, so use 0.0 rather than dividing by zero.
        dataframe['Norm_True_Length'] = 0.0

    # Add date features
    dataframe['Date'] = pd.to_datetime(dataframe['Date'], format='%Y%m%d%H%M%SZ')

    dates = dataframe['Date']
    # Each new column is named after the ``.dt`` attribute it is taken from.
    for column in ('Weekday', 'Day', 'Month', 'Year', 'Hour', 'Minute', 'Second'):
        dataframe[column] = getattr(dates.dt, column.lower())

    return dataframe

In [3]:
# Add the feature columns to the training frame and to both test frames,
# in that order, rebinding each name to the frame the function returns.
comments, test, test2 = (
    add_feature_columns(frame) for frame in (comments, test, test2)
)

In [4]:
# Persist the three frames. NOTE: these files are written in pickle format
# even though their names end in '.csv'; read them back with pd.read_pickle,
# not pd.read_csv.
pd.to_pickle(comments, '../toxic_comment_data/combined_train_data.csv')
pd.to_pickle(test, '../toxic_comment_data/unlabeled_test_set_with_features.csv')
pd.to_pickle(test2, '../toxic_comment_data/labeled_test_set_with_features.csv')

In [5]:
# Reload the combined training frame from the pickle saved earlier in this
# notebook. That file is named 'combined_train_data.csv' (pickle content
# despite the suffix), so the path here must include the '.csv' suffix.
comments = pd.read_pickle('../toxic_comment_data/combined_train_data.csv')