In [22]:
import numpy as np
import pandas as pd
from pandas import Categorical
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

In [23]:
# Load the joined dataset from the CSV file next to this notebook.
data = pd.read_csv(filepath_or_buffer='./joined_data.csv')

Categorize the features into four categories:
    - Discarded (e.g. NumMosquitos, Date)
    - Supportive (e.g. ID, Dataset, WnvPresent)
    - Numeric
    - Categorical

We'll then produce three dataframes:
    - One dataframe for the supportive columns.
    - One dataframe for the numeric columns, where each column has been
      normalized to have mean 0 and standard deviation 1.
    - One dataframe for the categorical columns, where we'll use 
      LabelBinarizer() to expand them out into binary columns.

Then we'll join the three dataframes together, column-wise, into a single binarized dataframe containing all of our desired features.

In [24]:
# Columns we intentionally leave out of the output. This list is kept for
# reference; the columns are excluded simply by not appearing in any of the
# three lists below.
features_to_delete = [
    'Date',
    'Address',
    'Block',
    'Trap',
    'Street',
    'AddressNumberAndStreet',
    'AddressAccuracy',
    'NumMosquitos',
    'Sunrise',
    'Sunset',
]

# Columns carried through unchanged (neither normalized nor binarized).
support_features = [
    'Dataset',
    'ID',
    'WnvPresent',
]

# Columns to be standardized.
numeric_features = [
    'Latitude',
    'Longitude',
    'Tmax',
    'Tmin',
    'Tavg',
    'Depart',
    'DewPoint',
    'WetBulb',
    'Heat',
    'Cool',
    'PrecipTotal',
    'StnPressure',
    'SeaLevel',
    'ResultSpeed',
    'ResultDir',
    'AvgSpeed',
    'SprayIntensity',
]

# Columns to be expanded into one binary column per distinct value.
categorical_features = [
    'Year',
    'Month',
    'YearMonth',
    'Week',
    'Species',
    'CodeSum',
]

In [25]:
# Split the raw frame into its three column groups; each selection is a new
# frame holding only the listed columns, in list order.
support = data.loc[:, support_features]
numerics = data.loc[:, numeric_features]
categoricals = data.loc[:, categorical_features]

In [26]:
# Standardize every numeric column: subtract the column mean, then divide by
# the column standard deviation (pandas' default sample std, ddof=1).
column_means = numerics.mean()
column_stds = numerics.std()
numerics = numerics.sub(column_means, axis="columns").div(column_stds, axis="columns")

Now we'll label-encode and binarize the categorical features, expanding each one into a set of 0/1 columns named `<feature>_<value>`.

In [27]:
# One-hot encode every categorical column.
#
# Each column is label-encoded (LabelEncoder sorts the distinct values, so
# integer code i corresponds to le.classes_[i]) and then expanded with
# LabelBinarizer into 0/1 columns named "<column>_<value>".
binarized_frames = []
for col in categorical_features:
    le = LabelEncoder()
    codes = le.fit_transform(data[col])
    onehot = LabelBinarizer().fit_transform(codes)
    if len(le.classes_) == 2:
        # For two-class input LabelBinarizer emits a single 0/1 column
        # (1 == second class). Expand it to one column per class so the
        # array width matches the two column names generated below;
        # otherwise the DataFrame constructor raises a shape error.
        onehot = np.hstack([1 - onehot, onehot])
    binarized_frames.append(
        pd.DataFrame(
            data=onehot,
            columns=[col + "_" + str(feature) for feature in le.classes_],
            # Reuse the source index so the column-wise concat below (and the
            # later join with the other frames) aligns rows by label instead
            # of relying on `data` having a default RangeIndex.
            index=data.index,
        )
    )

# Build the result from the raw columns in `data` in a single concat instead
# of dropping from / appending to `categoricals` on every iteration: the cell
# no longer mutates the frame it iterates over, and it can be re-run as long
# as `data` still holds the raw categorical columns.
categoricals = pd.concat(binarized_frames, axis=1)

In [34]:
# Stitch the three frames back together side by side (support columns first,
# then numerics, then categoricals). Note this rebinds `data`.
data = pd.concat(objs=[support, numerics, categoricals], axis="columns")

In [39]:
# Write the combined frame to disk, omitting the index column.
data.to_csv(path_or_buf="./binarized_data.csv", index=False)