In [3]:
# Get all the necessary libraries and data
from __future__ import print_function
import os

import pandas as pd
import seaborn as sb
import numpy as np

# Labelled training set. The 'Id' column is removed straight after loading
# because, per the original author, it contributes nothing to the learner.
data = pd.read_csv("csv-download/train.csv").drop(columns='Id')

# Unlabelled competition test set, loaded and stripped of 'Id' the same way.
sub_data = pd.read_csv("csv-download/test.csv").drop(columns='Id')

In [4]:
# Fill all unknown values with 0 or "Unknown" so it can be properly one-hot encoded
def _fill_missing(frame):
    """Replace missing values in every column of ``frame`` and return it.

    Columns whose dtype is ``object`` get the placeholder string "Unknown";
    every other column gets 0. Columns are reassigned on ``frame`` itself,
    so the caller's DataFrame is modified.

    The dtype test uses the builtin ``object``: the ``np.object`` alias was
    deprecated in NumPy 1.20 and removed in 1.24, where it raises
    AttributeError.
    """
    for col in frame.columns:
        if frame[col].dtype == object:
            frame[col] = frame[col].fillna("Unknown")
        else:
            frame[col] = frame[col].fillna(0)
    return frame


data = _fill_missing(data)

# Same for submission data
sub_data = _fill_missing(sub_data)

In [5]:
# Find all data columns that need to be one-hot encoded.
# Compare against the builtin `object`: the `np.object` alias was removed in
# NumPy 1.24 and raises AttributeError there.
mask = data.dtypes == object
mask['MSSubClass'] = True  # This one is all numbers but still uses types - needs to be one-hot encoded as well
categorical_cols = data.columns[mask]

# Number of distinct values per categorical column, largest first
num_ohc_cols = data[categorical_cols].nunique().sort_values(ascending=False)

# Don't one-hot encode if there's only one type; for the rest, take
# (distinct values - 1) per column
small_num_ohc_cols = num_ohc_cols.loc[num_ohc_cols > 1] - 1

# Last expression of the cell: the total of those per-column counts is displayed
small_num_ohc_cols.sum()

# This mask will be used for both data and sub_data to ensure they are both similarly one-hot encoded

239

In [6]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# One-hot encode every column listed in num_ohc_cols, for both the training
# data and the submission data, using encoders fitted on the union of the two
# so that both frames end up with the same encoded columns.
data_ohc = data.copy()
sub_data_ohc = sub_data.copy()

le = LabelEncoder()
ohc = OneHotEncoder()

# Encoded blocks are collected here and concatenated once after the loop,
# instead of dropping/concatenating on the growing frames in every iteration.
encoded_frames = []
encoded_sub_frames = []

for col in num_ohc_cols.index:
    # Integer encode the categories; fit on train + submission values so that
    # a category present in only one of them is still known to the encoder.
    le.fit(np.concatenate((data_ohc[col], sub_data_ohc[col]), axis=None))
    # Builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24
    dat = le.transform(data_ohc[col]).astype(int)
    sub_dat = le.transform(sub_data_ohc[col]).astype(int)

    # One hot encode the data--this returns a sparse array
    ohc.fit(np.concatenate((dat, sub_dat), axis=None).reshape(-1, 1))
    new_dat = ohc.transform(dat.reshape(-1, 1))
    new_sub_dat = ohc.transform(sub_dat.reshape(-1, 1))

    # Create unique column names: "<column>_<category>". Both matrices come
    # from the same fitted encoder, so one list of names serves both; the
    # integer code i corresponds to le.classes_[i].
    col_names = ['_'.join([col, str(label)]) for label in le.classes_]

    # Create the new dataframes
    new_df = pd.DataFrame(new_dat.toarray(),
                          index=data_ohc.index,
                          columns=col_names)
    new_sub_df = pd.DataFrame(new_sub_dat.toarray(),
                              index=sub_data_ohc.index,
                              columns=col_names)
    encoded_frames.append(new_df)
    encoded_sub_frames.append(new_sub_df)

# Remove the original columns and append the encoded ones. The resulting
# column order is: untouched columns first, then the encoded blocks in
# num_ohc_cols.index order.
encoded_source_cols = list(num_ohc_cols.index)
data_ohc = pd.concat([data_ohc.drop(columns=encoded_source_cols)] + encoded_frames, axis=1)
sub_data_ohc = pd.concat([sub_data_ohc.drop(columns=encoded_source_cols)] + encoded_sub_frames, axis=1)

In [5]:
from sklearn.model_selection import train_test_split

# Detach the target: pop() removes 'SalePrice' from data_ohc and hands it back,
# so data_ohc holds only the remaining columns from here on.
salePrice = data_ohc.pop('SalePrice')

# First cut: 80% of the rows are kept for fitting, the other 20% become the test set.
X_rest, X_test, y_rest, y_test = train_test_split(
    data_ohc, salePrice, train_size=0.8, random_state=42
)

# Second cut: 75% of the kept rows are used for training and 25% for
# cross-validation (roughly 60% / 20% of all rows).
X_train, X_cv, y_train, y_cv = train_test_split(
    X_rest, y_rest, train_size=0.75, random_state=42
)

In [9]:
# Export it all to .txt files
out_dir = 'txt-data'
# np.savetxt does not create missing directories, so make sure the target exists
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# (file name, object to dump) pairs. Note that 'yfinal.txt' receives the
# encoded submission *features* (sub_data_ohc), not target values.
exports = [
    ('X.txt', X_train),
    ('Xtest.txt', X_test),
    ('Xcv.txt', X_cv),
    ('y.txt', y_train),
    ('ytest.txt', y_test),
    ('ycv.txt', y_cv),
    ('yfinal.txt', sub_data_ohc),
]

# NOTE(review): fmt='%d' writes integers, so any fractional part is truncated --
# confirm that every exported column is integer-valued.
for file_name, values_source in exports:
    np.savetxt(os.path.join(out_dir, file_name), values_source.values, fmt='%d')

#  In order to export to csv files
# X_train.to_csv('processed_Xtrain.csv', index=False)
# X_test.to_csv('processed_Xtest.csv', index=False)
# X_cv.to_csv('processed_Xcv.csv', index=False)
# y_train.to_csv('processed_ytrain.csv', index=False)
# y_test.to_csv('processed_ytest.csv', index=False)
# y_cv.to_csv('processed_ycv.csv', index=False)
# sub_data_ohc.to_csv('processed_sub_test.csv', index=False)