In [19]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [26]:
# Load the raw training data and discard the 'Id' column in one expression,
# so re-running this cell always rebuilds the frame from the file.
iowa_houses = pd.read_csv('train.csv').drop('Id', axis=1)

In [27]:
#########################################
# Convert All Nulls and Nones to Absent #
#########################################
def fix_nulls(dframe):
    """Return a copy of ``dframe`` with missing values marked as ``"Absent"``.

    Two kinds of cells are rewritten:

    * every null (NaN / None) cell, in every column, becomes the string
      ``"Absent"``;
    * every cell whose value is exactly the string ``"None"`` becomes
      ``"Absent"``.

    Matching on ``"None"`` is done on the whole cell value, not as a regular
    expression, so longer strings that merely contain ``"None"`` are left
    untouched.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the replacements applied.
    """
    # fillna/replace both return new objects, so the caller's frame is safe.
    filled = dframe.fillna("Absent")
    return filled.replace("None", "Absent")

###########################################
# Change Certain Columns to Category Type #
###########################################
def convert_cols(dframe, cols):
    """Return a copy of ``dframe`` with the ``cols`` columns cast to ``category``.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable
        Labels of the columns to cast.

    Returns
    -------
    pandas.DataFrame
        New frame; a column whose cast fails is kept as-is because the cast
        is requested with ``errors='ignore'``.
    """
    category_map = {label: 'category' for label in cols}
    return dframe.astype(category_map, errors='ignore')

#################################
# Log Transform Certain Columns #
#################################
def log_transf(dframe, cols):
    """Return a copy of ``dframe`` with ``log(1 + x)`` applied to ``cols``.

    The transform is computed with ``numpy.log1p`` on the whole column at
    once, which avoids a Python-level call per row and stays accurate for
    values very close to zero (where ``log(x + 1)`` rounds to ``0``).

    Parameters
    ----------
    dframe : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable
        Labels of the numeric columns to transform.

    Returns
    -------
    pandas.DataFrame
        New frame with the listed columns replaced by their transform.
    """
    df = dframe.copy(deep=True)
    for i in cols:
        values = df[i]
        if values.dtype == object:
            # Object columns holding plain numbers need a real float dtype
            # before a NumPy ufunc can be applied to them.
            values = values.astype('float64')
        df[i] = np.log1p(values)
    return df

##########################################
# Fill Missing Entries With 0 In Columns #
##########################################
def make_zeros(dframe, cols):
    """Return a copy of ``dframe`` with missing values in ``cols`` set to 0.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable
        Labels of the columns whose null entries are filled.

    Returns
    -------
    pandas.DataFrame
        New frame; only null (NaN) cells of the listed columns change.
    """
    zero_filled = dframe.copy(deep=True)
    for label in cols:
        column = zero_filled[label]
        zero_filled[label] = column.fillna(0)
    return zero_filled

############################################
# Change Certain Columns to Numerical Type #
############################################
def convert_cols_num(dframe, cols):
    """Return a copy of ``dframe`` with the ``cols`` columns parsed as numbers.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable
        Labels of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        New frame; values that cannot be parsed become NaN because the
        conversion uses ``errors='coerce'``.
    """
    numeric = dframe.copy(deep=True)
    for label in cols:
        coerced = pd.to_numeric(numeric[label], errors='coerce')
        numeric[label] = coerced
    return numeric

###################################
# Convert Entries to Yes/No Value #
###################################
def binarize(dframe, cols):
    """Return a copy of ``dframe`` with ``cols`` collapsed to ``"Yes"``/``"No"``.

    A cell equal to the string ``"Absent"`` becomes ``"No"``; every other
    cell (including nulls) becomes ``"Yes"``.

    Each column is rebuilt in a single vectorised step rather than patched
    in place through a boolean mask, so the function also works on columns
    whose dtype cannot hold the new labels (categorical or numeric columns).

    Parameters
    ----------
    dframe : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable
        Labels of the columns to binarize.

    Returns
    -------
    pandas.DataFrame
        New frame with the listed columns replaced by ``"Yes"``/``"No"``.
    """
    df = dframe.copy(deep=True)
    for i in cols:
        # Nulls in the comparison result count as "not Absent".
        is_absent = np.asarray((df[i] == "Absent").fillna(False), dtype=bool)
        df[i] = np.where(is_absent, "No", "Yes")
    return df

######################################
# Impute NaNs By Neighborhood Median #
######################################
def impute_absents(dframe, col="LotFrontage", by="Neighborhood"):
    """Return a copy of ``dframe`` with missing ``col`` values imputed.

    Each missing entry of ``col`` is replaced by the median of ``col`` over
    the rows that share the same ``by`` value. Entries that are not numeric
    (for example the ``"Absent"`` marker string) are treated as missing.
    Rows whose group has no usable value fall back to the overall median of
    ``col``.

    NOTE(review): this replaces a stub that returned ``None``; the behaviour
    follows the commented-out sketch the stub contained (per-neighborhood
    median of LotFrontage) - confirm it matches the intended imputation.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Input frame; it is not modified.
    col : label, default "LotFrontage"
        Column to impute.
    by : label, default "Neighborhood"
        Column that defines the groups.

    Returns
    -------
    pandas.DataFrame
        New frame in which ``col`` is numeric with missing values imputed.
    """
    df = dframe.copy(deep=True)
    # Non-numeric placeholders such as "Absent" become NaN here.
    values = pd.to_numeric(df[col], errors='coerce')
    # Group on plain objects so a categorical key does not add empty groups.
    group_keys = df[by].astype(object)
    group_median = values.groupby(group_keys).transform('median')
    df[col] = values.fillna(group_median).fillna(values.median())
    return df

In [28]:
# Columns handed to convert_cols (cast to 'category').
cat_cols = [
    "MSSubClass",
    "MSZoning",
    "Street",
    "Alley",
    "LotShape",
    "LandContour",
    "Utilities",
    "LotConfig",
    "LandSlope",
    "Neighborhood",
    "Condition1",
    "Condition2",
    "BldgType",
    "MoSold",
    "YrSold",
]

# Columns handed to convert_cols_num and make_zeros (numeric, nulls -> 0).
num_cols = [
    "LotFrontage",
    "LotArea",
    "MasVnrArea",
]

# Columns handed to binarize ("Absent" -> "No", anything else -> "Yes").
bin_cols = [
    "GarageYrBlt",
    "PoolQC",
    "Fence",
    "MiscFeature",
]

# Columns handed to log_transf.
log_cols = [
    "MasVnrArea",
]

In [29]:
# Build the cleaned frame step by step from the raw data.
cleaned = fix_nulls(iowa_houses)
cleaned.loc[cleaned.MSZoning == "C (all)", "MSZoning"] = 'C'

# Patch two individual cells that fix_nulls marked "Absent".
# The columns are looked up by name: the former hard-coded positions (42 and
# 35) match Electrical / BsmtFinType2 only while the 'Id' column is still
# present (standard Kaggle column order). 'Id' is dropped at load time, so
# those positions pointed one column too far to the right and wrote the
# labels into the neighbouring numeric columns.
cleaned.iloc[1379, cleaned.columns.get_loc("Electrical")] = 'SBrkr'
cleaned.iloc[332, cleaned.columns.get_loc("BsmtFinType2")] = 'GLQ'

cleaned = convert_cols(cleaned, cat_cols)
cleaned = convert_cols_num(cleaned, num_cols)
cleaned = make_zeros(cleaned, num_cols)
cleaned = binarize(cleaned, bin_cols)
cleaned = log_transf(cleaned, log_cols)
# TODO: impute function here. Note that make_zeros above has already replaced
# missing LotFrontage values with 0, so a median imputation would have to run
# before that step to have any effect.

In [30]:
# Preview the first five rows of the cleaned frame.
cleaned.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Absent,Reg,Lvl,AllPub,Inside,...,0,No,No,No,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Absent,Reg,Lvl,AllPub,FR2,...,0,No,No,No,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,Absent,IR1,Lvl,AllPub,Inside,...,0,No,No,No,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,Absent,IR1,Lvl,AllPub,Corner,...,0,No,No,No,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,Absent,IR1,Lvl,AllPub,FR2,...,0,No,No,No,0,12,2008,WD,Normal,250000


In [31]:
# Inspect the dtype of every column after cleaning.
cleaned.dtypes

MSSubClass       category
MSZoning         category
LotFrontage       float64
LotArea             int64
Street           category
Alley            category
LotShape         category
LandContour      category
Utilities        category
LotConfig        category
LandSlope        category
Neighborhood     category
Condition1       category
Condition2       category
BldgType         category
HouseStyle         object
OverallQual         int64
OverallCond         int64
YearBuilt           int64
YearRemodAdd        int64
RoofStyle          object
RoofMatl           object
Exterior1st        object
Exterior2nd        object
MasVnrType         object
MasVnrArea        float64
ExterQual          object
ExterCond          object
Foundation         object
BsmtQual           object
                   ...   
BedroomAbvGr        int64
KitchenAbvGr        int64
KitchenQual        object
TotRmsAbvGrd        int64
Functional         object
Fireplaces          int64
FireplaceQu        object
GarageType  

In [32]:
# Persist the cleaned frame. to_csv writes the row index by default, so the
# file gets an extra unnamed first column holding it.
cleaned.to_csv("cleanedKaggle.csv")