### Imports

In [47]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error

### Functions to Read, Train, Validate and Submit

In [48]:
def read_data(data_dir='input'):
    """Load the train set, the test set and the sample submission file.

    Parameters
    ----------
    data_dir : str or os.PathLike, default 'input'
        Directory that contains ``train.csv``, ``test.csv`` and
        ``sample_submission.csv``. The default keeps the original
        hard-coded location, so ``read_data()`` behaves as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, sample)``, in that order.
    """
    # Local import so the notebook's imports cell does not need to change.
    from pathlib import Path

    base = Path(data_dir)
    train = pd.read_csv(base / 'train.csv')
    test = pd.read_csv(base / 'test.csv')
    sample = pd.read_csv(base / 'sample_submission.csv')
    return train, test, sample

def trainlgbm(xtrain, ytrain, n_est=10):
    """Fit an ``lgb.LGBMRegressor`` and return the result of its ``fit`` call.

    Parameters
    ----------
    xtrain : features passed to ``fit``.
    ytrain : target passed to ``fit``.
    n_est : int, default 10
        Forwarded as ``n_estimators`` to the regressor constructor.
    """
    regressor = lgb.LGBMRegressor(n_estimators=n_est)
    fitted = regressor.fit(xtrain, ytrain)
    return fitted

def score(ytrue, ypred, model=None):
    """Print the mean squared log error and, optionally, feature importances.

    Parameters
    ----------
    ytrue : array-like
        True target values, passed to ``mean_squared_log_error``.
    ypred : array-like
        Predicted target values, passed to ``mean_squared_log_error``.
    model : object, optional
        When given, its ``feature_importances_`` attribute is printed too.

    Returns
    -------
    None
        The results are only printed.
    """
    # Named `msle` so the local does not shadow the function's own name.
    msle = mean_squared_log_error(ytrue, ypred)
    print(f'The MSLE score is: {msle:.4f}\n')
    # Identity check: `!= None` would call the model's own comparison operator,
    # which may be overridden or return a non-boolean.
    if model is not None:
        featureimportances = model.feature_importances_
        print(f'The features importances are {featureimportances}\n')

def make_submission(model, sample, test, file_name='submission.csv'):
    """Write a submission CSV with the model's predictions for ``test``.

    A copy of ``sample`` gets its 'SalePrice' column set to
    ``model.predict(test)`` and is saved to ``file_name`` without the index.
    ``sample`` itself is left unchanged.
    """
    output = sample.copy()
    output['SalePrice'] = model.predict(test)
    output.to_csv(file_name, index=False)

### Auxiliary functions

In [49]:
def columns_description(path='input/data_description.txt'):
    """Print the full text of the column description file.

    Parameters
    ----------
    path : str or os.PathLike, default 'input/data_description.txt'
        File to print. The default keeps the original hard-coded location.
    """
    # `with` closes the file even if reading or printing raises.
    with open(path) as description:
        print(description.read())

def columns_names(df):
    """Print every column name of ``df`` with its position, one per line."""
    for position, name in enumerate(df.columns):
        print(position, name)

def columns_numeric(df, drop=True):
    """Return the names of the columns whose dtype is 'int64' or 'float64'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column dtypes are inspected.
    drop : bool, default True
        When true, 'Id' and 'SalePrice' (the target) are left out of the
        result if they are present.

    Returns
    -------
    list
        Matching column names, in the frame's column order.
    """
    numeric_cols = [
        name for name, dtype in df.dtypes.items()
        if dtype == 'int64' or dtype == 'float64'
    ]
    if drop:
        # Exclude by name rather than by position: popping the first and last
        # entries removes the wrong columns when 'Id' or 'SalePrice' is absent
        # and raises IndexError when there are no numeric columns.
        excluded = {'Id', 'SalePrice'}
        numeric_cols = [name for name in numeric_cols if name not in excluded]
    return numeric_cols

def df_subset(df, list_col_names):
    """Return ``df`` restricted to the columns named in ``list_col_names``."""
    return df[list_col_names]

def fi_greater_than_N(model, train_set, N):
    """Print and return the columns whose feature importance exceeds ``N``.

    ``model.feature_importances_`` is matched to ``train_set.columns`` by
    position. For every importance strictly greater than ``N`` the position,
    column name and importance are printed, and the name is collected.

    Returns
    -------
    list
        The selected column names, in positional order.
    """
    selected = []
    for idx, importance in enumerate(model.feature_importances_):
        if importance > N:
            name = list(train_set.columns)[idx]
            print(f'{idx}: ', name, importance)
            selected.append(name)
    return selected

def cols_example(df):
    """Print each column name followed by its first (up to two) values.

    One line is printed per column: the name left-aligned in 15 characters,
    then the example values, each left-aligned in 10 characters and separated
    by ', '.
    """
    for position, item in enumerate(df.columns):
        # Positional slice: works for any index labels and for frames with
        # fewer than two rows, where label lookups `[0]` / `[1]` would raise.
        samples = df.iloc[:2, position]
        # str() first, so values without alignment support (e.g. None) print too.
        shown = ', '.join(f'{str(value):<10}' for value in samples)
        print(f'{str(item):<15}: {shown}')

def find_dup(list_1, list_2):
    """Return the set of items that appear in both ``list_1`` and ``list_2``."""
    first = set(list_1)
    second = set(list_2)
    return first.intersection(second)

### Functions for feature engineering

In [50]:
def encode_data(df, list_categorical_features):
    """Return a frame holding only the label-encoded categorical columns.

    Each column named in ``list_categorical_features`` is passed through
    ``LabelEncoder.fit_transform`` on its own. Columns of ``df`` that are not
    listed are not part of the result.

    Because every column is fit on the values present in ``df``, the codes are
    only comparable between rows that were encoded in the same call.
    """
    categorical = df[list_categorical_features]
    return categorical.apply(lambda column: LabelEncoder().fit_transform(column))

def percentage_nan(df, list_cols):
    """Print, for each listed column, the percentage of NaN values it holds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    list_cols : iterable
        Names of the columns to report on.

    A header line is printed first, then one line per column with the name
    left-aligned in 15 characters and the percentage with two decimals.
    """
    print('Percentage of NaN values\n')
    for item in list_cols:
        column = df[item]
        # The mean of the boolean mask is the NaN fraction. A zero-row column
        # is reported as 0% instead of dividing by zero.
        perc = column.isna().mean() if len(column) else 0.0
        print(f'{item:<15}{perc * 100:.2f}%')

### Starting...

In [51]:
# Load the raw files, then separate the target from the feature columns
train, test, sample = read_data()
ytrain_base = train['SalePrice']
ytrain = ytrain_base  # the target is used unchanged
xtrain_base = train.drop(columns=['Id', 'SalePrice'])
xtest_base = test.drop(columns=['Id'])

### Categorical Features
Analysing the data description file, we can see that the following are the categorical features. This is just a preliminary analysis: we still have datetime data to deal with and we haven't checked how much data is missing, for example.

In [52]:
# Names of the columns handled as categorical features. Later cells select
# these columns, fill their missing values with 'missing' and label-encode them.
cat_feat = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 
 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 
 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 
 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 
 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']

In [53]:
# Categorical subsets; missing values become their own 'missing' category
xtrain_cat = xtrain_base[cat_feat].fillna('missing')
xtest_cat = xtest_base[cat_feat].fillna('missing')

### Numerical features

Here we use a lazy selection of numerical features: every column whose dtype is 'int64' or 'float64' is treated as numerical. Note that this selection also contains datetime data and categorical data that were already encoded as numbers (like grades 1-10).

In [54]:
# Numerical subsets (selected by dtype on the training frame); missing values become 0
num_feat = columns_numeric(xtrain_base, drop=False)
xtrain_num = xtrain_base[num_feat].fillna(0)
xtest_num = xtest_base[num_feat].fillna(0)

### Duplicate column names
Because of the very simple way I separated numerical and categorical features, some columns can end up in both groups. Let's keep them only in the categorical group.

In [55]:
# Columns present in both the categorical and the numerical subsets
find_dup(list_1=xtrain_cat.columns, list_2=xtrain_num.columns)

{'MSSubClass'}

In [56]:
# Drop every column that is also in the categorical subset. The list is computed
# instead of hard-coded, so re-running this cell is a no-op rather than a KeyError.
dup_cols = sorted(find_dup(xtrain_cat.columns, xtrain_num.columns))
xtrain_num = xtrain_num.drop(columns=dup_cols)
xtest_num = xtest_num.drop(columns=dup_cols)

### Encoding categorical features

In [57]:
# Preview the first five rows of the categorical training frame before encoding
xtrain_cat.head(n=5)

Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,60,RL,Pave,missing,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,Attchd,RFn,TA,TA,Y,missing,missing,missing,WD,Normal
1,20,RL,Pave,missing,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,Attchd,RFn,TA,TA,Y,missing,missing,missing,WD,Normal
2,60,RL,Pave,missing,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,Attchd,RFn,TA,TA,Y,missing,missing,missing,WD,Normal
3,70,RL,Pave,missing,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,Detchd,Unf,TA,TA,Y,missing,missing,missing,WD,Abnorml
4,60,RL,Pave,missing,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,Attchd,RFn,TA,TA,Y,missing,missing,missing,WD,Normal


In [58]:
# Encode train and test in a single call so that every category gets the same
# integer code in both sets. Encoding them separately fits each set on its own
# values, which can assign different codes to the same label.
combined_cat = pd.concat([xtrain_cat, xtest_cat], keys=['train', 'test'])
combined_cat = encode_data(combined_cat, cat_feat)
# Selecting the outer key restores each set with its original row index.
xtrain_cat = combined_cat.loc['train']
xtest_cat = combined_cat.loc['test']
print(xtrain_cat.head())

MSSubClass  MSZoning  Street  Alley  LotShape  LandContour  Utilities  \
0           5         3       1      2         3            3          0   
1           0         3       1      2         3            3          0   
2           5         3       1      2         0            3          0   
3           6         3       1      2         0            3          0   
4           5         3       1      2         0            3          0   

   LotConfig  LandSlope  Neighborhood  ...  GarageType  GarageFinish  \
0          4          0             5  ...           1             1   
1          2          0            24  ...           1             1   
2          4          0             5  ...           1             1   
3          0          0             6  ...           5             2   
4          2          0            15  ...           1             1   

   GarageQual  GarageCond  PavedDrive  PoolQC  Fence  MiscFeature  SaleType  \
0           4           4         

In [59]:
# Final feature frames: numerical columns first, encoded categorical columns after
xtrain = xtrain_num.join(xtrain_cat, how='left')
xtest = xtest_num.join(xtest_cat, how='left')
xtrain.head(n=5)

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,65.0,8450,7,5,2003,2003,196.0,706,0,150,...,1,1,4,4,2,3,4,4,8,4
1,80.0,9600,6,8,1976,1976,0.0,978,0,284,...,1,1,4,4,2,3,4,4,8,4
2,68.0,11250,7,5,2001,2002,162.0,486,0,434,...,1,1,4,4,2,3,4,4,8,4
3,60.0,9550,7,5,1915,1970,0.0,216,0,540,...,5,2,4,4,2,3,4,4,8,0
4,84.0,14260,8,5,2000,2000,350.0,655,0,490,...,1,1,4,4,2,3,4,4,8,4


### Model training, evaluation, test prediction and submission file

In [60]:
# Fit the final model on the full training frame
model = trainlgbm(xtrain=xtrain, ytrain=ytrain, n_est=1000)

In [61]:
# Predictions on xtrain, the same frame the model was fitted on,
# so these are in-sample predictions.
valid_predict = model.predict(xtrain)

In [62]:
# In-sample score: `valid_predict` comes from the rows the model was trained on,
# so this number is optimistic and does not measure generalisation.
score(ytrain, valid_predict, model)

# Hold-out score: refit on 80% of the rows and evaluate on the unseen 20%.
xfit, xhold, yfit, yhold = train_test_split(xtrain, ytrain, test_size=0.2, random_state=0)
holdout_model = trainlgbm(xfit, yfit, n_est=1000)
score(yhold, holdout_model.predict(xhold), holdout_model)

The MSLE score is: 0.0000

The features importances are [1195 1840  456  230  791  879 1134 1742  154 1418 1765 2007 1233   18
 1883  128   18  201   71  113   43  402  160  773   88 1808  736 1518
  332   40   79    0   14 1022  472  183  115    0   23  188   63    0
  104   10  727  341    0   80   54   78   43  279  277  139  264   50
   45  139   43  344  172   98   17  174   47   22  140   41  183  101
  146   14   32   46    0   84    4  164  163]



In [63]:
# NOTE(review): make_submission calls model.predict on its `test` argument itself,
# so `test_predict` is not read by the submission cell in this notebook.
test_predict = model.predict(xtest)

In [64]:
# Write the predictions for the test set to the default 'submission.csv'
make_submission(model=model, sample=sample, test=xtest)