### Imports

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error

### Functions to Read, Train, Validate and Submit

In [2]:
# Returns the train and test datasets and a sample of the submission file
def read_data(data_dir='input'):
    """Load the competition CSV files.

    Parameters
    ----------
    data_dir : str or path-like, default 'input'
        Directory that contains ``train.csv``, ``test.csv`` and
        ``sample_submission.csv``. The default keeps the original
        hard-coded location, so ``read_data()`` behaves as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, sample)`` in that order.
    """
    # Build every path from one base so the notebook can be pointed at
    # another data location without editing this function.
    train = pd.read_csv(f'{data_dir}/train.csv')
    test = pd.read_csv(f'{data_dir}/test.csv')
    sample = pd.read_csv(f'{data_dir}/sample_submission.csv')
    return train, test, sample

# Returns a trained LGBMRegressor model
def trainlgbm(xtrain, ytrain, n_est=10):
    """Build an ``lgb.LGBMRegressor`` and fit it on the given data.

    Parameters
    ----------
    xtrain : feature matrix passed to ``fit``.
    ytrain : target values passed to ``fit``.
    n_est : int, default 10
        Forwarded to the regressor as ``n_estimators``.

    Returns
    -------
    Whatever ``LGBMRegressor.fit`` returns.
    """
    fitted = lgb.LGBMRegressor(n_estimators=n_est).fit(xtrain, ytrain)
    return fitted

# Validation: print mean_squared_log_error and feature importances
def score(ytrue, ypred, model=None):
    """Print the mean squared log error and, optionally, feature importances.

    Parameters
    ----------
    ytrue : true target values.
    ypred : predicted target values.
    model : optional
        When given, its ``feature_importances_`` attribute is printed too.

    Returns
    -------
    None. The results are only printed.
    """
    # Named ``msle`` rather than ``score`` so the local does not shadow
    # this function's own name.
    msle = mean_squared_log_error(ytrue, ypred)
    print(f'The MSLE score is: {msle:.4f}\n')
    # Identity check: ``!= None`` would call the model's own ``__ne__``.
    if model is not None:
        print(f'The features importances are {model.feature_importances_}\n')

# Make the submission file
def make_submission(model, sample, test, file_name='submission.csv'):
    """Write a submission CSV containing the model's predictions for ``test``.

    Parameters
    ----------
    model : object exposing ``predict``.
    sample : pandas.DataFrame
        Template frame; it is copied, never modified.
    test : data passed to ``model.predict``.
    file_name : str, default 'submission.csv'
        Destination path of the CSV file (written without the index).
    """
    # ``assign`` works on a copy, so ``sample`` itself stays untouched.
    sample.assign(SalePrice=model.predict(test)).to_csv(file_name, index=False)

### Auxiliary functions

In [13]:
# Print the complete description of the columns
def columns_description(path='input/data_description.txt'):
    """Print the full text of the column-description file.

    Parameters
    ----------
    path : str or path-like, default 'input/data_description.txt'
        File to print. The default is the original hard-coded location,
        so ``columns_description()`` behaves as before.
    """
    # The context manager closes the file even if reading or printing fails.
    with open(path) as description:
        print(description.read())

# Print columns names enumerated (one per line)
def columns_names(df):
    """Print every column name of ``df`` preceded by its position, one per line."""
    labels = df.columns
    for position in range(len(labels)):
        print(position, labels[position])

# Returns a list of the column names whose dtype is 'int64' or 'float64'
def columns_numeric(df, drop=True):
    """List the columns of ``df`` whose dtype is 'int64' or 'float64'.

    Parameters
    ----------
    df : pandas.DataFrame
    drop : bool, default True
        When true, 'Id' and 'SalePrice' (the target) are left out of the
        result if they are present.

    Returns
    -------
    list
        Matching column names, in the frame's column order.
    """
    numeric_cols = [
        name for name, dtype in df.dtypes.items()
        if dtype == 'int64' or dtype == 'float64'
    ]
    if drop:
        # Remove by name, not by position: popping the first and last entries
        # discards real features when 'Id'/'SalePrice' are absent and raises
        # IndexError when there are no numeric columns at all.
        numeric_cols = [name for name in numeric_cols
                        if name not in ('Id', 'SalePrice')]
    return numeric_cols

# Returns the reduced df
def df_subset(df, list_col_names):
    """Return ``df`` restricted to the columns named in ``list_col_names``."""
    return df[list_col_names]

# Prints and returns the names of the columns whose feature importance is > N
def fi_greater_than_N(model, train_set, N):
    """Report the columns whose feature importance is strictly greater than ``N``.

    Parameters
    ----------
    model : object exposing ``feature_importances_``.
    train_set : pandas.DataFrame
        Its columns are matched to the importances by position.
    N : threshold; only importances ``> N`` are kept.

    Returns
    -------
    list
        Names of the selected columns. Each one is also printed with its
        position and importance.
    """
    selected = []
    for idx, importance in enumerate(model.feature_importances_):
        if not importance > N:
            continue
        name = list(train_set.columns)[idx]
        print(f'{idx}: ', name, importance)
        selected.append(name)
    return selected

# Prints each column name followed by the first two values of that column, one column per line
def cols_example(df):
    """Print each column name followed by the column's first two values.

    One line is printed per column: the name left-aligned in 15 characters,
    then up to two sample values, each left-aligned in 10 characters.
    Frames with fewer than two rows print only the values that exist.
    """
    for item in df.columns:
        # ``.iloc`` is positional, so this works whatever the index labels
        # are; ``df[item][0]`` looked up the *label* 0 and failed on any
        # frame without a default RangeIndex.
        head = df[item].iloc[:2]
        # ``str()`` first: values such as None reject a format spec like '<10'.
        samples = ', '.join(f'{str(value):<10}' for value in head)
        print(f'{item:<15}: {samples}')

### Functions for feature engineering

In [4]:
# Returns a new dataframe containing only the given categorical columns, each one label-encoded
def encode_data(df, list_categorical_features):
    """Label-encode the selected columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
    list_categorical_features : list
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        Only the selected columns, each transformed by
        ``LabelEncoder.fit_transform``. The other columns of ``df`` are not
        included in the result.
    """
    # ``apply`` calls ``fit_transform`` once per column, so the single encoder
    # is refitted for every column.
    label_encoder = LabelEncoder()
    selected = df[list_categorical_features]
    return selected.apply(label_encoder.fit_transform)

# Prints the name of the column and the percentage of NaN values in it
def percentage_nan(df, list_cols):
    """Print the percentage of NaN values in each of the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
    list_cols : iterable
        Names of the columns to report on.

    A header line is printed first, then one line per column with the name
    left-aligned in 15 characters and the percentage to two decimals.
    """
    print('Percentage of NaN values\n')
    for item in list_cols:
        # The mean of the boolean mask is the NaN fraction, computed in pandas
        # instead of a Python-level ``sum``. On an empty frame it gives NaN
        # rather than raising ZeroDivisionError.
        perc = df[item].isna().mean()
        print(f'{item:<15}{perc * 100:.2f}%')

### Starting...

In [5]:
# Read and prepare data
train, test, sample = read_data()
# Target column on its own; the feature frames drop the identifier
# (and, for the training set, the target as well).
ytrain_base = train['SalePrice']
xtrain_base = train.drop(columns=['Id', 'SalePrice'])
xtest_base = test.drop(columns=['Id'])

### Categorical Features
Based on the data description file, the features listed below are categorical. This is only a preliminary analysis: the dataset also contains datetime-like columns, and we have not yet checked how much data is missing in each column.

In [6]:
# Column names treated as categorical features (order matters for column selection).
cat_feat = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley',
    'LotShape', 'LandContour', 'Utilities', 'LotConfig',
    'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
    'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
    'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
    'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC',
    'Fence', 'MiscFeature', 'SaleType', 'SaleCondition',
]

In [7]:
# Select the categorical columns and mark missing entries with an explicit
# 'missing' label, for both the train and the test features.
xtrain_cat = xtrain_base[cat_feat].fillna('missing')
xtest_cat = xtest_base[cat_feat].fillna('missing')

### Numerical features

This is a deliberately simple selection of numerical features: every column whose dtype is 'int64' or 'float64' is treated as numerical. As a result, the selection still includes datetime-like columns and categorical columns that are already stored as numbers (such as grades from 1 to 10).

In [17]:
# drop=False: 'Id' and 'SalePrice' were already removed from xtrain_base,
# so every int64/float64 column is kept.
num_feat = columns_numeric(xtrain_base, drop=False)
# Missing numeric values are replaced with 0.
xtrain_num = xtrain_base[num_feat].fillna(0)