In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error

In [15]:
def read_data(input_dir='input'):
    """Load the train, test and sample-submission CSV files.

    Parameters
    ----------
    input_dir : str or path-like, default 'input'
        Directory that contains ``train.csv``, ``test.csv`` and
        ``sample_submission.csv``. The default keeps the original
        hard-coded location.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, sample)``, in that order.
    """
    # Plain '/' joining keeps the default paths identical to the original
    # literals ('input/train.csv', ...).
    train = pd.read_csv(f'{input_dir}/train.csv')
    test = pd.read_csv(f'{input_dir}/test.csv')
    sample = pd.read_csv(f'{input_dir}/sample_submission.csv')
    return train, test, sample

def columns_description(path='input/data_description.txt'):
    """Print the complete description of the columns.

    Parameters
    ----------
    path : str or path-like, default 'input/data_description.txt'
        Text file whose whole content is printed to stdout.

    Returns
    -------
    None
    """
    # The context manager closes the file even if read() or print() raises.
    with open(path) as description:
        print(description.read())

def columns_names(df):
    """Print every column label of ``df`` on its own line, prefixed by its position."""
    labels = list(df.columns)
    for position in range(len(labels)):
        print(position, labels[position])

def columns_numeric(df, drop=True, exclude=('Id', 'SalePrice')):
    """Return the names of the columns whose dtype is 'int64' or 'float64'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    drop : bool, default True
        When true, the names listed in ``exclude`` are removed from the result.
    exclude : iterable of column names, default ('Id', 'SalePrice')
        Names removed when ``drop`` is true (the identifier and the target).
        Names that are not present are ignored.

    Returns
    -------
    list
        Matching column names, in the order they appear in ``df``.
    """
    numeric_cols = [
        name
        for name, dtype in df.dtypes.items()
        if dtype == 'int64' or dtype == 'float64'
    ]
    if drop:
        # Remove by name rather than by position: popping the first/last entry
        # discards a real feature when 'Id' or 'SalePrice' is absent (e.g. a
        # frame without the target) and fails on an empty list.
        excluded = set(exclude)
        numeric_cols = [name for name in numeric_cols if name not in excluded]
    return numeric_cols

def df_subset(df, list_col_names):
    """Select ``list_col_names`` from ``df`` and return the selection as an np.array."""
    return np.array(df[list_col_names])

def trainlgbm(xtrain, ytrain, n_est=10):
    """Fit an ``lgb.LGBMRegressor`` and return the result of its ``fit`` call.

    Parameters
    ----------
    xtrain, ytrain
        Training features and target, passed to ``fit`` unchanged.
    n_est : int, default 10
        Forwarded to the regressor as ``n_estimators``.
    """
    regressor = lgb.LGBMRegressor(n_estimators=n_est)
    fitted = regressor.fit(xtrain, ytrain)
    return fitted

def score(ytrue, ypred, model=None):
    """Print the mean squared log error and, optionally, feature importances.

    Parameters
    ----------
    ytrue, ypred
        True and predicted target values, passed to
        ``mean_squared_log_error`` unchanged.
    model : optional
        When given, its ``feature_importances_`` attribute is printed as well.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Named 'msle' so the local value does not shadow this function's own name.
    msle = mean_squared_log_error(ytrue, ypred)
    print(f'The MSLE score is: {msle:.4f}\n')
    # Identity check: '!= None' would go through the object's own __ne__.
    if model is not None:
        featureimportances = model.feature_importances_
        print(f'The features importances are {featureimportances}\n')

def make_submission(model, sample, test, file_name='submission.csv'):
    """Write a submission CSV holding the model's predictions for ``test``.

    The predictions are stored in the 'SalePrice' column of a copy of
    ``sample`` (``sample`` itself is left untouched) and the result is written
    to ``file_name`` without the index.
    """
    predictions = model.predict(test)
    sample.assign(SalePrice=predictions).to_csv(file_name, index=False)

def fi_greater_than_N(model, train_set, N):
    """Print and return the column names whose feature importance is > N.

    Parameters
    ----------
    model
        Object exposing ``feature_importances_``.
    train_set : pandas.DataFrame
        Frame whose columns are matched to the importances by position,
        e.g. ``train[train_numeric]``.
        NOTE(review): presumably this must be the same column selection, in
        the same order, that the model was trained on - confirm at call sites.
    N : number
        Threshold; only importances strictly greater than ``N`` are kept.

    Returns
    -------
    list
        Names of the selected columns, in column order.
    """
    # Read both sequences once instead of rebuilding the column list and
    # re-reading the attribute on every loop iteration.
    importances = model.feature_importances_
    col_names = list(train_set.columns)
    fi_N = []
    for i, importance in enumerate(importances):
        if importance > N:
            print(f'{i}: ', col_names[i], importance)
            fi_N.append(col_names[i])
    return fi_N

In [None]:
# Load the data: read_data() returns the train, test and sample-submission frames
train, test, sample = read_data()

In [None]:
# Get the subsets that we want, as NumPy arrays
col_num = columns_numeric(train) # This drops Id and SalePrice columns
xtrain = df_subset(train, col_num)
# df_subset returns an (n_rows, 1) column for a single name; flatten it because
# the regressor's fit() documents y as a 1-D array of shape (n_samples,).
ytrain = df_subset(train, ['SalePrice']).ravel()
# The test features use the same column list so they line up with xtrain.
xtest = df_subset(test, col_num)


In [None]:
# Train the model: n_est is forwarded to LGBMRegressor as n_estimators
model1 = trainlgbm(xtrain, ytrain, n_est=1000)

In [14]:
# Validate
# NOTE(review): the predictions are made on xtrain, the same data model1 was
# fitted on, so this is a training-set score and not an estimate of the error
# on unseen data.
pred_xtrain = model1.predict(xtrain)
score(ytrain, pred_xtrain)


The MSLE score is: 0.0000



In [16]:
# Make the submission file: predict on xtest with model1 and write 'sub1.csv',
# using the sample submission frame as the template
make_submission(model1, sample, xtest, file_name='sub1.csv')