In [398]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/test.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/test.csv


In [399]:
# Load the training set, the test set and the sample submission in one pass.
train, test, sample = map(pd.read_csv, (
    '/kaggle/input/home-data-for-ml-course/train.csv',
    '/kaggle/input/home-data-for-ml-course/test.csv',
    '/kaggle/input/home-data-for-ml-course/sample_submission.csv',
))

In [400]:
# Names of all numeric columns in the training frame.
# 'number' matches every integer/float width. The previous ['int', 'float']
# selector resolves to the platform's default widths, so 64-bit integer columns
# are silently skipped on platforms where the default integer is 32-bit.
num_cols = train.select_dtypes(include='number').columns
num_cols

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [401]:
# Numeric columns used as model inputs. The order is significant: it fixes the
# column order of every frame that is later indexed with this list.
numeric_features = [
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    '2ndFlrSF',
    'TotalBsmtSF',
    'YearBuilt',
    'BsmtFinSF1',
    'Fireplaces',
    'LotArea',
    'OverallCond',
    'ScreenPorch',
    'YearRemodAdd',
    'HalfBath',
    '1stFlrSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'BedroomAbvGr',
]

In [402]:
# train['TotalBsmtSF'].value_counts().sort_index() , 'BedroomAbvGr'

In [403]:
# train['BedroomAbvGr'].value_counts().sum()

In [404]:
# Default upper bounds: a row is an outlier when any listed column exceeds its bound.
_OUTLIER_UPPER_BOUNDS = {
    'GrLivArea': 4500,
    'TotalBsmtSF': 2500,
    'LotArea': 50000,
    'BsmtFinSF1': 2500,
    '1stFlrSF': 3000,
    # 'ScreenPorch': 450,  # intentionally left disabled
}


def remove_outliers(X, y, thresholds=None):
    """Drop rows whose feature values exceed per-column upper bounds.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Must contain every column named in ``thresholds``.
    y : pandas.Series
        Target values, joined to ``X`` on the index.
    thresholds : dict, optional
        Mapping ``column -> upper bound``. A row is dropped when the value in
        any listed column is strictly greater than its bound. Defaults to
        ``_OUTLIER_UPPER_BOUNDS``.

    Returns
    -------
    (new_X, new_y)
        The surviving rows of ``X`` and ``y``, with their original index labels.
        The inputs are not modified.

    Raises
    ------
    KeyError
        If a column named in the thresholds is missing from ``X``.
    """
    limits = _OUTLIER_UPPER_BOUNDS if thresholds is None else thresholds

    # Keep working when the target Series is unnamed instead of failing on a
    # hard-coded column lookup.
    target = y.name if y.name is not None else 'SalePrice'
    data = pd.concat([X, y.rename(target)], axis=1)

    # Build one mask for all bounds rather than rebuilding the frame per rule.
    # Comparisons against NaN are False, so rows with missing values are kept.
    is_outlier = pd.Series(False, index=data.index)
    for column, upper in limits.items():
        is_outlier |= data[column] > upper

    kept = data.loc[~is_outlier]
    new_y = kept[target]
    new_X = kept.drop(columns=[target])
    return new_X, new_y

In [405]:
# Categorical columns that are excluded from the model inputs.
bad_cat_features = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']

# Start from every object-dtype column of the training frame, then strip the
# excluded ones. list.remove raises if an excluded column is not present.
other_cat_features = list(train.select_dtypes(include='O').columns)
for unwanted in bad_cat_features:
    other_cat_features.remove(unwanted)

In [406]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import cross_val_score

def score_data(X, y, random_state=42):
    """Cross-validate three baseline regressors and report their mean absolute error.

    Each model is evaluated with 5-fold cross-validation using the
    'neg_mean_absolute_error' scorer; the sign is flipped so that the reported
    value is a positive error (lower is better).

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn estimators.
    y : target values aligned with ``X``.
    random_state : int, default 42
        Seed passed to the random forest and gradient boosting models.

    Returns
    -------
    dict
        Mapping of model label to its mean cross-validated MAE. The same
        values are also printed, in the original report format.
    """
    models = {
        'Linear regression': LinearRegression(),
        'Random forest': RandomForestRegressor(random_state=random_state),
        'Gradient boosting': GradientBoostingRegressor(random_state=random_state),
    }

    scores = {}
    for label, estimator in models.items():
        fold_scores = cross_val_score(estimator, X, y, cv=5,
                                      scoring='neg_mean_absolute_error', n_jobs=-1)
        scores[label] = -1 * fold_scores.mean()

    report_lines = [f'{label}: {mae}' for label, mae in scores.items()]
    print('Data score:\n' + '\n'.join(report_lines))
    return scores

In [407]:
# numeric_features

In [408]:
from sklearn.impute import SimpleImputer

# Missing values are replaced by each column's most frequent value.
imputer = SimpleImputer(strategy='most_frequent')

# Model inputs: the numeric columns first, followed by the categorical ones.
train_data_for_imput, test_data_for_imput = (
    pd.concat([frame[numeric_features], frame[other_cat_features]], axis=1)
    for frame in (train, test)
)

# The fill values are learned from the training data only and reused on the test data.
train_filled = imputer.fit_transform(train_data_for_imput)
test_filled = imputer.transform(test_data_for_imput)

# Wrap the imputer output back into frames with the original labels.
imputed_train_data = pd.DataFrame(train_filled,
                                  index=train.index,
                                  columns=train_data_for_imput.columns)
imputed_test_data = pd.DataFrame(test_filled,
                                 index=test.index,
                                 columns=test_data_for_imput.columns)

In [409]:
from sklearn.preprocessing import OneHotEncoder

# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was later removed. Try the current name
# first and fall back for older installations, which reject it with TypeError.
# handle_unknown='ignore' encodes a test-set category that never appeared in
# training as all zeros instead of aborting the transform.
try:
    OHE = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    OHE = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit the encoder on the training categories only, then apply it to both sets.
train_ohe_df = pd.DataFrame(data=OHE.fit_transform(imputed_train_data[other_cat_features]),
                            index=imputed_train_data.index)
test_ohe_df = pd.DataFrame(OHE.transform(imputed_test_data[other_cat_features]),
                           index=imputed_test_data.index)

# The encoded frames get integer column labels by default. scikit-learn warns
# or errors (depending on version) when a frame mixes string and non-string
# column names, so convert the labels to strings.
train_ohe_df.columns = train_ohe_df.columns.map(str)
test_ohe_df.columns = test_ohe_df.columns.map(str)

In [410]:
# Cast the imputed numeric columns to int32 before joining them with the
# one-hot encoded columns.
train_numeric = imputed_train_data[numeric_features].astype('int32')
test_numeric = imputed_test_data[numeric_features].astype('int32')

# Outlier rows are dropped from the training data only; the test matrix keeps every row.
train_X, train_y = remove_outliers(
    pd.concat([train_numeric, train_ohe_df], axis=1),
    train['SalePrice'],
)

test_X = pd.concat([test_numeric, test_ohe_df], axis=1)

# score_data(train_X, train_y)

In [411]:
from sklearn.preprocessing import RobustScaler

# Instantiate a RobustScaler object
scaler = RobustScaler()

# Fit the scaler on the training numeric columns only, then apply the same
# fitted transformation to the test numeric columns.
train_numeric_scaled = scaler.fit_transform(train_X[numeric_features])
test_numeric_scaled = scaler.transform(test_X[numeric_features])

# Write the scaled values back over the numeric columns of both matrices.
train_X[numeric_features] = train_numeric_scaled
test_X[numeric_features] = test_numeric_scaled

In [412]:
# Display the dimensions of the training feature matrix as (rows, columns).
train_X.shape

(1443, 252)

In [413]:
# from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt

# # create a PCA object with desired number of components
# pca = PCA(n_components=len(train_X.columns))

# # fit the PCA model to the data
# pca.fit(train_X)

# # get the eigenvalues
# eigenvalues = pca.explained_variance_

# # create a scree plot
# plt.plot(range(1, len(eigenvalues)+1), eigenvalues, 'o-', linewidth=2)
# plt.xlabel('Principal Component')
# plt.ylabel('Eigenvalue')
# plt.title('Scree Plot')
# plt.show()

In [414]:
# from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt

# # create a PCA object with maximum number of components
# pca = PCA(n_components=len(train_X.columns))

# # fit the PCA model to the data
# pca.fit(train_X)

# # get the eigenvalues
# eigenvalues = pca.explained_variance_

# # calculate the proportion of variance explained by each component
# variance_proportions = eigenvalues / sum(eigenvalues)

# # create a scree plot
# plt.plot(range(1, len(eigenvalues)+1), variance_proportions, 'o-', linewidth=2)
# plt.xlabel('Principal Component')
# plt.ylabel('Proportion of Variance Explained')
# plt.title('Scree Plot')
# plt.show()

# # calculate the cumulative proportion of variance explained
# cumulative_variance = np.cumsum(variance_proportions)

# # find the number of components that explain at least 95% of the variance
# n_components = np.argmax(cumulative_variance >= 0.95) + 1
# print("Number of components to explain at least 95% of the variance:", n_components)


In [415]:
# model = GradientBoostingRegressor(learning_rate = 0.1, max_depth =  3, n_estimators = 200, random_state=42)
# model.fit(train_X, train_y)

In [416]:
# # Create a correlation matrix
# corr_matrix = train_X.corr().abs()

# # Set a threshold for dropping variables
# threshold = 0.8

# # Find variables with correlation greater than the threshold
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
# to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

# # Drop the variables
# train_X.drop(to_drop, axis=1, inplace=True)
# test_X.drop(to_drop, axis=1, inplace=True)

In [417]:
# to_drop

In [418]:
# import pandas as pd
# import numpy as np

# # Concatenate the training features and outcome variable into a single DataFrame
# train_df = pd.concat([train_X, train_y], axis=1)

# # Set a threshold for dropping variables
# threshold = 0.1

# # Compute the correlation matrix with the outcome variable
# corr_matrix = train_df.corr().abs()
# corr_with_outcome = corr_matrix.iloc[:-1,-1]

# # Find variables with correlation less than the threshold
# to_drop = corr_with_outcome[corr_with_outcome < threshold].index

# # Drop the variables from the training features
# train_X.drop(to_drop, axis=1, inplace=True)
# test_X.drop(to_drop, axis=1, inplace=True)

In [423]:
# from xgboost import XGBRegressor
# from sklearn.metrics import mean_squared_error

# # Create an instance of XGBRegressor with desired hyperparameters
# model = XGBRegressor()

# # Fit the model to the training data
# model.fit(train_X, train_y)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [420]:
from catboost import CatBoostRegressor

# Hyperparameters for the CatBoost regressor, kept together for easy tuning.
catboost_params = {
    'n_estimators': 10000,
    'loss_function': 'RMSE',
    'random_state': 0,
    'max_depth': 5,
    'verbose': False,
    'subsample': 0.5,
    'random_strength': 1.5,
}

# Build the regressor and fit it on the prepared training matrix.
model = CatBoostRegressor(**catboost_params)
model.fit(train_X, train_y)

In [421]:
# Predict the target for every row of the test feature matrix with the fitted model.
y_pred = model.predict(test_X)

In [422]:
# Pair each test Id with its predicted SalePrice and write the result to
# submission.csv without the frame's index column.
output = pd.DataFrame({'Id': test['Id']}).assign(SalePrice=y_pred)
output.to_csv('submission.csv', index=False)