In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if filename == 'train.csv':
            data = pd.read_csv(os.path.join(dirname, filename))
        elif filename == 'test.csv':
            test_data = pd.read_csv(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Check high correlations between variables
for column in data:
    dtype = data[column].dtype
    if dtype in ['int64', 'float64']:
        for column2 in data:
            dtype2 = data[column2].dtype
            if dtype2 == dtype:
                corr = data[column].corr(data[column2])
                if corr > 0.8 and corr < 0.999:
                    print('The correlation between {} and {} is {}!'.format(column, column2, corr))

The correlation between TotalBsmtSF and 1stFlrSF is 0.8195299750050337!
The correlation between 1stFlrSF and TotalBsmtSF is 0.8195299750050337!
The correlation between GrLivArea and TotRmsAbvGrd is 0.8254893743088426!
The correlation between TotRmsAbvGrd and GrLivArea is 0.8254893743088427!
The correlation between GarageCars and GarageArea is 0.8824754142814625!
The correlation between GarageArea and GarageCars is 0.8824754142814625!


In [3]:
# Remove attributes with high correlation to some other attribute (they describe the same thing)
# and attributes that have high chance of being similar (Exterior2nd in comparison to Exterior1st , Condition2 in comparison to Condition1)
data = data.drop(['Id', 'PoolQC', 'MiscFeature', 'Condition2', 'Exterior2nd', '1stFlrSF', 'TotRmsAbvGrd', 'GarageArea', 'Utilities', 'Street', 'Alley'], axis=1)
train_y = data['SalePrice']
train_x = data.drop('SalePrice', axis=1)

In [4]:
from sklearn import preprocessing
# Helper function to encode categorical labels
def encode_labels(dataset):
    label_encoder = preprocessing.LabelEncoder()
    for column in dataset:
        dtype = dataset[column].dtype
        if dtype not in ['int64', 'float64']:
            dataset[column] = label_encoder.fit_transform(dataset[column])
    return dataset

In [5]:
# Standardize and normalize the data
from sklearn.preprocessing import StandardScaler
train_x = encode_labels(train_x)
test_ids = test_data['Id']
common_cols = [col for col in set(data.columns).intersection(test_data.columns)]
test_data = test_data[common_cols]
train_x = train_x[common_cols]
sc = StandardScaler()
train_x = sc.fit_transform(train_x)
test_data = encode_labels(test_data)
test_data = sc.transform(test_data)  

In [6]:
# Training with XGBoost using stratified K fold and randomized search cross-validation
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
# split data into train and test sets
params = {
    'n_estimators': [100, 400, 800],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.05, 0.1, 0.20],
    'min_child_weight': [1, 10, 100]
    }

xgb = XGBRegressor(learning_rate=0.02, objective='reg:squarederror',
                    silent=True, nthread=1)
folds = 3
param_comb = 16

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)


random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=8, cv=skf.split(train_x,train_y), verbose=3, random_state=1001 )
random_search.fit(train_x, train_y)

Fitting 3 folds for each of 16 candidates, totalling 48 fits




Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7f0b2da690d0>,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          callbacks=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          early_stopping_rounds=None,
                                          enable_categorical=False,
                                          eval_metric=None, gamma=None,
                                          gpu_id=None, grow_policy=None,
                                          importance_type=None,
                                          interaction_co...
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          

In [7]:
# Perform predictions on test data
model = random_search
preds = model.predict(test_data)
submission = pd.DataFrame()
submission['Id'] = test_ids
submission['SalePrice'] = preds

In [8]:
# Save the submission file
from pathlib import Path  
filepath = Path('submission.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv('submission.csv', sep=',', index=False)