In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/neolen-house-price-prediction/sample_submission.csv
/kaggle/input/neolen-house-price-prediction/data_description.txt
/kaggle/input/neolen-house-price-prediction/train.csv
/kaggle/input/neolen-house-price-prediction/test.csv
/kaggle/input/neolen-house-price-prediction/house-prices-advanced-regression-techniques.zip


In [2]:
# Read csv files. And keep a copy of the original dataframes. tran_df and test_df will be used throughout this notebook for preprocessing and scaling.we need original test dataframe in the end 
or_train_df = pd.read_csv('/kaggle/input/neolen-house-price-prediction/train.csv')
or_test_df = pd.read_csv('/kaggle/input/neolen-house-price-prediction/test.csv')

train_df = or_train_df
test_df = or_test_df

In [3]:
# Dropping 'Id' column as it has no use in training
train_df = train_df.drop(['Id'], axis=1)
test_df = test_df.drop(['Id'], axis=1)

## Replace NaNs
**While replacing we have to make sure to replace it with '0' (character) in case of a string column and 0 (number) if otherwise**

In [4]:
str_columns = []
num_columns = []

def convert_to_str(df):
    for col in df.columns:
        if (df[col].dtype == np.int64 or df[col].dtype == np.float64):
            df[col] = df[col].fillna(0)
            if col not in num_columns:
                num_columns.append(col)
        else:
            df[col] = df[col].fillna('0')
            if col not in str_columns:
                str_columns.append(col)
    return df

train_df = convert_to_str(train_df)
test_df = convert_to_str(test_df)

In [5]:
train_df.head(n=2)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,0,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,0,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500


## Encoding string values in dataframe to numbers

In [6]:
for col in str_columns:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.fit_transform(test_df[col])

In [7]:
# Add a dummy SalePrice column to test dataframe to scale it 
test_df['SalePrice'] = train_df['SalePrice'].iloc[:201]

## Standardization of values

In [8]:
scaler = StandardScaler()
scaler.fit(train_df)

StandardScaler()

In [9]:
train_data = scaler.transform(train_df)
test_data = scaler.transform(test_df)

In [10]:
train_df = pd.DataFrame(train_data, columns=train_df.columns)
test_df = pd.DataFrame(test_data, columns=test_df.columns)

In [11]:
test_df = test_df.drop(['SalePrice'], axis=1)

In [12]:
test_df.head(n=2)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,-0.871918,-0.036322,0.206318,-0.074542,-14.451067,-0.238904,0.756592,0.310085,-0.028194,-0.618896,...,1.208226,-0.056248,-0.051004,-0.451289,-0.203162,-0.087433,0.255175,0.151227,-2.859626,-0.667876
1,0.072887,-0.036322,-1.701485,1.362959,-14.451067,-0.238904,0.048377,0.310085,-0.028194,-1.231949,...,-0.271282,-0.056248,-0.051004,-0.451289,-0.203162,-0.087433,-0.117523,0.90676,-2.859626,-0.667876


In [13]:
train_data = train_df[test_df.columns]
train_labels = train_df['SalePrice']

## XGBoost regressor model 

In [14]:
model = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)

In [15]:
model.fit(train_data, train_labels)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.1, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.100000001, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.7,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [16]:
predictions = model.predict(test_df)

In [17]:
test_df['SalePrice'] = predictions
test_df = scaler.inverse_transform(test_df)
test_df = pd.DataFrame(test_df, columns=train_df.columns)

In [18]:
# Create results dataframe
results = pd.DataFrame()
results['Id'] = or_test_df['Id']
results['SalePrice'] = test_df['SalePrice']

In [19]:
results.head()

Unnamed: 0,Id,SalePrice
0,1260,141110.494646
1,1261,173641.485872
2,1262,120053.15642
3,1263,139107.531678
4,1264,170916.728926


In [20]:
results.to_csv('submissions.csv', index=False)