## 1. Loading Data and Packages

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import warnings
import xgboost as xgb
import lightgbm as lgb
from scipy.stats import skew
from scipy import stats
from scipy.stats.stats import pearsonr
from scipy.stats import norm
from collections import Counter
from sklearn.linear_model import LinearRegression,LassoCV, Ridge, LassoLarsCV,ElasticNetCV
from sklearn.model_selection import GridSearchCV, cross_val_score, learning_curve
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, Normalizer, RobustScaler, LabelEncoder
warnings.filterwarnings('ignore')
sns.set(style='white', context='notebook', palette='deep')
%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
%matplotlib inline

import shap
import xgboost as xgb
from catboost import Pool
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from mlxtend.regressor import StackingRegressor
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error

In [None]:
# Read the training and test splits of the competition dataset
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/massp-inclass1/massp-housing-prices-in-melbourne/train.csv",
        "../input/massp-inclass1/massp-housing-prices-in-melbourne/test.csv",
    )
)

In [None]:
# Shapes while the identifier column is still part of both frames
print("The train data size before dropping Id feature is : {} ".format(train.shape))
print("The test data size before dropping Id feature is : {} ".format(test.shape))

# pop() returns the 'id' column and removes it from the frame in one step:
# the ids are kept aside (test_ID is needed for the submission file) but are
# not used as a model feature.
train_ID = train.pop('id')
test_ID = test.pop('id')

# Shapes once the identifier column has been removed
print("\nThe train data size after dropping id feature is : {} ".format(train.shape))
print("The test data size after dropping id feature is : {} ".format(test.shape))

In [None]:
# Display the column labels that remain in the training frame
train.columns

## 2. Analyzing the TARGET Variable (Price)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the target column
train['Price'].describe()

In [None]:
# Histogram of the raw target with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here because this cell relies on its `fit=` option.
sns.distplot(train['Price'] , fit=norm);

# Estimate the normal parameters of the raw target (same data as the plot above)
(mu, sigma) = norm.fit(train['Price'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Raw string: `\m` and `\s` are not valid Python escape sequences, so the
# mathtext backslashes must not go through escape processing.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('Price distribution')

# Probability plot of the raw target, drawn on a separate figure
fig = plt.figure()
res = stats.probplot(train['Price'], plot=plt)
plt.show()

print("Skewness: %f" % train['Price'].skew())
print("Kurtosis: %f" % train['Price'].kurt())

## 3. Multivariable Analysis

In [None]:
# Names of the object-typed (categorical) columns of the training frame
train.select_dtypes(include=['object']).columns

In [None]:
# Names of the int64 / float64 columns of the training frame (includes the target 'Price')
train.select_dtypes(include=['int64','float64']).columns

In [None]:
# Count the object-typed and the 64-bit numeric columns of the training frame
cat = train.select_dtypes(include=['object']).shape[1]
num = train.select_dtypes(include=['int64','float64']).shape[1]
print('Total Features: ', cat, 'categorical', '+', num, 'numerical', '=', cat+num, 'features')

## 4. Impute Missing Data and Clean Data

In [None]:
def missing_values_table(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` that has at least one null entry, indexed
        by column name, with the columns ``'Missing Values'`` (null count) and
        ``'% of Total Values'`` (null share in percent, rounded to one
        decimal). Rows are sorted by the percentage, largest first. The result
        is empty when nothing is missing, including when ``df`` has no rows.
    """
    # Count the nulls once and derive the percentage from that count
    missing_counts = df.isnull().sum()
    n_rows = len(df)
    if n_rows:
        missing_pct = 100 * missing_counts / n_rows
    else:
        # A zero-row frame has nothing missing; avoid the 0/0 -> NaN division
        missing_pct = missing_counts.astype(float)

    summary = pd.concat(
        [missing_counts, missing_pct],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Filter on the integer count rather than on the float percentage
    with_gaps = summary[summary['Missing Values'] > 0]
    return with_gaps.sort_values('% of Total Values', ascending=False).round(1)

In [None]:
# Remember the split sizes so the combined frame can be cut apart again later
ntrain, ntest = train.shape[0], test.shape[0]
# Raw (untransformed) target values of the training rows
y_train = train['Price'].values
# Stack train on top of test with a fresh 0..n-1 index, then remove the target
# column so that only features remain in the combined frame
all_data = pd.concat((train, test)).reset_index(drop=True).drop(columns=['Price'])
print("Train data size is : {}".format(train.shape))
print("Test data size is : {}".format(test.shape))
print("Combined dataset size is : {}".format(all_data.shape))

In [None]:
# Replace the construction year by its distance to 2021 (i.e. the building age).
# The column keeps the name 'YearBuilt', and running this cell a second time
# converts the values back to years, so execute it exactly once.
all_data = all_data.assign(YearBuilt=2021 - all_data['YearBuilt'])

In [None]:
# Percentage of missing entries per column of the combined frame
all_data_na = all_data.isnull().sum() / len(all_data) * 100
# Keep only the columns that actually have gaps, worst first, capped at 30 rows
all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False).head(30)
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data

### Imputing Missing Values

In [None]:
# Columns of the combined frame that contain at least one missing value
columns_with_gaps = missing_values_table(all_data).index
frame_with_gaps = all_data[columns_with_gaps]

# Split those columns into numeric and non-numeric ones
# (the spelling 'categotical_nan' is kept because the next cell uses that name)
numeric_nan = frame_with_gaps.select_dtypes(include=[np.number])
categotical_nan = frame_with_gaps.select_dtypes(exclude=[np.number])

In [None]:
# Fill every gap with a constant: 0 for numeric columns, the string "None" for
# the non-numeric ones.
# NOTE(review): 0 is also written into 'YearBuilt' (an age at this point) and any
# other numeric column with gaps - confirm that 0 is a sensible stand-in there.
for columns, filler in ((numeric_nan.columns, 0), (categotical_nan.columns, "None")):
    for feature in columns:
        all_data[feature] = all_data[feature].fillna(filler)

In [None]:
# Re-run the missing-value summary on the combined frame to verify the imputation
missing_values_table(all_data)

## 5. Feature Transformation/Engineering

In [None]:
# Types of features in the combined frame.
# categorical_cols is a Series mapping each object-typed column name to its dtype;
# its index holds the column names.
categorical_cols = all_data.dtypes[all_data.dtypes=='object']
# Names of all numeric columns. select_dtypes replaces the former trick of
# adding two Series of dtype objects just to obtain the union of their indexes,
# and also covers numeric columns that are not 64-bit.
numeric_cols = all_data.select_dtypes(include=[np.number]).columns

# Number of distinct values in each categorical column
unique_in_object_cols = all_data[categorical_cols.index].nunique()
print(unique_in_object_cols)

In [None]:
# One-hot encode these three categorical columns
all_data = pd.get_dummies(all_data, columns=['CouncilArea', 'Method', 'Type'])

# Integer-encode the remaining listed columns, one fresh encoder per column.
# NOTE(review): the later "drop cols" cell removes all three of these columns
# again, so this encoding currently does not reach the models.
for name in ('Suburb', 'SellerG', 'Regionname'):
    encoder = LabelEncoder()
    all_data[name] = encoder.fit_transform(all_data[name])

In [None]:
# Display the column labels of the combined frame after encoding
all_data.columns

In [None]:
# Remove the columns that are not passed on to the models
all_data = all_data.drop(['Suburb','Address', 'SellerG', 'Date', 'Regionname'], axis=1)

In [None]:
# Object-typed columns still present in the combined frame after the drop
is_object = all_data.dtypes == 'object'
categorical_cols = all_data.dtypes[is_object]
categorical_cols

### Fixing "skewed" features.
Here, we log-transform the skewed target (`Price`) so that its distribution is closer to normal, which helps our models make more accurate predictions.

In [None]:
# np.log1p applies log(1 + x) to every element of the column
target_log = np.log1p(train["Price"])

# Check the new distribution
sns.distplot(target_log, fit=norm);

# Fit the normal parameters to the transformed target so that the printed and
# legend values describe the distribution that is actually plotted
(mu, sigma) = norm.fit(target_log)
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Raw string: `\m` and `\s` are not valid Python escape sequences
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('Price distribution')

# Probability plot of the transformed target, drawn on a separate figure
fig = plt.figure()
res = stats.probplot(target_log, plot=plt)
plt.show()

# Raw-scale target values; the modelling cells train on target_log instead
y_train = train.Price.values

print("Skewness: %f" % target_log.skew())
print("Kurtosis: %f" % target_log.kurt())

In [None]:
# Cut the combined feature frame back into its training rows (the first ntrain)
# and its test rows (everything after them)
train_set, test_set = all_data.iloc[:ntrain], all_data.iloc[ntrain:]

## 6. Modeling and Predictions

### CatBoost

In [None]:
# Creation of the RMSE metric:
    
def rmse(y, y_pred):
    """Return the root-mean-squared error between ``y`` and ``y_pred``."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def cv_rmse(model, cv=None):
    """Return the cross-validated RMSE of ``model``, one value per fold.

    The model is scored on the notebook-level ``train_set`` features against
    the ``target_log`` target.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``.
    cv : int, cross-validation splitter or None, optional
        Forwarded to ``cross_val_score``. When ``None`` (the default), a
        shuffled 5-fold ``KFold`` with ``random_state=42`` is used.

    Returns
    -------
    numpy.ndarray
        RMSE of each fold.
    """
    if cv is None:
        # The former body read a global `kf` that is not defined anywhere in the
        # visible notebook, so calling the function raised a NameError.
        cv = KFold(n_splits=5, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, train_set, target_log,
                              scoring="neg_mean_squared_error", cv=cv)
    # The scorer returns negated MSE values; flip the sign before the square root
    return np.sqrt(-neg_mse)

In [None]:
# Hold out 20% of the training rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    train_set,
    target_log,
    test_size=0.2,
    random_state=42,
)

# Baseline CatBoost regressor with default constructor arguments, evaluated on
# the validation split while it trains
cat = CatBoostRegressor()
cat_model = cat.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    plot=True,
    verbose=0,
)

In [None]:
# Score the baseline CatBoost model on the held-out validation split
cat_pred = cat_model.predict(X_val)
cat_score = rmse(y_val, cat_pred)
# Validation RMSE; y_val comes from target_log, so this is on the log1p(Price) scale
cat_score

In [None]:
# Performing a randomized search over the grid below to find a good combination of parameters
# NOTE(review): no later visible cell reads `randomized_search_result` or
# `final_model`; the "final" parameters in the next cell are written out by hand.

# Candidate values for each CatBoost hyper-parameter
grid = {'iterations': [1000,6000],
        'learning_rate': [0.05, 0.005, 0.0005],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5, 9]}

# The search runs on the training split only (X_val / y_val are not passed in)
final_model = CatBoostRegressor()
randomized_search_result = final_model.randomized_search(grid,
                                                   X = X_train,
                                                   y= y_train,
                                                   verbose = False,
                                                   plot=True)

In [None]:
# Final CatBoost regressor with hand-picked hyper-parameters.
# NOTE(review): `verbose=200` is requested here while fit() below is called with
# verbose=False - confirm which logging behaviour is intended.
params = dict(
    iterations=6000,
    learning_rate=0.005,
    depth=4,
    l2_leaf_reg=1,
    eval_metric='RMSE',
    early_stopping_rounds=200,
    verbose=200,
    random_seed=42,
)

cat_f = CatBoostRegressor(**params)
cat_model_f = cat_f.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    plot=True,
    verbose=False,
)

# Validation RMSE of the final model (same metric as for the baseline)
catf_pred = cat_model_f.predict(X_val)
catf_score = rmse(y_val, catf_pred)

In [None]:
# Validation RMSE of the final CatBoost model
catf_score

### XGBRegressor

In [None]:
# `xgb` and `train_test_split` are already imported in the first cell of the notebook.

# Hold out 20% of the training rows to monitor training and drive early stopping
x_train, x_valid, y_train, y_valid = train_test_split(train_set, target_log, test_size=0.2, random_state=42)

# Wrap the splits in XGBoost's DMatrix container
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)
d_test = xgb.DMatrix(test_set)

params = {
    # 'reg:squarederror' is the current name of the squared-error objective;
    # the former spelling 'reg:linear' is deprecated and triggers a warning.
    'objective': 'reg:squarederror',
    'booster': 'gbtree',
    'max_depth': 2,
    'eval_metric': 'rmse',
    'learning_rate': 0.1,
    'min_child_weight': 1,
    'subsample': 0.80,
    'colsample_bytree': 0.81,
    'seed': 45,
    'reg_alpha': 1,
    'reg_lambda': 0,
    'gamma': 0,
    'nthread': -1,
}

# Data sets whose metric is reported every `verbose_eval` rounds
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Up to 2000 boosting rounds, stopping early after 300 rounds without improvement
clf = xgb.train(params, d_train, 2000, watchlist, early_stopping_rounds=300, maximize=False, verbose_eval=10)

# Predictions on the log1p(Price) scale.
# NOTE(review): no later visible cell reads `p_test`; the submission is built
# from `y_pred`, which comes from a different model.
p_test = clf.predict(d_test)

In [None]:
# Standardise the features: the scaler is fitted on the training rows only and
# then applied to both the training and the test rows.
# (StandardScaler is already imported in the first cell of the notebook.)
# NOTE: this rebinds X_train to the scaled *full* training set, replacing the
# 80% split created in the CatBoost section.
scalar = StandardScaler()
scalar.fit(train_set)
X_train = scalar.transform(train_set)
X_test = scalar.transform(test_set)

In [None]:
# XGBRegressor with default constructor arguments, fitted on the scaled full
# training set against the log-transformed target. This rebinds `clf`, which
# previously held the booster returned by xgb.train.
clf = xgb.XGBRegressor()
clf.fit(X_train, target_log)

In [None]:
# Predict for the scaled test features with `clf`; the preceding cell fits it on
# target_log, so these predictions are on the log1p(Price) scale
y_pred = clf.predict(X_test)

### Submission

In [None]:
# Build the submission: test ids next to the predictions mapped back to the
# price scale (np.expm1 inverts the np.log1p applied to the target).
# NOTE(review): the input files name this column 'id' while the submission uses
# 'Id' - confirm which header the competition expects.
sub = pd.DataFrame({'Id': test_ID, 'Price': np.expm1(y_pred)})
sub.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame
sub.head()