In [1]:
import random
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import torch,torchvision
from torch.nn import *
from torch.optim import *
# Preproccessing
from sklearn.preprocessing import (
    StandardScaler,
    RobustScaler,
    MinMaxScaler,
    MaxAbsScaler,
    OneHotEncoder,
    Normalizer,
    Binarizer
)
# Decomposition
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
# Feature Selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
# Model Eval
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,accuracy_score,precision_score,f1_score,recall_score
# Models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor,VotingRegressor,BaggingRegressor,RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from catboost import CatBoost,CatBoostRegressor
from xgboost import XGBRegressor,XGBRFRegressor
from flaml import AutoML
# Other
import pickle
import wandb

PROJECT_NAME = 'House-Prices-Advanced-Regression-Techniques-V9'
device = 'cuda'
np.random.seed(21)
random.seed(21)
torch.manual_seed(21)

<torch._C.Generator at 0x7fedfb65b890>

In [2]:
def make_submission(model):
    pass

In [3]:
def valid(model,X,y,valid=False):
    preds = model.predict(X)
    if valid:
        results = {
            'val mean_absolute_error':mean_absolute_error(y_true=y,y_pred=preds),
            'val mean_squared_error':mean_squared_error(y_true=y,y_pred=preds),
        }
    else:
        results = {
            'mean_absolute_error':mean_absolute_error(y_true=y,y_pred=preds),
            'mean_squared_error':mean_squared_error(y_true=y,y_pred=preds),
        }
    return results

In [4]:
def train(model,X_train,X_test,y_train,y_test,name):
    wandb.init(project=PROJECT_NAME,name=name)
    model.fit(X_train,y_train)
    wandb.log(valid(model,X_train,y_train))
    wandb.log(valid(model,X_test,y_test,True))
    make_submission(model)
    return model

In [5]:
def object_to_int(data,col):
    data_col = data[col].to_dict()
    idx = -1
    labels_and_int_index = {}
    for data_col_vals in data_col.values():
        if data_col_vals not in labels_and_int_index.keys():
            idx += 1
            labels_and_int_index[data_col_vals] = idx
    new_data = []
    for data_col_vals in data_col.values():
        new_data.append(labels_and_int_index[data_col_vals])
    data[col] = new_data
    return data,idx,labels_and_int_index,new_data

In [6]:
def fe(data,col,quantile_max_num=0.99,quantile_min_num=0.05):
    max_num = data[col].quantile(quantile_max_num)
    min_num = data[col].quantile(quantile_min_num)
    print(max_num)
    print(min_num)
    data = data[data[col] < max_num]
    data = data[data[col] > min_num]
    return data

In [7]:
def decomposition(X,pca=False,kernal_pca=False):
    if pca:
        pca = PCA()
        X = pca.fit_transform(X)
    if kernal_pca:
        kernal_pca = KernelPCA()
        X = kernal_pca.fit_transform(X)
    return X

In [8]:
def feature_selection_prep_data(model,X,y,select_from_model=False,variance_threshold=False,select_k_best=False,rfecv=False):
    if select_from_model:
        transform = SelectFromModel(estimator=model.fit(X, y))
        X = transform.transform(X)
    if variance_threshold:
        transform = VarianceThreshold()
        X = transform.fit_transform(X)
    if select_k_best:
        X = SelectKBest(chi2, k='all').fit_transform(X, y)
    if rfecv:
        X = RFECV(model, step=1, cv=5).fit(X, y)
        X = X.transform(X)
    return X

In [9]:
def prep_data(X,transformer):
    mct = make_column_transformer(
        (transformer,list(X.columns)),
        remainder='passthrough'
    )
    X = mct.fit_transform(X)
    return X

In [10]:
data = pd.read_csv('./data/train.csv')

In [11]:
preproccessings = [StandardScaler,RobustScaler,MinMaxScaler,MaxAbsScaler,OneHotEncoder,Normalizer,Binarizer]

In [12]:
models = [
    ['KNeighborsRegressor',KNeighborsRegressor],
    ['LogisticRegression',LogisticRegression],
    ['LogisticRegressionCV',LogisticRegressionCV],
    ['DecisionTreeRegressor',DecisionTreeRegressor],
    ['GradientBoostingRegressor',GradientBoostingRegressor],
    ['AdaBoostRegressor',AdaBoostRegressor],
    ['RandomForestRegressor',RandomForestRegressor],
    ['BaggingRegressor',BaggingRegressor],
    ['GaussianNB',GaussianNB],
    ['ExtraTreesRegressor',ExtraTreesRegressor],
    ['CatBoost',CatBoost],
    ['CatBoostRegressor',CatBoostRegressor],
    ['XGBRegressor',XGBRegressor],
    ['XGBRFRegressor',XGBRFRegressor],
    ['ExtraTreesRegressor',ExtraTreesRegressor],
]

In [13]:
X = data.drop('SalePrice',axis=1)
y = data['SalePrice']

In [14]:
str_cols = []
int_cols = []

In [15]:
for col_name,num_of_missing_rows,dtype in zip(list(X.columns),X.isna().sum(),X.dtypes):
    if dtype == object:
        str_cols.append(col_name)
    else:
        int_cols.append(col_name)

In [16]:
for str_col in str_cols:
    X,idx,labels_and_int_index,new_data = object_to_int(X,str_col)

In [17]:
X.head()

   Id  MSSubClass  MSZoning  LotFrontage  LotArea  Street  Alley  LotShape  \
0   1          60         0         65.0     8450       0      0         0   
1   2          20         0         80.0     9600       0      0         0   
2   3          60         0         68.0    11250       0      0         1   
3   4          70         0         60.0     9550       0      0         1   
4   5          60         0         84.0    14260       0      0         1   

   LandContour  Utilities  ...  ScreenPorch  PoolArea  PoolQC  Fence  \
0            0          0  ...            0         0       0      0   
1            0          0  ...            0         0       0      0   
2            0          0  ...            0         0       0      0   
3            0          0  ...            0         0       0      0   
4            0          0  ...            0         0       0      0   

   MiscFeature  MiscVal  MoSold  YrSold  SaleType  SaleCondition  
0            0        0       2

In [18]:
nan_cols = []
for col_name,num_of_missing_rows,dtype in zip(list(X.columns),X.isna().sum(),X.dtypes):
    if num_of_missing_rows > 0:
        nan_cols.append(col_name)

In [19]:
for nan_col in nan_cols:
    X[nan_col].fillna(X[nan_col].median(),inplace=True)

In [20]:
nan_cols = []
for col_name,num_of_missing_rows,dtype in zip(list(X.columns),X.isna().sum(),X.dtypes):
    if num_of_missing_rows > 0:
        nan_cols.append(col_name)

In [21]:
# train(GradientBoostingRegressor(),X,X,y,y,name='baseline-without-fe')

In [22]:
X_old = X.copy()

In [23]:
# for col_name in list(X.columns):
#     try:
#         X = X_old.copy()
#         X = fe(X,col_name)
#         train(GradientBoostingRegressor(),X,X,y,y,name=f'baseline-with-fe-{col_name}')
#     except:
#         print('*'*50)
#         print('*'*50)

In [24]:
# X = X_old.copy()

In [25]:
X_corr = X_old.corr()

In [26]:
keep_cols = []

In [27]:
# for key,val in zip(X_corr.to_dict().keys(),X_corr.to_dict().values()):
#     for val_key,val_vals in zip(val.keys(),val.values()):
#         if val_key == key:
#             pass
#         else:
#             if val_vals > 0.0:
#                 if val_key not in keep_cols:
#                     print(val_vals)
#                     keep_cols.append(val_key)

In [28]:
# fig,ax = plt.subplots(figsize=(25,12))
# ax = sns.heatmap(X_corr,annot=True,linewidths=0.5,fmt='.2f',cmap='YlGnBu')

In [29]:
# keep_cols

In [30]:
# len(keep_cols)

In [31]:
X.head()

   Id  MSSubClass  MSZoning  LotFrontage  LotArea  Street  Alley  LotShape  \
0   1          60         0         65.0     8450       0      0         0   
1   2          20         0         80.0     9600       0      0         0   
2   3          60         0         68.0    11250       0      0         1   
3   4          70         0         60.0     9550       0      0         1   
4   5          60         0         84.0    14260       0      0         1   

   LandContour  Utilities  ...  ScreenPorch  PoolArea  PoolQC  Fence  \
0            0          0  ...            0         0       0      0   
1            0          0  ...            0         0       0      0   
2            0          0  ...            0         0       0      0   
3            0          0  ...            0         0       0      0   
4            0          0  ...            0         0       0      0   

   MiscFeature  MiscVal  MoSold  YrSold  SaleType  SaleCondition  
0            0        0       2

In [32]:
X_old = X.copy()

In [33]:
for preproccessing in preproccessings:
    X = X_old.copy()
    preproccessing = preproccessing()
    X = preproccessing.fit_transform(X)
    train(GradientBoostingRegressor(),X,X,y,y,name=f'{preproccessing}-preproccessing')

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
mean_absolute_error,13632.14766
mean_squared_error,343084169.06336
_runtime,7.0
_timestamp,1629881536.0
_step,1.0
val mean_absolute_error,13632.14766
val mean_squared_error,343084169.06336


0,1
mean_absolute_error,▁
mean_squared_error,▁
_runtime,▁▁
_timestamp,▁▁
_step,▁█
val mean_absolute_error,▁
val mean_squared_error,▁


In [34]:
X = X_old.copy()

In [35]:
X = X_old.copy()

In [36]:
X = decomposition(True,False)
train(GradientBoostingRegressor(),X,X,y,y,name=f'PCA=True-kernal_pca=False-decomposition')