In [1]:
import random
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import torch,torchvision
from torch.nn import *
from torch.optim import *
# Preproccessing
from sklearn.preprocessing import (
    StandardScaler,
    RobustScaler,
    MinMaxScaler,
    MaxAbsScaler,
    OneHotEncoder,
    Normalizer,
    Binarizer
)
# Decomposition
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
# Feature Selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
# Model Eval
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,accuracy_score,precision_score,f1_score,recall_score
# Models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor,VotingRegressor,BaggingRegressor,RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from catboost import CatBoost,CatBoostRegressor
from xgboost import XGBRegressor,XGBRFRegressor
from flaml import AutoML
# Other
import pickle
import wandb

PROJECT_NAME = 'House-Prices-Advanced-Regression-Techniques-V11'
device = 'cuda'

In [2]:
torch.manual_seed(469)
np.random.seed(469)
random.seed(469)

In [3]:
data = pd.read_csv('./data/train.csv')

In [4]:
def valid(model,X,y,valid=False):
    preds = model.predict(X)
    if valid:
        results = {
            'val mean_absolute_error':mean_absolute_error(y_true=y,y_pred=preds),
            'val mean_squared_error':mean_squared_error(y_true=y,y_pred=preds),
        }
    else:
        results = {
            'mean_absolute_error':mean_absolute_error(y_true=y,y_pred=preds),
            'mean_squared_error':mean_squared_error(y_true=y,y_pred=preds),
        }
    return results

In [5]:
def fe(data,col):
    max_num = data[col].quantile(0.95)
    min_num = data[col].quantile(0.05)
    data = data[data[col] > max_num]
    data = data[data[col] < min_num]
    return data

In [6]:
def object_to_int(data,col):
    data_col = data[col].to_dict()
    idx = -1
    labels_and_int_index = {}
    for data_col_vals in data_col.values():
        if data_col_vals not in labels_and_int_index.keys():
            idx += 1
            labels_and_int_index[data_col_vals] = idx
    new_data = []
    for data_col_vals in data_col.values():
        new_data.append(labels_and_int_index[data_col_vals])
    data[col] = new_data
    return data,idx,labels_and_int_index,new_data

In [7]:
def make_submission(model,name):
    data = pd.read_csv('./data/test.csv')
    ids = data['Id']
    for col,missing in zip(list(data.columns),data.isna().sum()):
        if missing > 0:
            try:
                data[col] = data[col].fillna(data[col].median())
            except:
                i = data[col].value_counts().to_dict()
                data[col] = data[col].fillna(i[list(i.keys())[0]])
    one_hot_cols = []
    for col,dtype in zip(list(data.columns),data.dtypes):
        if dtype == object:
            try:
                data[col].astype(float)
            except:
                one_hot_cols.append(col)
    data = mct.transform(data.astype(str))
    data = data.astype(float)
    data = data.toarray()
    preds = model.predict(data)
    df = pd.DataFrame({'Id':ids,'SalePrice':preds})
    df.to_csv(f'./submissions/{name}.csv',index=False)

In [8]:
def train(model,X_train,X_test,y_train,y_test,name):
    wandb.init(project=PROJECT_NAME,name=name)
    model.fit(X_train,y_train)
    wandb.log(valid(model,X_train,y_train,True))
    wandb.log(valid(model,X_train,y_train,False))
    make_submission(model,name)
    return model

In [9]:
for col,missing in zip(list(data.columns),data.isna().sum()):
    if missing > 0:
        try:
            data[col] = data[col].fillna(data[col].median())
        except:
            i = data[col].value_counts().to_dict()
            data[col] = data[col].fillna(i[list(i.keys())[1]])

In [10]:
one_hot_cols = []

In [11]:
for col,dtype in zip(list(data.columns),data.dtypes):
    if dtype == object:
        try:
            data[col].astype(float)
        except:
            one_hot_cols.append(col)

In [12]:
X = data.drop('SalePrice',axis=1)
y = data['SalePrice']

In [13]:
mct = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),one_hot_cols),
    remainder='passthrough'
)
X = mct.fit_transform(X.astype(str))
X = X.toarray()

In [14]:
X.shape

(1460, 305)

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.0625)

In [16]:
# train(GradientBoostingRegressor(),X_train,X_test,y_train,y_test,'baseline')

In [17]:
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA

In [18]:
X_train_old = X_train.copy()

In [19]:
# pca = PCA()
# X_train =pca.fit_transform(X_train)
# train(GradientBoostingRegressor(),X_train,X_test,y_train,y_test,'PCA')

In [20]:
X_train = X_train_old.copy()

In [21]:
# pca = KernelPCA()
# X_train =pca.fit_transform(X_train)
# train(GradientBoostingRegressor(),X_train,X_test,y_train,y_test,'KernelPCA')

In [22]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel

In [23]:
# X_train = X_train_old.copy()
# pca = SelectKBest()
# X_train =pca.fit_transform(X_train,y_train)
# train(GradientBoostingRegressor(),X_train,X_test,y_train,y_test,'SelectKBest')

In [24]:
# X_train = X_train_old.copy()
# pca = RFECV(GradientBoostingRegressor(),step=1, cv=1)
# X_train =pca.fit_transform(X_train,y_train)
# train(GradientBoostingRegressor(),X_train,X_test,y_train,y_test,'RFECV')

In [25]:
# X_train = X_train_old.copy()
# pca = SelectFromModel(GradientBoostingRegressor())
# X_train =pca.fit_transform(X_train,y_train)
# train(GradientBoostingRegressor(),X_train,X_test,y_train,y_test,'SelectFromModel')

In [26]:
from sklearn.preprocessing import (
    StandardScaler,
    RobustScaler,
    MinMaxScaler,
    MaxAbsScaler,
    OneHotEncoder,
    Normalizer,
    Binarizer
)

In [27]:
preprocessings = [    StandardScaler,
    RobustScaler,
    MinMaxScaler,
    MaxAbsScaler,
    OneHotEncoder,
    Normalizer,
    Binarizer]

In [29]:
# for preprocessing in preprocessings:
#     X = data.drop('SalePrice',axis=1)
#     y = data['SalePrice']
#     mct = make_column_transformer(
#         (preprocessing(),one_hot_cols),
#         remainder='passthrough'
#     )
#     X = mct.fit_transform(X.astype(str))
#     X = X.toarray()
#     X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.0625)
#     train(GradientBoostingRegressor(),X_train,X_test,y_train,y_test,f'{preprocessing}-preprocessing')

In [30]:
models = [
    ['KNeighborsRegressor',KNeighborsRegressor],
    ['LogisticRegression',LogisticRegression],
    ['LogisticRegressionCV',LogisticRegressionCV],
    ['DecisionTreeRegressor',DecisionTreeRegressor],
    ['GradientBoostingRegressor',GradientBoostingRegressor],
    ['AdaBoostRegressor',AdaBoostRegressor],
    ['RandomForestRegressor',RandomForestRegressor],
    ['BaggingRegressor',BaggingRegressor],
    ['GaussianNB',GaussianNB],
    ['ExtraTreesRegressor',ExtraTreesRegressor],
    ['CatBoost',CatBoost],
    ['CatBoostRegressor',CatBoostRegressor],
    ['XGBRegressor',XGBRegressor],
    ['XGBRFRegressor',XGBRFRegressor],
    ['ExtraTreesRegressor',ExtraTreesRegressor],
]

In [31]:
for model in models:
    train(model[1](),X_train,X_test,y_train,y_test,f'{model[0]}-model')

[34m[1mwandb[0m: Currently logged in as: [33mranuga-d[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


AttributeError: 'ColumnTransformer' object has no attribute 'transformers_'