In [1]:
import random
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import torch,torchvision
from torch.nn import *
from torch.optim import *
from sklearn.preprocessing import (
    StandardScaler,
    RobustScaler,
    MinMaxScaler,
    MaxAbsScaler,
    OneHotEncoder,
    LabelEncoder,
    Normalizer,
)
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor,VotingRegressor,BaggingRegressor,RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,accuracy_score,precision_score,f1_score,recall_score
import pickle
import wandb
from sklearn.compose import make_column_transformer
from flaml import AutoML
from sklearn.model_selection import GridSearchCV
PROJECT_NAME = 'House-Prices-Advanced-Regression-Techniques-V7'
device = 'cuda'
np.random.seed(21)
random.seed(21)
torch.manual_seed(21)

<torch._C.Generator at 0x7f5d784ab890>

In [2]:
def make_submission(model,name):
    data = pd.read_csv('./data/test.csv')
    ids = data['Id']
    preds = model.predict(data)
    df = pd.DataFrame({'Id':ids,'SalePrice':preds})
    df.to_csv(f'./submisssions/{name}.csv',index=False)
    return df
def validate(model,X,y,val=False):
    preds = model.predict(X)
    if val:
        result = {
            'MAE':mean_absolute_error(preds,y),
            'MSE':mean_squared_error(preds,y),
            'Accuracy':accuracy_score(preds,y),
        }
    else:
        result = {
            'Val MAE':mean_absolute_error(preds,y),
            'Val MSE':mean_squared_error(preds,y),
            'Val Accuracy':accuracy_score(preds,y),
        }
    return result
def train(model,X_train,X_test,y_train,y_test,name):
    wandb.init(project=PROJECT_NAME,name=name)
    try:
        model.fit(X_train,y_train)
    except:
        pass
    wandb.log(validate(model,X_train,y_train))
    wandb.log(validate(model,X_test,y_test,True))
    pickle.dump(model,open(f'./models/model-{name}.pkl','wb'))
    make_submission(model,name)

In [3]:
data = pd.read_csv('./data/train.csv')

In [4]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [8]:
one_hot_encoding = []

In [9]:
for col_name,dtype in zip(data.columns,data.dtypes):
    if dtype == 'object':
        one_hot_encoding.append(col_name)

In [11]:
len(one_hot_encoding)

43

In [15]:
one_hot_encoding_data = data[one_hot_encoding]

In [25]:
def object_to_int(data,col):
    data_col = data[col].to_dict()
    idx = -1
    labels_and_int_index = {}
    for data_col_vals in data_col.values():
        if data_col_vals not in labels_and_int_index.keys():
            idx += 1
            labels_and_int_index[data_col_vals] = idx
    new_data = []
    for data_col_vals in data_col.values():
        new_data.append(labels_and_int_index[data_col_vals])
    data[col] = new_data
    return data,idx,labels_and_int_index,new_data

In [27]:
labels_and_int_indexs = {}

In [28]:
for one_hot_encoding_col in one_hot_encoding:
    data,idx,labels_and_int_index,new_data = object_to_int(data,one_hot_encoding_col)
    labels_and_int_index[one_hot_encoding_col] = labels_and_int_index

In [34]:
for num_of_samples,col,dtype in zip(data.isna().sum(),data.columns,data.dtypes):
    if dtype == 'object':
        pass
    else:
        if num_of_samples > 0:
            data[col].fillna(data[col].median())

In [36]:
data.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [37]:
for num_of_samples,col,dtype in zip(data.isna().sum(),data.columns,data.dtypes):
    print(num_of_samples)

0
0
0
259
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
8
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
81
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
