In [1]:


#Import data manipulation libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



#### First data loading strategy

In [2]:
# First loading strategy: read the three competition files unchanged
# (train, test and the sample submission), in that order.
train_import, test_df, submission = map(pd.read_csv, (
    "/kaggle/input/playground-series-s3e6/train.csv",
    "/kaggle/input/playground-series-s3e6/test.csv",
    "/kaggle/input/playground-series-s3e6/sample_submission.csv",
))

#### Second data loading strategy
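Load the competition data again together with an additional training data set (the original Paris housing data).
Add a flag column (`adv_val`) that marks the source of each row for the adversarial cross-validation.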

In [3]:
# Each frame gets an `adv_val` column recording where its rows came from:
# 0 = competition test set, 1 = competition train set, 2 = original data set.

# Competition test set, without the "id" column
test = (
    pd.read_csv('/kaggle/input/playground-series-s3e6/test.csv')
    .drop(columns="id")
    .assign(adv_val=0)
)

# Competition train set, without the "id" column
train = (
    pd.read_csv('/kaggle/input/playground-series-s3e6/train.csv')
    .drop(columns="id")
    .assign(adv_val=1)
)

# Original data set (no "id" column is dropped here)
original = (
    pd.read_csv('/kaggle/input/paris-housing-price-prediction/ParisHousing.csv')
    .assign(adv_val=2)
)

# Sample submission file
sample = pd.read_csv('/kaggle/input/playground-series-s3e6/sample_submission.csv')

#### Feature Engineering

In [4]:
# Name of the column that is predicted
target = 'price'

# Name -> DataFrame lookup, so that a cell can loop over the data sets by name
# (the duplicate check below indexes it with 'train' and 'original').
sets = dict(train=train, test=test, original=original)

In [5]:
#Function to drop rows that are duplicated in the train set and the original deck

def dropping_duplicates(data):
    """Remove fully duplicated rows from `data` in place.

    The first occurrence of each duplicated row is kept. The index is
    renumbered 0..n-1 afterwards: later cells wrap NumPy arrays in fresh
    DataFrames (which get a default 0..n-1 index) and assign/concatenate them
    by label, so a gap left by a dropped row would silently turn into NaNs.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to de-duplicate; it is modified in place.

    Returns
    -------
    None
    """
    data.drop_duplicates(inplace=True, ignore_index=True)
    
# Report the number of duplicated rows in the train and original sets, then drop them.
# The count has to be taken before the rows are removed.
for n in ('train', 'original'):
    duplicate_count = sets[n].duplicated().sum()
    print(f"\033[0;33;40m A number of duplicated rows in {n} is {duplicate_count}, they were dropped \033[0;30;0m")
    dropping_duplicates(sets[n])

[0;33;40m A number of duplicated rows in train is 0, they were dropped [0;30;0m
[0;33;40m A number of duplicated rows in original is 0, they were dropped [0;30;0m


In [6]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

In [7]:
########################
### General Settings ###
########################

# NOTE(review): presumably toggles GPU usage in later modelling cells; it is not read
# anywhere in the visible part of the notebook - confirm.
gpu_switch = 'ON'

###############################
### Cross-validation scheme ###
###############################

# 5 folds repeated 10 times, with a fixed seed so the splits are reproducible.
n_splits, n_repeats = 5, 10

# NOTE(review): SMOTE oversampler object; not applied in the visible cells - confirm it is needed.
sm = SMOTE(sampling_strategy='minority')

# Active splitter. Alternatives that were tried and left disabled:
#cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats = n_repeats, random_state=2023)
#cv = GroupKFold(n_splits=n_splits)
cv = RepeatedKFold(n_repeats=n_repeats, n_splits=n_splits, random_state=2023)

########################
### Define Weights   ###
########################

# TODO: understand where and why the weights are used.
# NOTE(review): looks like per-class weights keyed by labels 0 and 1; not used in the
# visible cells - confirm.
weights = {
    0: 0.5009553158705701,
    1: 262.19354838709677,
}

#### Define lists for feature engineering

In [8]:
# Columns imputed with the mean (and later standardised by StaSca_transform)
num_cols = [
    'squareMeters',
    'numberOfRooms',
    'floors',
    'cityPartRange',
    'numPrevOwners',
    'made',
    'basement',
    'attic',
    'garage',
    'hasGuestRoom',
    'cityCode',
]

# Columns imputed with the most frequent value and then one-hot encoded
cat_cols = [
    'hasYard',
    'hasPool',
    'isNewBuilt',
    'hasStormProtector',
    'hasStorageRoom',
]

# Columns removed from every data set before preprocessing (currently none)
drop_col = list()

#### Drop the listed columns from the test, train and original data sets

In [9]:
#Drop the features in the list defined above from every data set, in place
for _frame in (train, test, original):
    _frame.drop(columns=drop_col, inplace=True)

In [10]:
#Version 1
# Imputers and encoder are fitted on the competition train set only and then
# re-used (transform only) for the original and the test set.
num_imp = SimpleImputer(strategy='mean')
cat_imp = SimpleImputer(strategy='most_frequent')
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` - confirm
# against the installed version before upgrading.
ohe = OneHotEncoder(handle_unknown='ignore',sparse = False,drop="first")


def _impute_and_encode(frame, fit=False):
    """Impute `num_cols`/`cat_cols` of `frame` and one-hot encode `cat_cols`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data set to process. Its `num_cols` and `cat_cols` columns are
        overwritten in place with the imputed values.
    fit : bool, default False
        When True the imputers and the encoder are fitted on `frame` before
        transforming it; otherwise the already fitted objects are applied.

    Returns
    -------
    pandas.DataFrame
        `frame` without `cat_cols`, followed by the one-hot encoded columns.

    Notes
    -----
    Every intermediate DataFrame is built with `index=frame.index`. Without it
    the arrays would get a default 0..n-1 index and the label-aligned
    assignment / concat would insert NaNs or extra rows whenever `frame`
    does not have exactly that index (e.g. after rows were dropped).
    """
    if fit:
        num_values = num_imp.fit_transform(frame[num_cols])
        cat_values = cat_imp.fit_transform(frame[cat_cols])
    else:
        num_values = num_imp.transform(frame[num_cols])
        cat_values = cat_imp.transform(frame[cat_cols])

    frame[num_cols] = pd.DataFrame(num_values, columns=num_cols, index=frame.index)
    frame[cat_cols] = pd.DataFrame(cat_values, columns=cat_cols, index=frame.index)

    # The encoder sees the imputed categorical columns.
    if fit:
        encoded_values = ohe.fit_transform(frame[cat_cols])
    else:
        encoded_values = ohe.transform(frame[cat_cols])
    encoded = pd.DataFrame(encoded_values, columns=ohe.get_feature_names_out(), index=frame.index)

    return pd.concat([frame.drop(cat_cols, axis=1), encoded], axis=1)


train = _impute_and_encode(train, fit=True)
original = _impute_and_encode(original)
test = _impute_and_encode(test)

In [11]:
#TODO: add an improved version of the preprocessing above built on a scikit-learn Pipeline

In [12]:
#Combine the train and original data after the adversarial cross-validation;
#the rows of `original` are appended below `train` and the index is renumbered.
train = pd.concat(objs=[train, original], axis=0, ignore_index=True)

In [13]:
#Remove the adversarial-validation flag column from every data set
train, test, original = (
    frame.drop(columns='adv_val') for frame in (train, test, original)
)

#### Define the features and target labels

In [14]:
# Features: every column of the combined training data except the target
X = train.drop(columns=[target])
# Labels: the target column as a series
y = train[target]
# Shuffle features and labels together (fixed seed), then renumber both from 0
X, y = (part.reset_index(drop=True) for part in shuffle(X, y, random_state=2023))

In [15]:
###########################
### additional dropping ###
###########################

# Rows whose squareMeters exceeds 99999 are treated as outliers:
# collect their index labels, then remove them from both X and y and renumber.
indexies = X.index[(X['squareMeters'] > 99999).to_numpy()]
X, y = (part.drop(indexies).reset_index(drop=True) for part in (X, y))

In [16]:
# Replace implausible values by the matching column mean of `train`.
# The means are computed once here; computing them inside a per-row lambda
# re-scanned the whole `train` column for every replaced value.
floors_fill = train['floors'].mean()
made_fill = train['made'].mean()
basement_fill = train['basement'].mean()
attic_fill = train['attic'].mean()
garage_fill = train['garage'].mean()
cityCode_fill = int(train['cityCode'].mean())  # truncated to a whole number, as before

# `Series.where(cond, other)` keeps a value where `cond` is True and uses `other` elsewhere,
# i.e. the same rule as "x if cond else fill" (missing values fail the comparison and are filled).
# NOTE(review): floors, made and garage are only capped in X, not in test - confirm this is intended.
X['floors'] = X['floors'].where(X['floors'] < 1000, floors_fill)
X['made'] = X['made'].where(X['made'] < 2022, made_fill)
X['garage'] = X['garage'].where(X['garage'] <= 1000, garage_fill)

# One-hot flag columns that are collapsed into a single count feature below
flag_cols = ['hasYard_1','hasPool_1','isNewBuilt_1','hasStormProtector_1','hasStorageRoom_1']

for _frame in (X, test):
    _frame['basement'] = _frame['basement'].where(_frame['basement'] <= 10000, basement_fill)
    _frame['attic'] = _frame['attic'].where(_frame['attic'] <= 10000, attic_fill)
    _frame['cityCode'] = _frame['cityCode'].where(_frame['cityCode'] <= 100000, cityCode_fill)
    # 'points' = sum of the five flag columns; skipna=False keeps the behaviour of
    # adding the columns with `+` (a missing flag gives a missing sum).
    _frame['points'] = _frame[flag_cols].sum(axis=1, skipna=False)
    _frame.drop(flag_cols, axis=1, inplace=True)

In [17]:
# Renumber the rows of the features and the labels from 0
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

In [18]:
def StaSca_transform(X,test):
    """Standardise the `num_cols` columns of `X` and `test` in place.

    A new StandardScaler is fitted on `X[num_cols]` only and then applied to
    both frames, so `test` is scaled with the statistics of `X`.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; must contain every column in `num_cols`.
    test : pandas.DataFrame
        Test features; must contain every column in `num_cols`.

    Returns
    -------
    None
        Both frames are modified in place.
    """
    StaSca = StandardScaler()
    # index=... keeps the scaled rows aligned with the frame they are written back to;
    # a default 0..n-1 index would produce NaNs for any frame indexed differently.
    X[num_cols] = pd.DataFrame(data = StaSca.fit_transform(X[num_cols]),columns = num_cols,index = X.index)
    test[num_cols] = pd.DataFrame(data = StaSca.transform(test[num_cols]),columns = num_cols,index = test.index)

In [19]:
def Box_transform(X,test,box_cols=None):
    """Apply a Box-Cox transform to selected columns of `X` and `test` in place.

    For each column the lambda is estimated by `stats.boxcox` on `X` and the
    same lambda is then used to transform `test`.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; the lambda of every column is fitted on this frame.
    test : pandas.DataFrame
        Test features; transformed with the lambda fitted on `X`.
    box_cols : list of str, optional
        Columns to transform. When omitted, the historical default list is
        used. That list ('fixed acidity', ...) does not match the column names
        in this notebook's `num_cols`/`cat_cols`, so callers working on these
        frames need to pass their own columns.

    Returns
    -------
    None
        Both frames are modified in place.

    Raises
    ------
    KeyError
        If a requested column is missing from `X` or `test`.
    """
    if box_cols is None:
        box_cols = ['fixed acidity','volatile acidity','residual sugar',
                     'chlorides','free sulfur dioxide','total sulfur dioxide',
                     'sulphates','alcohol']

    for column in box_cols:
        # Fit lambda on the training column ...
        X_temp,fitted_lambda = stats.boxcox(X[column])
        X[column]=X_temp
        # ... and re-use it for the test column.
        test[column] = stats.boxcox(test[column],fitted_lambda)

In [20]:
def perm_imp(model,data,target):
    """Compute permutation importances of `model` and return them sorted.

    Parameters
    ----------
    model : estimator
        Passed to eli5's PermutationImportance together with cv=10, so it is
        (re)fitted by the cross-validation inside PermutationImportance.
    data : pandas.DataFrame
        Feature frame; its column names label the importances.
    target : pandas.Series
        Target values matching the rows of `data`.

    Returns
    -------
    dict
        Feature name -> importance rounded to 3 decimals, ordered from the
        smallest to the largest importance.
    """
    # Work on NumPy copies so the caller's frame and series are left untouched.
    features = data.to_numpy().copy()
    labels = target.to_numpy().copy()

    permute = PermutationImportance(
        model,
        random_state=2023,
        n_iter=2,
        cv=10,
        scoring='neg_root_mean_squared_error',
    ).fit(features, labels)

    # The object returned by show_weights is not kept or returned here.
    eli5.show_weights(permute, feature_names = data.columns.tolist(), top=50)

    importance_by_feature = dict(zip(list(data.columns), list(permute.feature_importances_)))
    ascending = sorted(importance_by_feature, key=importance_by_feature.get)
    return {name: np.round(importance_by_feature[name], 3) for name in ascending}

# Modelling

In [21]:
#Load the necessary packages
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np
import base64
import seaborn as sns
import matplotlib.pyplot as plt
import os
import random
import gc

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from mlxtend.preprocessing import minmax_scaling
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
from scipy import stats
import optuna
import xgboost as xgb
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,AdaBoostClassifier, GradientBoostingClassifier, 
                              ExtraTreesClassifier, VotingClassifier,ExtraTreesRegressor,AdaBoostRegressor,GradientBoostingRegressor)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.utils import class_weight
from catboost import CatBoostRegressor
from catboost import CatBoostClassifier
from sklearn.svm import SVR
from sklearn import datasets, linear_model
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.signal import argrelmin
from scipy.stats import skew
from scipy import stats


pd.set_option('display.max_columns', None)   

from sklearn.model_selection import train_test_split

import eli5
from eli5.sklearn import PermutationImportance
import time

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense,BatchNormalization,Dropout
from tensorflow.keras import utils
import tensorflow_addons as tfa
import keras_tuner
from kerastuner.tuners import RandomSearch, Hyperband, BayesianOptimization
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch
from kerastuner import HyperParameters, Objective

