In [1]:


# Import data manipulation libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



In [2]:
#Plotting
import base64
import seaborn as sns
import matplotlib.pyplot as plt
import os
import random
import gc

#### First data loading strategy

In [3]:
# Playground Series S3E11 competition files: training split, test split and
# the sample submission template, each read into its own DataFrame.
# NOTE(review): the second loading strategy below reads the S3E6 files into
# different names; confirm these S3E11 frames are still needed.
train_import = pd.read_csv("/kaggle/input/playground-series-s3e11/train.csv")
test_df = pd.read_csv("/kaggle/input/playground-series-s3e11/test.csv")
submission = pd.read_csv("/kaggle/input/playground-series-s3e11/sample_submission.csv")

#### Second data loading strategy
Load a second training data set (Playground Series S3E6, plus the original Paris Housing data).
Add an `adv_val` flag to each data set for the adversarial cross-validation.

In [4]:
# Each frame gets an `adv_val` tag identifying its source
# (0 = competition test, 1 = competition train, 2 = original data);
# the adversarial validation further down uses the tag as its label.
test = (
    pd.read_csv('/kaggle/input/playground-series-s3e6/test.csv')
    .drop(columns="id")
    .assign(adv_val=0)
)

train = (
    pd.read_csv('/kaggle/input/playground-series-s3e6/train.csv')
    .drop(columns="id")
    .assign(adv_val=1)
)

# No "id" column is dropped from the original data set.
original = (
    pd.read_csv('/kaggle/input/paris-housing-price-prediction/ParisHousing.csv')
    .assign(adv_val=2)
)

sample = pd.read_csv('/kaggle/input/playground-series-s3e6/sample_submission.csv')

In [5]:
# Name -> DataFrame lookup so checks can loop over every data set.
sets = dict(train=train, test=test, original=original)

In [6]:
# Name of the target column; adv_validation drops it from the
# feature matrix unless told to keep it.
target='price'

In [7]:
# Peek at the first two rows of the original data set.
original.head(2)

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price,adv_val
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5,2
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5,2


### Feature Distribution - Univariate

In [8]:
#########################################################################
### The function for data distribution analysis between two datasets  ###
#########################################################################

def viz_comp (data1,data2,title,label1='Train',label2='Test',cols=None):
    """Overlay per-column histograms of two datasets on a three-column grid.

    Parameters
    ----------
    data1, data2 : column-indexable tables (e.g. pandas DataFrames)
        The two datasets to compare. ``data1`` is drawn in green and
        ``data2`` in red, both on the same axis for each column.
    title : str
        Text placed as the title of the first subplot.
    label1, label2 : str, optional
        Legend labels for ``data1`` and ``data2``. The defaults are a guess
        at the common use; pass explicit labels when comparing other sets.
    cols : iterable of str, optional
        Columns to plot. When omitted, the notebook-level ``num_cols`` is
        used, as before.

    Returns
    -------
    None
        The figure is drawn through matplotlib as a side effect.
    """
    if cols is None:
        # Backward-compatible default: fall back to the notebook global.
        cols = num_cols
    cols = list(cols)

    n_bins = 50
    histplot_hyperparams = {
        'kde':True,
        'alpha':0.4,
        'stat':'percent',
        'bins':n_bins
    }

    # Ceiling division: exactly enough rows for the columns (at least one),
    # instead of always adding an extra, possibly empty, row.
    n_rows = max(1, -(-len(cols) // 3))
    fig, ax = plt.subplots(n_rows, 3, figsize=(20, 20))
    ax = ax.flatten()

    for i, column in enumerate(cols):
        sns.histplot(
            data1[column], label=label1,
            ax=ax[i], color='green', **histplot_hyperparams)
        sns.histplot(
            data2[column], label=label2,
            ax=ax[i], color='red', **histplot_hyperparams)
        # Without a legend the labels above are never shown.
        ax[i].legend()

    # Hide the grid cells that received no histogram.
    for unused_axis in ax[len(cols):]:
        unused_axis.set_visible(False)

    ax[0].set_title(title, fontstyle='normal',size=25)

In [9]:
def miss_values_check(data,n):
    """Print the total number of NaN cells in ``data`` and plot them if any exist.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.
    n : str
        Name of the data set, used only in the printed message.

    When at least one NaN is present, a seaborn heatmap of the null mask
    is drawn. Nothing is returned.
    """
    null_mask = data.isnull()
    nan_total = null_mask.sum().sum()
    print(f'\033[0;33;40m A number of NaN values in {n} is {nan_total} \033[0;30;0m')
    if nan_total > 0:
        sns.heatmap(data.isnull())
# Run the missing-value report on every registered data set.
for n, dataset in sets.items():
    miss_values_check(dataset, n)

[0;33;40m A number of NaN values in train is 0 [0;30;0m
[0;33;40m A number of NaN values in test is 0 [0;30;0m
[0;33;40m A number of NaN values in original is 0 [0;30;0m


In [10]:
def create_EDA_summary (df = None):
    """Build a one-row-per-column summary table for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarise. The ``None`` default is kept for signature
        compatibility, but a DataFrame must be supplied.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of ``df`` with the fields:
        ``dtype`` (column dtype), ``NROW`` (row count of ``df``),
        ``Unique_values`` (distinct values, NaN counted as a value) and
        ``Percent_missing`` (share of null cells, in percent).

    Raises
    ------
    ValueError
        If ``df`` is None.
    """
    if df is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("create_EDA_summary requires a DataFrame, got None")

    n_rows = df.shape[0]
    summary = pd.DataFrame()
    summary["dtype"] = df.dtypes
    summary["NROW"] = n_rows
    # dropna=False keeps NaN as its own value, matching len(series.unique()).
    summary["Unique_values"] = [df[col].nunique(dropna=False) for col in df.columns]
    summary["Percent_missing"] = (df.isnull().sum() / n_rows) * 100

    return summary

In [11]:
# Per-column dtype / row count / cardinality / missing-share table for the training set.
create_EDA_summary(train)

Unnamed: 0,dtype,NROW,Unique_values,Percent_missing
squareMeters,int64,22730,7319,0.0
numberOfRooms,int64,22730,100,0.0
hasYard,int64,22730,2,0.0
hasPool,int64,22730,2,0.0
floors,int64,22730,101,0.0
cityCode,int64,22730,7810,0.0
cityPartRange,int64,22730,10,0.0
numPrevOwners,int64,22730,10,0.0
made,int64,22730,33,0.0
isNewBuilt,int64,22730,2,0.0


#### Adversarial cross-validation

In [17]:
import lightgbm as lgb
from sklearn.model_selection import cross_val_score

In [18]:
from sklearn.utils import shuffle

def adv_validation (data_1,data_2,key,random_state=None):
    """Score how well a classifier can tell two data sets apart.

    The two frames are stacked and shuffled, and a LightGBM classifier is
    cross-validated on predicting the ``adv_val`` source flag.

    Parameters
    ----------
    data_1, data_2 : pandas.DataFrame
        Data sets to compare; each must carry an ``adv_val`` column
        identifying its source.
    key : bool
        If truthy, only ``adv_val`` is removed from the features, so the
        target column stays in as a predictor. Otherwise both ``adv_val``
        and the notebook-level ``target`` column are removed.
    random_state : int or None, optional
        Seed for the row shuffle and the classifier. The default ``None``
        keeps the previous unseeded behaviour; pass an integer for
        reproducible scores.

    Returns
    -------
    float
        Mean ROC AUC over the 5 cross-validation folds.
    """
    # cross_val_score's integer cv is not asked to shuffle here, so the rows
    # are shuffled once up front; a second shuffle adds nothing.
    combined = shuffle(
        pd.concat([data_1,data_2],ignore_index=True),
        random_state=random_state,
    ).reset_index(drop=True)

    adv_list = ['adv_val'] if key else ['adv_val',target]

    X_full = combined.drop(adv_list,axis=1)
    y_full = combined['adv_val']

    classifier = lgb.LGBMClassifier(n_estimators=200, random_state=random_state)
    fold_scores = cross_val_score(classifier,X_full,y_full,scoring='roc_auc',cv=5)
    return fold_scores.mean()

In [19]:
# Compare each labelled data set against the competition test split,
# with the target column excluded from the features (key=False).
sets_adv_val = dict(train=train, original=original)
for one, candidate in sets_adv_val.items():
    auc = adv_validation(candidate, test, False)
    print (f'\033[0;33;40m Result between {one} and test = {auc} \033[0;30;0m')

[0;33;40m Result between train and test = 0.5070602227775023 [0;30;0m
[0;33;40m Result between original and test = 0.8336274511674195 [0;30;0m


#### Permutation Importance

In [20]:
import eli5
from eli5.sklearn import PermutationImportance
##############################
### Permutation importance ###
##############################

def permutation_importance(model,X,y,scoring=None):
    """Fit eli5's PermutationImportance and return the importances by feature.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``PermutationImportance`` (refit with cv=5).
    X : pandas.DataFrame
        Feature matrix; its column names label the importances.
    y : array-like
        Labels aligned with ``X``.
    scoring : str, callable or None, optional
        Forwarded to ``PermutationImportance``. ``None`` (the default)
        keeps the previous behaviour.

    Returns
    -------
    dict
        Feature name -> mean permutation importance, ordered from least
        to most important.
    """
    permute = PermutationImportance(
        model,scoring=scoring,random_state=2023,n_iter =10,cv=5).fit(X, y)

    # Names must come from X itself: the global `train` also holds the
    # adv_val/target columns, which would mis-pair names and importances.
    feature_names = X.columns.tolist()

    # NOTE(review): the HTML object returned here is discarded, so nothing is
    # rendered when called inside a function; wrap it in display(...) if the
    # weights table should appear in the notebook.
    eli5.show_weights(permute, feature_names = feature_names,top=50)

    values = dict(zip(feature_names,list(permute.feature_importances_)))
    # Previously built and then dropped; returning it makes the result usable.
    return dict(sorted(values.items(), key=lambda item: item[1]))

In [21]:
# Which features let a classifier separate the original data from the test split?
metric='roc_auc'
model = lgb.LGBMClassifier(n_estimators=100)

# Stack once; features and labels are shuffled with the same seed, so the
# two same-length objects receive the same row permutation.
combined = pd.concat([original,test],ignore_index=True)
X = shuffle(combined.drop(['adv_val',target],axis=1),random_state=2023).reset_index(drop=True)
y = shuffle(combined.adv_val,random_state=2023).reset_index(drop=True)

permute = PermutationImportance(
    model,
    random_state=2023,
    n_iter =10,
    cv=5,
    scoring = metric,
).fit(X, y)
eli5.show_weights(permute, feature_names = X.columns.tolist(),top=50)

Weight,Feature
0.1586  ± 0.0118,made
0.0593  ± 0.0060,numberOfRooms
0.0437  ± 0.0061,floors
0.0304  ± 0.0049,squareMeters
0.0249  ± 0.0067,garage
0.0176  ± 0.0040,basement
0.0065  ± 0.0019,hasGuestRoom
0.0064  ± 0.0026,attic
0.0032  ± 0.0028,numPrevOwners
0.0026  ± 0.0018,cityPartRange
