In [None]:
###### Basic packages
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, log_loss
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA, NMF
from sklearn.manifold import TSNE
from umap import UMAP
from scipy.cluster.hierarchy import dendrogram, ward
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings
from sklearn.exceptions import DataConversionWarning
# Silence scikit-learn's DataConversionWarning for the whole notebook.
# (The identical filter was previously registered twice; once is enough.)
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

# Global plotting theme and the palette object reused by later cells.
sns.set_theme(style = 'white', palette = 'viridis')
pal = sns.color_palette('viridis')

# Let pandas display up to 100 rows before truncating a frame.
pd.set_option('display.max_rows', 100)

In [None]:
# File path and files

#File Path
# Directory that holds the competition files.
data_dir = "/kaggle/input/playground-series-s3e15"
# File name of the training CSV, relative to data_dir.
train_file = "data.csv"
# Test file name, currently disabled.
#test_file = "test.csv"
# Full path of the additional CSV that get_data reads into `orig_train`.
orig_file = "/kaggle/input/predicting-heat-flux/Data_CHF_Zhao_2020_ATE.csv"
# File name of the sample submission CSV, relative to data_dir.
submission_file = "sample_submission.csv"

In [None]:
# test_data = Path(data_dir)/test_file
# train_data = Path(data_dir)/train_file
# submission_data = Path(data_dir)/submission_file

In [None]:
#train = pd.read_csv("/kaggle/input/playground-series-s3e15/data.csv")
# test = pd.read_csv(test_data)
# submission_df = pd.read_csv(submission_data)

In [None]:
def get_data(data_dir = data_dir, train_file = train_file, test_file = None, submission_file = submission_file, orig_file = orig_file):
    """Read the training, sample-submission and original CSV files.

    Parameters
    ----------
    data_dir : str or Path
        Directory containing ``train_file`` and ``submission_file``.
    train_file : str
        File name of the training CSV inside ``data_dir``.
    test_file : str, optional
        Accepted for backward compatibility only; no test file is read.
        The default is ``None`` because no module-level ``test_file`` is
        defined (its assignment is commented out), so using that name as
        a default made the ``def`` itself fail with ``NameError``.
    submission_file : str
        File name of the sample submission CSV inside ``data_dir``.
    orig_file : str or Path
        Full path of the original dataset CSV.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, submission_df, orig_train)`` in that order.
    """
    base_dir = Path(data_dir)

    train = pd.read_csv(base_dir / train_file)
    orig_train = pd.read_csv(Path(orig_file))
    submission_df = pd.read_csv(base_dir / submission_file)

    return train, submission_df, orig_train
    

In [None]:
# Load the training, sample-submission and original frames using get_data's defaults.
train, submission_df, orig_train = get_data()

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train.info()

In [None]:
def create_EDA_summary (df = None):
    """Build a per-column overview table for ``df``.

    The result is indexed by the column names of ``df`` and has four
    columns: ``dtype`` (column dtype), ``NROW`` (row count of ``df``,
    repeated on every row), ``Unique_values`` (length of
    ``Series.unique()`` for the column) and ``Percent_missing``
    (share of null entries, in percent).
    """
    n_rows = df.shape[0]

    # Distinct-value count per column, in column order.
    distinct_counts = []
    for name in df.columns:
        distinct_counts.append(len(df[name].unique()))

    overview = pd.DataFrame()
    overview["dtype"] = df.dtypes
    overview["NROW"] = n_rows
    overview["Unique_values"] = distinct_counts
    overview["Percent_missing"] = (df.isna().sum() / n_rows) * 100
    return overview

In [None]:
# Display the per-column summary table for the training frame.
create_EDA_summary(df = train)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Number of columns in the training frame.
train.shape[1]

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
train.describe().T

In [None]:
def adversarial_validation(label = 'Train-Test' , target = "x_e_out [-]", train_df = None, test_df = None, orig_df = None, seed = 42):
    """Plot an adversarial-validation ROC curve between two datasets.

    A logistic regression is cross-validated (5 stratified folds) to tell
    the two datasets apart. The out-of-fold ROC curve is plotted and the
    mean fold AUC is shown in the title.

    Parameters
    ----------
    label : str
        ``'Train-Test'`` compares train (without ``target``) against test.
        Any other value compares train + test against the original
        dataset (without ``target``). The label is also used in the title.
    target : str
        Name of the target column dropped from the train/original frames.
    train_df, test_df, orig_df : pandas.DataFrame, optional
        Frames to use. When omitted, the notebook-level ``train``,
        ``test`` and ``orig_train`` variables are used, as before.
    seed : int
        Seed for the row shuffle, so repeated runs give the same folds.

    Raises
    ------
    NameError
        If a frame is not passed and the matching notebook-level variable
        does not exist.
    """
    # Fall back to the notebook-level frames when none are passed in.
    train_df = train if train_df is None else train_df
    test_df = test if test_df is None else test_df

    adv_train = train_df.drop(target, axis = 1)
    if label == 'Train-Test':
        # Copy so that adding 'is_test' below does not modify the caller's frame.
        adv_test = test_df.copy()
    else:
        orig_df = orig_train if orig_df is None else orig_df
        adv_train = pd.concat([adv_train, test_df], ignore_index = True)
        adv_test = orig_df.drop(target, axis = 1)

    adv_train['is_test'] = 0
    adv_test['is_test'] = 1

    adv = pd.concat([adv_train, adv_test], ignore_index = True)

    # Seeded shuffle: the fold assignment is reproducible across runs.
    adv_shuffled = adv.sample(frac = 1, random_state = seed)

    adv_X = adv_shuffled.drop('is_test', axis = 1)
    adv_y = adv_shuffled.is_test

    skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)

    val_scores = []
    # Out-of-fold predictions, stored in the row order of the shuffled frame.
    predictions = np.zeros(len(adv_shuffled))

    for fold, (train_idx, val_idx) in enumerate(skf.split(adv_X, adv_y)):

        adv_lr = LogisticRegression(random_state = 42)
        adv_lr.fit(adv_X.iloc[train_idx], adv_y.iloc[train_idx])

        val_preds = adv_lr.predict_proba(adv_X.iloc[val_idx])[:,1]
        predictions[val_idx] = val_preds
        val_score = roc_auc_score(adv_y.iloc[val_idx], val_preds)
        val_scores.append(val_score)

    # `predictions` follows the shuffled order, so it must be scored against
    # the shuffled labels (adv_y), not the unshuffled adv['is_test'].
    fpr, tpr, _ = roc_curve(adv_y, predictions)
    plt.figure(figsize = (10, 10), dpi = 300)
    sns.lineplot(x=[0, 1], y=[0, 1], linestyle="--", label="Indistinguishable Datasets")
    sns.lineplot(x=fpr, y=tpr, label="Adversarial Validation Classifier")
    plt.title(f'{label} Validation = {np.mean(val_scores):.5f}', weight = 'bold', size = 17)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

In [None]:
# Run the default 'Train-Test' comparison.
# NOTE(review): this relies on a notebook-level `test` frame that is not
# created in the cells shown here — confirm it exists before running.
adversarial_validation()

In [None]:
# Understand the distribution of the target
# The target column is "x_e_out [-]" (the same name adversarial_validation
# drops from `train`); the previous "yield" column name does not match it.
target_name = "x_e_out [-]"

fig, ax = plt.subplots(1, 1, figsize = (16, 5))
sns.histplot(data = train, x = target_name, ax = ax)
# x axis shows the target values, y axis the bin counts.
ax.set_xlabel(target_name, fontsize = 20)
ax.set_ylabel('Count', fontsize = 20)
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.show()

In [None]:
def heatmap(dataset, label = None):
    """Plot the lower-triangle correlation matrix of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose numeric and boolean columns are correlated. Other
        columns are ignored, because ``DataFrame.corr`` cannot correlate
        them and raises on recent pandas versions.
    label : str, optional
        Dataset name inserted into the figure title.
    """
    corr = dataset.select_dtypes(include = ["number", "bool"]).corr()
    plt.figure(figsize = (14, 10), dpi = 300)

    # Boolean mask hiding the upper triangle (diagonal included), so each
    # pair of columns appears only once.
    mask = np.zeros_like(corr, dtype = bool)
    mask[np.triu_indices_from(mask)] = True

    sns.heatmap(corr, mask = mask, cmap = "BuPu", annot=True, annot_kws={"size": 4, "weight": "bold"})
    plt.yticks(fontsize = 5)
    plt.xticks(fontsize = 5)
    plt.title(f'{label} Dataset Correlation Matrix\n', fontsize = 25, weight = 'bold')
    plt.show()

In [None]:
# Correlation heatmap of the training frame, titled "Train".
heatmap(train, "Train")

In [None]:
def plot_kde_plots(df = train,test = None, num_cols = 4):
    """Draw one KDE plot per column on a grid of subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data to plot.
    test : list of str, optional
        Column names to plot. Defaults to the numeric columns of ``df``.
        (The previous default was built from a notebook-level ``test``
        frame at definition time, which fails when that frame is missing.)
    num_cols : int
        Number of subplot columns in the grid.
    """
    if test is None:
        columns = df.select_dtypes(include = "number").columns.to_list()
    else:
        columns = list(test)

    # Nothing to draw; plt.subplots would reject a grid with zero rows.
    if not columns:
        return

    # Ceiling division so every column gets an axes slot.
    num_rows = -(-len(columns) // num_cols)
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(18, 4*num_rows), squeeze=False)
    sns.set(font_scale=1.2, style='whitegrid')

    flat_axes = axes.ravel()
    for ax, col_name in zip(flat_axes, columns):
        sns.kdeplot(data=df, x=col_name, ax=ax)
        ax.set_title(f'{col_name.title()}', fontsize=18)
        ax.set_xlabel(col_name.title(), fontsize=14)
        ax.tick_params(axis='both', which='major', labelsize=12)

    # Hide grid slots left over when the column count is not a multiple of num_cols.
    for ax in flat_axes[len(columns):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Draw the KDE grid using the function's default arguments.
plot_kde_plots()

In [None]:
def plot_countplots(df = train,test = None, num_cols = 4):
    """Draw one count plot per column on a grid of subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data to plot.
    test : list of str, optional
        Column names to plot. Defaults to every column of ``df``.
        (The previous default was built from a notebook-level ``test``
        frame at definition time, which fails when that frame is missing.)
    num_cols : int
        Number of subplot columns in the grid.
    """
    columns = df.columns.to_list() if test is None else list(test)

    # Nothing to draw; plt.subplots would reject a grid with zero rows.
    if not columns:
        return

    # Ceiling division so every column gets an axes slot.
    num_rows = -(-len(columns) // num_cols)
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(18, 4*num_rows), squeeze=False)
    sns.set(font_scale=1.2, style='whitegrid')

    flat_axes = axes.ravel()
    for ax, col_name in zip(flat_axes, columns):
        sns.countplot(data=df, x=col_name, ax=ax)
        ax.set_title(f'{col_name.title()}', fontsize=18)
        ax.set_xlabel(col_name.title(), fontsize=14)
        ax.tick_params(axis='both', which='major', labelsize=12)

    # Hide grid slots left over when the column count is not a multiple of num_cols.
    for ax in flat_axes[len(columns):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Draw the count-plot grid using the function's default arguments.
plot_countplots()

In [None]:
# Split the column names of `test` by position: all but the last three
# columns go to cat_columns, the last three to num_colums.
# NOTE(review): presumably the trailing three columns are the numeric ones,
# and a notebook-level `test` frame must already exist — confirm both.
cat_columns = test.iloc[:,:-3].columns.to_list()
num_colums = test.iloc[:,-3:].columns.to_list()

In [None]:
# Plot the distribution of the target vs categorical columns