In [1]:
###### Basic packages
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, log_loss
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA, NMF
from sklearn.manifold import TSNE
from umap import UMAP
from scipy.cluster.hierarchy import dendrogram, ward

import warnings
from sklearn.exceptions import DataConversionWarning
# Silence sklearn's DataConversionWarning. The filter only needs to be
# registered once (it was previously added twice with identical arguments).
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

# Global seaborn theme, plus the same palette kept in `pal` for explicit use.
sns.set_theme(style = 'white', palette = 'viridis')
pal = sns.color_palette('viridis')

# Let pandas display up to 100 rows before truncating a frame.
pd.set_option('display.max_rows', 100)



In [2]:
# Input locations: the competition directory, the file names inside it,
# and the full path of a second, external CSV.

# Directory that holds `train_file` and `submission_file`.
data_dir = "/kaggle/input/playground-series-s3e15"
train_file = "data.csv"
# Disabled: no separate test CSV is read in the cells shown here.
#test_file = "test.csv"
# Full path (not relative to `data_dir`); loaded as `orig_train` by get_data().
orig_file = "/kaggle/input/predicting-heat-flux/Data_CHF_Zhao_2020_ATE.csv"
submission_file = "sample_submission.csv"

# Disabled: raw name of the target column (it is renamed to 'target' later).
#target = "x_e_out [-]"

In [3]:
# Superseded by get_data(), which builds these paths itself; kept for reference only.
# test_data = Path(data_dir)/test_file
# train_data = Path(data_dir)/train_file
# submission_data = Path(data_dir)/submission_file

In [4]:
# Superseded by get_data(), which performs these reads; kept for reference only.
#train = pd.read_csv("/kaggle/input/playground-series-s3e15/data.csv")
# test = pd.read_csv(test_data)
# submission_df = pd.read_csv(submission_data)

In [5]:
def get_data(data_dir = data_dir,train_file = train_file,  submission_file = submission_file, orig_file = orig_file):
    """Read the competition CSV, the sample submission and the external CSV.

    Parameters
    ----------
    data_dir : str or path-like
        Directory containing ``train_file`` and ``submission_file``.
    train_file : str
        File name of the competition data inside ``data_dir``.
    submission_file : str
        File name of the sample submission inside ``data_dir``.
    orig_file : str or path-like
        Full path of the external CSV (not joined with ``data_dir``).
        This used to be read implicitly from the module-level ``orig_file``;
        it is now an explicit parameter with that same value as its default.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, submission_df, orig_train)``, in that order.

    Notes
    -----
    No separate test CSV is read here.
    """
    base_dir = Path(data_dir)

    # Read order (train, external, submission) matches the original, so a
    # missing file surfaces the same error first.
    train = pd.read_csv(base_dir / train_file)
    orig_train = pd.read_csv(Path(orig_file))
    submission_df = pd.read_csv(base_dir / submission_file)
    return train, submission_df, orig_train
    

In [6]:
train, submission_df, orig_train = get_data()

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31644 entries, 0 to 31643
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   31644 non-null  int64  
 1   author               26620 non-null  object 
 2   geometry             26144 non-null  object 
 3   pressure [MPa]       27192 non-null  float64
 4   mass_flux [kg/m2-s]  26853 non-null  float64
 5   x_e_out [-]          21229 non-null  float64
 6   D_e [mm]             26156 non-null  float64
 7   D_h [mm]             27055 non-null  float64
 8   length [mm]          26885 non-null  float64
 9   chf_exp [MW/m2]      31644 non-null  float64
dtypes: float64(7), int64(1), object(2)
memory usage: 2.4+ MB


In [8]:
def create_EDA_summary (df = None):
    """Build a per-column overview of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile. The ``None`` default is kept only so the signature
        stays unchanged; calling without a frame raises ``TypeError``.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (indexed by column name) with:

        * ``dtype`` - the column's dtype,
        * ``NROW`` - number of rows in ``df`` (same value on every row),
        * ``Unique_values`` - number of distinct values, counting a missing
          value as one distinct value,
        * ``Percent_missing`` - share of null entries, in percent (0-100).

    Raises
    ------
    TypeError
        If ``df`` is ``None``.
    """
    if df is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise TypeError("create_EDA_summary() requires a DataFrame, got None")

    n_rows = df.shape[0]

    summary = pd.DataFrame(index=df.columns)
    summary["dtype"] = df.dtypes
    summary["NROW"] = n_rows
    # dropna=False keeps the original semantics of len(col.unique()):
    # a missing value counts as its own distinct value.
    summary["Unique_values"] = df.nunique(dropna=False)
    summary["Percent_missing"] = (df.isnull().sum() / n_rows) * 100

    return summary

In [9]:
create_EDA_summary(df = train)

Unnamed: 0,dtype,NROW,Unique_values,Percent_missing
id,int64,31644,31644,0.0
author,object,31644,11,15.876627
geometry,object,31644,4,17.380862
pressure [MPa],float64,31644,145,14.069018
mass_flux [kg/m2-s],float64,31644,734,15.140311
x_e_out [-],float64,31644,1683,32.913032
D_e [mm],float64,31644,44,17.34294
D_h [mm],float64,31644,50,14.501959
length [mm],float64,31644,71,15.039186
chf_exp [MW/m2],float64,31644,109,0.0


In [10]:
train.head()

Unnamed: 0,id,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
0,0,Thompson,tube,7.0,3770.0,0.1754,,10.8,432.0,3.6
1,1,Thompson,tube,,6049.0,-0.0416,10.3,10.3,762.0,6.2
2,2,Thompson,,13.79,2034.0,0.0335,7.7,7.7,457.0,2.5
3,3,Beus,annulus,13.79,3679.0,-0.0279,5.6,15.2,2134.0,3.0
4,4,,tube,13.79,686.0,,11.1,11.1,457.0,2.8


In [11]:
# Give the raw target column a short name. Rebinding `train` (instead of
# inplace=True) leaves the frame shown by earlier cells untouched.
train = train.rename(columns={'x_e_out [-]': 'target'})

In [12]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,31644.0,15821.5,9134.980296,0.0,7910.75,15821.5,23732.25,31643.0
pressure [MPa],27192.0,10.640747,4.333683,0.1,6.89,11.07,13.79,20.68
mass_flux [kg/m2-s],26853.0,3068.011023,1777.03208,0.0,1519.0,2731.0,4069.0,7975.0
target,21229.0,-0.000453,0.100911,-0.8667,-0.0466,0.0038,0.0648,0.232
D_e [mm],26156.0,8.629255,5.185692,1.0,5.0,7.8,10.8,37.5
D_h [mm],27055.0,14.17433,19.838489,1.0,5.6,10.0,11.5,120.0
length [mm],26885.0,832.987391,672.299239,10.0,318.0,610.0,914.0,3048.0
chf_exp [MW/m2],31644.0,3.796985,1.983991,0.8,2.4,3.4,4.6,19.3


In [13]:
# Label every row: a missing target marks a row to be predicted ("Test"),
# a present target marks a training row ("Train").
train["Train/Test"] = np.where(train["target"].isnull(), "Test", "Train")

In [14]:
train["Train/Test"].value_counts(normalize=True)

Train    0.67087
Test     0.32913
Name: Train/Test, dtype: float64

In [15]:
# Split on the label column. `.copy()` makes both frames independent of the
# frame they were sliced from, so later column assignments on them do not
# trigger SettingWithCopyWarning.
# NOTE: `train` is rebound here, so running this cell a second time on its own
# leaves `test` empty; re-run from the data-loading cell instead.
test = train[train["Train/Test"] == "Test"].copy()
train = train[train["Train/Test"] == "Train"].copy()

In [16]:
train.shape


(21229, 11)

In [17]:
# Preparation for a first baseline model (the original notes name the mean of
# the target variable as that baseline).

# Use `id` as the row index. The guard keeps this cell re-runnable: once `id`
# is already the index, a second set_index("id") would raise KeyError.
if "id" in train.columns:
    train = train.set_index("id")
# NOTE(review): `test` is not re-indexed here and still has `id` as a column -
# confirm that later cells account for the difference.
train.head(3)

Unnamed: 0_level_0,author,geometry,pressure [MPa],mass_flux [kg/m2-s],target,D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2],Train/Test
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Thompson,tube,7.0,3770.0,0.1754,,10.8,432.0,3.6,Train
1,Thompson,tube,,6049.0,-0.0416,10.3,10.3,762.0,6.2,Train
2,Thompson,,13.79,2034.0,0.0335,7.7,7.7,457.0,2.5,Train


In [18]:
#Start with defining the initial imputation strategy for each feature

In [19]:
train["author"].value_counts()
#use most frequent for author

Thompson        11621
Janssen          1846
Weatherhead      1377
Beus             1087
Peskov            729
Williams          567
Richenderfer      371
Mortimore         130
Kossolapov         70
Inasaka            28
Name: author, dtype: int64

In [20]:
train["geometry"].value_counts()
#Use most frequent for geometry

tube       14121
annulus     2971
plate        424
Name: geometry, dtype: int64

In [21]:
train.columns.to_list()

['author',
 'geometry',
 'pressure [MPa]',
 'mass_flux [kg/m2-s]',
 'target',
 'D_e [mm]',
 'D_h [mm]',
 'length [mm]',
 'chf_exp [MW/m2]',
 'Train/Test']

In [22]:
# Column groups used by the preprocessing steps.

# Categorical columns; the notes above choose most-frequent imputation for both.
most_freq = ["author", "geometry"]

# Numeric (float64) columns.
# NOTE(review): `target` is listed alongside the predictors - confirm it is
# excluded before these columns are used as model inputs.
num_features = [
    "pressure [MPa]",
    "mass_flux [kg/m2-s]",
    "target",
    "D_e [mm]",
    "D_h [mm]",
    "length [mm]",
    "chf_exp [MW/m2]",
]