# <div style="padding:20px;color:white;margin:0;font-size:200%;text-align:center;display:fill;border-radius:5px;background-color:#AF601A;overflow:hidden;font-weight:500">TPS June 2022
</div>




 ### If you are a beginner, see my other notebook for imputation tutorial [notebook](https://www.kaggle.com/code/abdulravoofshaik/quick-eda-and-missing-values-tutorial). 
 ### The following cartoon depicts the overall framework for applying advanced regression technique for imputation.


<img src="https://i.postimg.cc/90BdtTjq/imputer.gif">



<div style="color:white;display:fill;border-radius:8px;
            background-color:#E59866;font-size:150%;
            font-family:Nexa;letter-spacing:0.5px">
    <p style="padding: 8px;color:black;"><b>1.0 | Load data and Preprocessing</b></p>
</div>

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import missingno as msno
pd.set_option('display.max_columns', None)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

In [None]:
# Feature table; its missing (NaN) entries are what the rest of this notebook imputes.
data = pd.read_csv("../input/tabular-playground-series-jun-2022/data.csv")
# Sample submission, indexed by its 'row-col' key column.
Target = pd.read_csv("../input/tabular-playground-series-jun-2022/sample_submission.csv", index_col='row-col')

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Integer columns (int16/int32/int64) are converted to the narrowest signed
    integer type whose range contains the column's minimum and maximum.
    Float columns are converted to float32 when their values fit its range
    (this can lose precision for float64 data), otherwise to float64.
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient re-assignment.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a value equal to the type's min/max still fits.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            # An all-NaN column makes both comparisons false and falls back to float64.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Downcast both frames right after loading; reduce_mem_usage prints the resulting memory usage.
data = reduce_mem_usage(data)
Target = reduce_mem_usage(Target)

In [None]:
# Count the NaN entries in every column of the full dataset.
missing_values_count = data.isna().sum()
# Show the counts for the first 30 columns.
missing_values_count.head(30)

In [None]:
# How many cells does the table have, and how many of them are missing?
# np.prod replaces the deprecated alias np.product.
total_cells = np.prod(data.shape)
# Counted straight from `data`, so this cell stays correct even after
# `missing_values_count` has been rebound to the counts of a single subset.
total_missing = data.isnull().sum().sum()

# percent of data that is missing
(total_missing/total_cells) * 100

In [None]:
import matplotlib.pyplot as plt #data viz
# Default figure size (25x20) for this and all later matplotlib figures
plt.rcParams["figure.figsize"] = (25,20)

# One figure holding a 9 x 9 grid of subplots
fig, ax = plt.subplots(9, 9)

# Figure-level title, font size 24, placed at x=0.35, y=1 in figure coordinates
fig.text(0.35, 1, 'EDA of Features', {'size': 24})

# Histogram every column except 'row_id', filling the grid row by row
plotted_columns = [name for name in data.columns if name not in ['row_id']]
for position, col in enumerate(plotted_columns):
    j, i = divmod(position, 9)  # j = subplot row, i = subplot column
    ax[j, i].hist(data[col], bins=100)
    ax[j, i].set_title(col, {'size': '14', 'weight': 'bold'})

# NOTE(review): the axes background default is changed only after the grid above was
# created, and plt.figure(...) opens a second figure - confirm both are intended.
plt.rcParams.update({'axes.facecolor':'lightgreen'})
plt.figure(facecolor='red') 
plt.show() 


<div style="color:white;display:fill;border-radius:8px;
            background-color:#E59866;font-size:150%;
            font-family:Nexa;letter-spacing:0.5px">
    <p style="padding: 8px;color:black;"><b>2.0 | Preprocessing</b></p>
</div>

### Let's start with column F_1_0 and try to replace its NaN values. First we need to find out where the missing values are located in this column. As shown in the figure below, we need to split our data into two sets. 
### Training set: It consists of the known values of the F_1_0 column, i.e. all the rows with a non-NaN value in the F_1_0 column. 
### Test set: It consists of the unknown values of the F_1_0 column, i.e. all the rows with a NaN value in the F_1_0 column.
### We apply the same concept to each individual column and develop 80 individual models. The missing-value plot and the correlation plots are shown below.


In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
import seaborn as sns
# Heatmap of the NaN mask of the full dataset (one heatmap column per data column);
# row tick labels and the colour bar are switched off.
plt.figure(figsize=(20, 8))
sns.heatmap(
    data.isnull(),
    cmap='crest',
    cbar=False,
    yticklabels=False,
)

### As we can see in the plot above, there are no missing values in the columns starting with F_2.

In [None]:
## We apply the same concept to each individual column and develop 80 individual models.
## As noticed earlier, the dataset has four different subsets. Let's divide them.
# this code snippet is taken from https://www.kaggle.com/code/martynovandrey/tps-jun-22-splitted-dataset-24x-faster. Consider upvoting the original author also
features = list(data.columns)
features_1, features_2, features_3, features_4 = [], [], [], []  # not used below; kept so the names stay defined
# F[k] collects the column names whose second '_'-separated token is str(k), for k = 1..4.
# F[0] stays empty so that list positions line up with the subset numbers.
F = [[], [], [], [], []]
for feature in features:
    tokens = feature.split('_')
    # Names without an underscore, or with another second token (e.g. 'row_id'), belong to no subset.
    if len(tokens) > 1 and tokens[1] in ('1', '2', '3', '4'):
        F[int(tokens[1])].append(feature)
df = [[], [], [], [], []]

fig, axs = plt.subplots(nrows=4, ncols=1, figsize=(18, 30))

for i in [1, 2, 3, 4]:
    # Explicit copy: the subsets are modified in place later in the notebook and
    # should be independent frames rather than selections taken from `data`.
    df[i] = data[F[i]].copy()
    corr = df[i].corr()
    sns.heatmap(corr, ax=axs[i-1], annot=True)

### Columns starting with F_4 are correlated with each other; we do not see such a pattern in the other subsets.

In [None]:
# Let's take a look at one of the subsets: the first two rows of subset 4
df[4].head(2)


In [None]:
# Count the NaN entries in each column of subset 4 and show (at most) the first 30 counts.
# Note: this rebinds `missing_values_count`, which earlier held the counts for the full dataset.
missing_values_count = df[4].isna().sum()
missing_values_count.head(30)

In [None]:
## Check the number of NaN values per sample
# One-column DataFrame: for every row of subset 4, how many of its values are NaN
n_null = pd.DataFrame(df[4].isna().sum(axis=1))
plot = sns.histplot(data=n_null, bins=10, stat="percent")
plot.set_xlabel('The number of NaN values')
# n_null.max() on a DataFrame is a Series, whose multi-line repr would end up inside the
# message; reduce to a plain integer instead.
max_nan_per_row = int(n_null.to_numpy().max())
print(f'The max number of NaN values per sample is {max_nan_per_row}.')

### As we can see above, ~22% of rows have one NaN value and ~4% of rows have two NaN values. We need to find some special way to manage two NaN values in the same row. @ehekatlact has noticed that the number of NaN values per row is critical. One idea is to create a new column which contains the number of NaN values in each row and treat it as categorical data. We will use this information in the future.

## Feature interaction 
### To investigate the feature interactions, a simple LGBM model is used. 

In [None]:
# this part of code snipped is taken from https://www.kaggle.com/code/vishalbajaj2000/santander-lightgbm-xgb-feature-interactions 
# upvote the original work also
def get_splits_gain(tree_num=0, parent=-1, tree=None, lev=0, node_name=None, split_gain=None, reclimit=50000):
    '''
    Recursively walk a single decision tree (LightGBM dump format only) and yield one
    record per split, carrying its gain and the feature it was split on together with
    the feature of the enclosing split (the "interaction").
    This is a generator: iterate over it (e.g. in a for loop) to collect the records.
    ---Arguments---
    tree_num : Number of the tree being analysed; only copied into the output.
    parent : DO NOT PASS A VALUE, used by the recursion to carry the enclosing split feature.
    tree : A single decision tree as a DICT. Required.
    lev : DO NOT PASS A VALUE, used by the recursion to track the depth of the node.
    node_name : DO NOT PASS A VALUE, used by the recursion to carry the key of the current node.
    split_gain : DO NOT PASS A VALUE, used by the recursion to carry the gain of the current node.
    reclimit : Minimum Python recursion limit needed for very deep trees. The interpreter
               limit is raised to this value when it is lower and is never lowered.

    ---YIELD---
    One tuple per split:
    tree_num : tree number
    tag : 'split_feature', the key whose value was extracted for the split.
    old_parent : Feature of the enclosing split; -1 by default for the first node of the tree.
    parent : Feature this node splits on.
    lev : Depth of the node; the first node has level 1 when starting from a 'tree_info' entry.
    node_name : Key under which the node was found in its parent dict.
    split_gain : Gain of this split.

    ---RAISES---
    ValueError : when `tree` is None (raised on first iteration, as this is a generator).
    '''
    # Imported locally so the function does not depend on a later cell importing `sys`.
    import sys

    if sys.getrecursionlimit() < reclimit:
        sys.setrecursionlimit(reclimit)
    if tree is None:
        raise ValueError('No tree present to analyze!')
    for k, v in tree.items():
        if isinstance(v, dict):
            # Only dicts carrying a 'split_gain' are internal nodes; leaves and other dicts are skipped.
            if v.get('split_gain') is None:
                continue
            # `reclimit` is forwarded so nested calls honour the caller's setting.
            yield from get_splits_gain(tree_num, parent, v, lev + 1, k, v['split_gain'], reclimit)
        elif k == 'split_feature':
            old_parent = parent
            # From here on, children visited later in this loop see this node's feature as their parent.
            parent = v
            yield tree_num, k, old_parent, parent, lev, node_name, split_gain
            
def lgbm_create_feat_dict(model):
    """Map feature positions to feature names for a dumped LGBM model.

    `model` must provide a 'feature_names' sequence. The returned dict maps each
    position in that sequence to its name, plus the extra key -1 -> 'base'.
    """
    names = model['feature_names']
    feat_dict = {position: name for position, name in enumerate(names)}
    feat_dict[-1] = 'base'
    return feat_dict

def analyze_model(model):
    '''
    Take a JSON dump of an LGBM model, walk every tree with get_splits_gain, translate
    feature indices into feature names and return the result as a dataframe.
    ---Arguments---
    model :  LGBM JSON model (dict with 'tree_info' and 'feature_names').
    ---Returns---
    tree_info_df : pandas DF with one row per split and the columns
                   TreeNo, Type, ParentFeature, SplitOnfeature, Level, TreePos, Gain, Interactions.
    '''
    tree_info = [
        list(split)
        for tree_no, tree in enumerate(model['tree_info'])
        for split in get_splits_gain(tree_num=tree_no, tree=tree)
    ]
    tree_info_df = pd.DataFrame(tree_info, columns=['TreeNo','Type','ParentFeature', 'SplitOnfeature','Level','TreePos','Gain'])
    # Use the model that was passed in, not a global that happens to share its content.
    lgbm_feat_dict = lgbm_create_feat_dict(model)
    # Assign the result back: an inplace replace on a selected column is chained
    # assignment and is not guaranteed to modify the frame.
    for column in ('ParentFeature', 'SplitOnfeature'):
        tree_info_df[column] = tree_info_df[column].replace(lgbm_feat_dict)
    tree_info_df['Interactions'] = tree_info_df['ParentFeature'].map(str) + ' - ' + tree_info_df['SplitOnfeature'].map(str)
    return tree_info_df


In [None]:
import sys
from sklearn.model_selection import train_test_split
TARGET=['F_4_11']  # vary the target to observe the feature interactions
# Rows whose target is NaN carry no label to learn from, so they are left out of this exploratory fit.
labelled = df[4].dropna(subset=TARGET)
# random_state makes the split (and therefore the interaction table) reproducible.
X_train, X_val, y_train, y_val = train_test_split(labelled.drop(TARGET,axis=1), labelled[TARGET], test_size=0.3, shuffle=True, random_state=22)
from lightgbm import LGBMRegressor, log_evaluation
lgbm_params = {'random_state': 22,
#           'device' : 'gpu',
          'n_estimators': 200, # you can increase n_estimators to >10000; 200 is used here to keep the runtime short
          'learning_rate' : 0.1,
          'metric' : 'rmse'}
lgbm_model = LGBMRegressor(**lgbm_params)
# The logging period is passed as a callback; the `verbose` argument of fit() is not
# available in recent LightGBM releases.
lgbm_model.fit(X_train,y_train,eval_set=[(X_val,y_val),(X_train,y_train)],callbacks=[log_evaluation(period=10000)])
#Produces a JSON model dump for LightGBM
model_lgb = lgbm_model.booster_.dump_model()

lgb_df = analyze_model(model_lgb)
lgb_df= round(lgb_df, 2)
lgb_df.head()

In [None]:
# Per-interaction gain statistics, strongest total gain first; a NaN std (single occurrence) becomes 0.
lgb_inter_calc = (
    lgb_df.groupby('Interactions')['Gain']
    .agg(['count','sum','min','max','mean','std'])
    .sort_values(by='sum', ascending=False)
    .reset_index('Interactions')
    .fillna(0)
)
# Rounded to 2 decimals; the original author notes sns.barplot fails on higher precision.
lgb_inter_calc = lgb_inter_calc.round(2)
# Second table without the 'base' interactions (first node of each tree): their very
# high gains would otherwise dominate the plots.
lgb_inter_calc_nobase = lgb_inter_calc[~lgb_inter_calc['Interactions'].str.contains('base')]
lgb_inter_calc.head()

In [None]:
# The 75 interactions with the highest total gain, 'base' interactions excluded
plot_data = lgb_inter_calc_nobase.sort_values('sum', ascending=False).head(75).reset_index(drop=True)

def plot_feat_interaction(data=None):
    """Draw two side-by-side bar plots of feature interactions.

    `data` needs the columns 'Interactions', 'sum' and 'count'. Both panels list
    the interactions on the y-axis in descending order of 'sum': the left panel
    shows the total gain ('sum'), the right panel the number of occurrences ('count').
    """
    ranked = data.sort_values('sum', ascending=False)
    fig = plt.figure(figsize=(20, 14))

    # Left panel: total gain per interaction
    gain_ax = fig.add_subplot(1, 2, 1)
    sns.barplot(x='sum', y='Interactions', data=ranked, ax=gain_ax)
    gain_ax.set_title('Total Gain for Feature Interaction', fontweight='bold', fontsize=14)

    # Right panel: how often each interaction occurs
    count_ax = fig.add_subplot(1, 2, 2)
    sns.barplot(x='count', y='Interactions', data=ranked, ax=count_ax)
    count_ax.set_title('No. of times Feature interacted', fontweight='bold', fontsize=14)

    plt.tight_layout()

plot_feat_interaction(plot_data)

### Similar to @WTI200, a good feature interaction can be observed between different features. We need to find a sophisticated approach to include this information in the training. For now we will ignore this information and build a model in the next section.

<div style="color:white;display:fill;border-radius:8px;
            background-color:#E59866;font-size:150%;
            font-family:Nexa;letter-spacing:0.5px">
    <p style="padding: 8px;color:black;"><b>3.0 | Modeling</b></p>
</div>

<div class="alert alert-block alert-info" style="font-size:14px; font-family:verdana;">
    📌 Important note: Based on the correlation plot it can be seen that the columns of subset-four (starting with F_4_) are well correlated with each other. The rest of the subsets are weakly correlated.<br>
    The strategy is to use LGBM regression for subset-4 and mean imputation for the rest of the subsets.
</div>

In [None]:
%%time
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score
for i in [4]:
    dummy_df = pd.DataFrame()
    dummy_df2 = pd.DataFrame()
    col_train = pd.DataFrame()
    col_test = pd.DataFrame()
    dummy_df=df[i].copy()
    dummy_df2=df[i].copy()
    for column in dummy_df.columns: 
        print('Processing Colunm Name : ', column)
        if dummy_df[column].isnull().sum() == 0:
            print(dummy_df[column].isnull().sum())
            continue    # continue as no NaN values found in this column
        col_nan_ix = dummy_df[dummy_df[column].isnull()].index  # identify the rows which has NaN in column F_1_0
        col_train = dummy_df.drop(col_nan_ix, axis = 0)  #training set which has F_1_0 fixed value but other columns might have NaN values
        col_test = dummy_df[dummy_df.index.isin(col_nan_ix)] 
        X = col_train.drop([column],axis=1)
        y = col_train[column]
        model = LGBMRegressor(n_estimators=20000,metric='r2')
        model.fit(X,y)
        score=model.score(X, y)
        print('R2 of this column : ', score)
        dummy_df2[column][col_nan_ix] = model.predict(col_test.drop([column],axis=1))
    df[i]=dummy_df2.copy()

In [None]:
# Let's take a quick look at subset-4 after the model-based imputation above
df[4].head()

### Now we use a simple mean imputer for subset-1 and subset-3. Note that subset-2 has no missing values.

In [None]:
from sklearn.impute import SimpleImputer
# Mean strategy: NaN entries are the values treated as missing and replaced.
imp = SimpleImputer(
        missing_values=np.nan,
        strategy='mean') 
for i in [1,3]:    
    # The imputer is re-fitted on each subset; the full-slice assignment writes the
    # imputed values into the existing frame instead of rebinding df[i].
    df[i][:] = imp.fit_transform(df[i])

In [None]:
Merged_Subsets = pd.concat([df[1], df[2], df[3], df[4]], axis=1)
submission = pd.read_csv('../input/tabular-playground-series-jun-2022/sample_submission.csv', index_col='row-col')
# Every submission key has the form '<row>-<column>'. All keys are resolved to positions
# in Merged_Subsets at once, replacing one .loc lookup and one .loc write per row.
keys = [key.split('-') for key in submission.index]
row_pos = Merged_Subsets.index.get_indexer([int(key[0]) for key in keys])
col_pos = Merged_Subsets.columns.get_indexer([key[1] for key in keys])
# get_indexer marks unknown labels with -1; fail loudly, as the per-row .loc lookup did.
if (row_pos < 0).any() or (col_pos < 0).any():
    raise KeyError('submission refers to a row or column that is not present in Merged_Subsets')
# Keep the dtype of the existing 'value' column, as the element-wise assignment did.
submission['value'] = Merged_Subsets.to_numpy()[row_pos, col_pos].astype(submission['value'].dtype)

submission.to_csv('submission.csv')

## The LB is 0.87724, top 2% as of now.

<img src="https://31.media.tumblr.com/tumblr_lptgqw2SE81qj2mh7o1_500.gif">

<div style="color:white;display:fill;border-radius:8px;
            background-color:#E59866;font-size:150%;
            font-family:Nexa;letter-spacing:0.5px">
    <p style="padding: 8px;color:black;"><b>4.0 | References</b></p>
</div>

https://towardsdatascience.com/using-the-missingno-python-library-to-identify-and-visualise-missing-data-prior-to-machine-learning-34c8c5b5f009 <br>
https://towardsdatascience.com/how-to-handle-missing-data-8646b18db0d4 <br>
https://www.kaggle.com/code/parulpandey/a-guide-to-handling-missing-values-in-python/notebook <br>
https://www.kaggle.com/code/residentmario/using-missingno-to-diagnose-data-sparsity/notebook <br>
https://www.analyticsvidhya.com/blog/2021/05/dealing-with-missing-values-in-python-a-complete-guide/ <br>
https://www.kaggle.com/code/calebreigada/getting-started-eda-preprocessing <br>
https://medium.com/swlh/impute-missing-values-the-right-way-c63735fccccd <br>

