In [None]:
!pip -q --disable-pip-version-check install mplcyberpunk

# <p style="background-color:#f3ab60;font-family:newtimeroman;color:#662e2e;font-size:130%;text-align:center;border-radius:40px 40px;">UBIQUANT MARKET PREDICTION</p>

<h1 align='center'>Introduction 📝</h1>
The aim of this competition is to predict an obfuscated (anonymized) metric that is relevant for making trading decisions, using features derived from real historic data from thousands of investments. This notebook covers most of the steps and methods that are helpful in the competition.

<h1 align='center'>Dataset Info 📈</h1>
<b>Columns of the train data-</b> 

* ```row_id``` - A unique identifier for the row.
* ```time_id``` - The ID code for the time the data was gathered. The time IDs are in order, but the real time between the time IDs is not constant and will likely be shorter for the final private test set than in the training set.
* ```investment_id``` - The ID code for an investment. Not all investments have data in all time IDs.
* ```target``` - The target.
* ```[f_0:f_299]``` - Anonymized features generated from market data.

<h1 align='center'>Evaluation Metric 📐</h1>
Submissions are evaluated on the mean of the Pearson correlation coefficient for each time ID.

<img src='https://user-images.githubusercontent.com/55939250/151697692-562f6439-170a-4869-856d-eaa11b2da5f5.jpg' width=500px>

where,<br> 
* r = Pearson Correlation Coefficient
* n = Number of samples
* x = First variable samples
* y = Second variable samples

# <p style="background-color:#f3ab60;font-family:newtimeroman;color:#662e2e;font-size:130%;text-align:center;border-radius:40px 40px;">TABLE OF CONTENTS</p>
<ul style="list-style-type:square">
    <li><a href="#1">Importing Libraries</a></li>
    <li><a href="#2">Reading the data</a></li>
    <li><a href="#3">Exploratory Data Analysis</a></li>
    <ul style="list-style-type:disc">
        <li><a href="#3.1">Investment_ID Distribution</a></li>
        <li><a href="#3.2">Time_ID Distribution</a></li>
        <li><a href="#3.3">Target Distribution</a></li>
        <li><a href="#3.4">Time_id Categorized</a></li>
        <li><a href="#3.5">Features Distribution</a></li>
    </ul>
    <li><a href="#4">Baseline Model</a></li>
    <li><a href="#5">Submission</a></li>
</ul>



<a id='1'></a>
# <p style="background-color:#f3ab60;font-family:newtimeroman;color:#662e2e;font-size:130%;text-align:center;border-radius:40px 40px;">IMPORTING LIBRARIES</p>

In [None]:
import os
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import mplcyberpunk
plt.style.use('cyberpunk')

import warnings
warnings.filterwarnings('ignore')

<a id='2'></a>
# <p style="background-color:#f3ab60;font-family:newtimeroman;color:#662e2e;font-size:130%;text-align:center;border-radius:40px 40px;">READING THE DATA</p>

In [None]:
%%time
# Read the training parquet file into `df`; the %%time cell magic above reports
# how long the load takes. The path is relative to the notebook's working directory.
df = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Summarise the frame: row count, column dtypes and memory usage.
df.info()

<a id='3'></a>
# <p style="background-color:#f3ab60;font-family:newtimeroman;color:#662e2e;font-size:130%;text-align:center;border-radius:40px 40px;">EXPLORE THE DATA</p>

<a id='3.1'></a>
# Investment_ID Distribution
### First of all we will look at the count of samples in each investment id. 

In [None]:
plt.figure(figsize=(18, 7))

# One row per investment with the number of training rows it appears in.
df_temp = (
    df.groupby("investment_id")['row_id']
    .count()
    .reset_index()
    .rename(columns={'row_id': 'Sample_Count'})
)
sns.histplot(x=df_temp['investment_id'], bins=50)
plt.xlabel('Investment_id')
plt.ylabel('Count')
plt.title('Sample count of Investment_ID Distribution')

# idxmin/idxmax return the first matching id, so ties (several investments
# sharing the smallest/largest count) no longer break the int() conversion
# that the boolean-mask lookup relied on.
sample_counts = df_temp.set_index('investment_id')['Sample_Count']
least_id = int(sample_counts.idxmin())
max_id = int(sample_counts.idxmax())

print(f"Number of unique investments - {len(df_temp)}")
print(f"Investment id with least number of samples - {least_id}, Count - {int(sample_counts.min())}")
print(f"Investment id with maximum number of samples - {max_id}, Count - {int(sample_counts.max())}")

plt.show()

<a id='3.2'></a>
# Time_ID Distribution
### Now let us look at the count of samples in each time id. 

In [None]:
fig, ax = plt.subplots(2, 1, figsize=(17, 10))

# Select the needed columns by name rather than by position.
df_temp = df[['time_id', 'investment_id']]
# Number of rows (i.e. investments present) at each time_id.
df_temp2 = (
    df_temp.groupby('time_id')['investment_id']
    .count()
    .reset_index()
    .rename(columns={'investment_id': 'Sample_Count'})
)

# histplot(kde=True, stat='density') is the supported replacement for the
# deprecated sns.distplot.
sns.histplot(x=df_temp2["Sample_Count"], kde=True, stat='density', ax=ax[0])
sns.scatterplot(x=df_temp2["time_id"], y=df_temp2["Sample_Count"], ax=ax[1])

ax[0].set_xlabel('Sample_Count')
ax[1].set_xlabel('Time_ID')

# idxmin/idxmax pick the first matching time_id, so ties on the extreme count
# do not break the int() conversion.
sample_counts = df_temp2.set_index('time_id')['Sample_Count']
least_id = int(sample_counts.idxmin())
max_id = int(sample_counts.idxmax())

print(f"Number of unique time_ids - {len(df_temp2)}")
print(f"Time_id with least number of samples - {least_id}, Count - {int(sample_counts.min())}")
print(f"Time_id with maximum number of samples - {max_id}, Count - {int(sample_counts.max())}")

plt.show()

<a id='3.3'></a>
# Target Distribution
### Then let's analyze the target distribution.

In [None]:
plt.figure(figsize=(15, 7))

# histplot(kde=True, stat='density') is the supported replacement for the
# deprecated sns.distplot.
sns.histplot(df['target'], kde=True, stat='density')
plt.title("Target distribution")

# Compute the summary statistics once instead of calling describe() per line.
target_stats = df['target'].describe()
print("Mean of target - ", target_stats['mean'])
print("Minimum value of target - ", target_stats['min'])
print("Maximum value of target - ", target_stats['max'])

plt.show()

### The target looks approximately normally distributed, with a mean of -0.0210915. Let us also look at the most skewed target distributions, grouped by investment_id and by time_id respectively.

In [None]:
fig, ax = plt.subplots(2, 2, figsize=(20, 10))

# Row 0: grouped by investment_id, row 1: grouped by time_id.
# Left column: group with the largest (most positive) skew,
# right column: group with the smallest (most negative) skew.
skew_groupings = [('investment_id', 'Investment_IDs'), ('time_id', 'Time_IDs')]

for row, (key, label) in enumerate(skew_groupings):
    # Skew of the target only; selecting the column before aggregating avoids
    # running skew() over every other column (including the non-numeric row_id).
    target_skew = df.groupby(key)['target'].skew()

    # idxmax/idxmin skip NaN skews and return a single id even when several
    # groups tie, so the int() conversion cannot fail on a multi-row result.
    most_positive = int(target_skew.idxmax())
    most_negative = int(target_skew.idxmin())

    for col, group_id in enumerate((most_positive, most_negative)):
        group_target = df.loc[df[key] == group_id, 'target']
        # histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
        sns.histplot(group_target, kde=True, stat='density', ax=ax[row, col])
        ax[row, col].set_title(f"{key} = {group_id}")

    print(f"{label} with most skewed target are - {most_positive}, {most_negative}")

plt.show()

<a id='3.4'></a>
# Time_id Categorized
### I decided to group time_id into 5 categories according to their range, i.e., {'0-250', '251-500', '501-750', '751-1000', '1000+'}, to better understand how the data changes over time. We will check the sample count and the number of missing time_ids per category, the target distribution across each time_id range and, later on, the feature distributions.

In [None]:
def cate_time(time):
    """Return the label of the time_id bucket that `time` belongs to.

    Buckets are '0-250', '251-500', '501-750' and '751-1000' (bounds
    inclusive). Any value that is not a member of one of these integer
    ranges is labelled '1000+'.
    """
    buckets = (
        (range(0, 251), '0-250'),
        (range(251, 501), '251-500'),
        (range(501, 751), '501-750'),
        (range(751, 1001), '751-1000'),
    )
    for span, label in buckets:
        if time in span:
            return label
    # Fallback for everything outside the four bounded ranges.
    return '1000+'

# Keep only the first four columns of the training frame.
df_temp = df.iloc[:, 0:4]

# Rows per time_id, then tag every time_id with its bucket label.
df_temp2 = df_temp.groupby('time_id')['investment_id'].count()
df_temp2 = df_temp2.reset_index()
df_temp2 = df_temp2.rename(columns={'investment_id': 'Sample_Count'})
df_temp2['time_cat'] = df_temp2['time_id'].apply(cate_time)

In [None]:
# Preview the per-time_id sample counts together with their bucket label.
df_temp2.head()

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20, 7))
fig.suptitle('Time Categorized Distribution', size = 20, weight='bold')

# --- Left panel: share of training rows per time_id bucket (donut chart) ---
labels = list(df_temp2['time_cat'].unique())
sizes = [
    df_temp2.loc[df_temp2['time_cat'] == label, 'Sample_Count'].sum()
    for label in labels
]
# One offset per wedge, derived from the data so the chart still renders when
# fewer (or more) than five buckets are present.
explode = (0.05,) * len(labels)
colors = ['#FF2281', '#FF6600', '#13CA91', '#099FFF', '#CC00FF']
ax[0].pie(sizes, colors=colors, explode=explode, startangle=90, labels=labels,
        autopct='%1.0f%%', pctdistance=0.7,textprops={'fontsize':12}, counterclock=False)
# A filled circle in the middle turns the pie into a donut.
centre_circle = plt.Circle((0,0),0.5,fc='#212946')
ax[0].add_artist(centre_circle)
ax[0].axis('equal')
ax[0].set_title("Sample Count Distribution", size = 15)

# --- Right panel: number of absent time_ids per bucket ---
missing = {'0-250':0, '251-500':0, '501-750':0, '751-1000':0, '1000+':0}
prev = 0

# Walk the observed time_ids in order; every integer strictly between two
# consecutive observed ids is a missing time_id. cate_time() is reused so the
# bucket boundaries are defined in a single place.
for i in df_temp2['time_id'].values:
    for j in range(int(prev) + 1, int(i)):
        missing[cate_time(j)] += 1
    prev = i
sns.barplot(x=list(missing.keys()), y=list(missing.values()), ax=ax[1])
ax[1].set_title("Missing value count in each time category", size = 15)
plt.show()

# --- Target distribution per bucket ---
plt.figure(figsize=(20, 10))
# assign() builds a new frame instead of writing a column into a slice of `df`.
df_temp = df_temp.assign(time_cat=df_temp['time_id'].apply(cate_time))
sns.boxplot(y=df_temp['target'], x=df_temp['time_cat'])
plt.title("Target Distribution in each time category", size = 15)
plt.show()

<a id='3.5'></a>
# Features Distribution
### Now we will focus on the features. There are a total of 300 anonymized features generated from market data. First we will look at the distributions of a few features.

In [None]:
features = ['f_0', 'f_1', 'f_2', 'f_3', 'f_4', 'f_5']

fig, ax = plt.subplots(2,3,figsize=(20,10))

# One violin per feature, filling the 2x3 grid row by row. Pairing each axis
# with its feature directly guarantees every listed feature is drawn exactly
# once (the previous index arithmetic skipped f_0 and drew f_3 twice).
for axis, feature in zip(ax.ravel(), features):
    sns.violinplot(y=df[feature], ax=axis)

plt.show()

In [None]:
features = ['f_100', 'f_101', 'f_102', 'f_103', 'f_104', 'f_105']

fig, ax = plt.subplots(2,3,figsize=(20,10))

# Scatter each listed feature against the target, one panel per feature,
# filling the grid row by row.
for axis, feature in zip(ax.ravel(), features):
    sns.scatterplot(x=df[feature], y=df['target'], ax=axis)

plt.show()

### Next we will calculate correlation of each feature with the target and will then plot the distribution of most correlated and least correlated features.

In [None]:
# Absolute correlation between the target and each feature f_0 .. f_299.
cor = {
    f'f_{i}': abs(df[['target', f'f_{i}']].corr().iloc[0, 1])
    for i in range(300)
}

# Rebuild the dict ordered by ascending |correlation| (dicts keep insertion
# order): weakest features first, strongest last.
cor = dict(sorted(cor.items(), key=lambda item: item[1]))

In [None]:
# Highest correlation
fig, ax = plt.subplots(2,3,figsize=(20,10))

# Take the last six entries of `cor`, strongest first, and give each its own
# panel (grid filled row by row).
strongest = list(cor.items())[::-1][:6]
for axis, (name, value) in zip(ax.ravel(), strongest):
    print(f'target & {name} Correlation is {value}')
    sns.scatterplot(x=df[name], y=df['target'], ax=axis)

plt.suptitle("Distribution of most correlated features with target", fontsize=15)
plt.show()

In [None]:
# Lowest correlation
fig, ax = plt.subplots(2,3,figsize=(20,10))

# Take the first six entries of `cor` and give each its own panel
# (grid filled row by row).
weakest = list(cor.items())[:6]
for axis, (name, value) in zip(ax.ravel(), weakest):
    print(f'target & {name} Correlation is {value}')
    sns.scatterplot(x=df[name], y=df['target'], ax=axis)

plt.suptitle("Distribution of least correlated features with target", fontsize=15)
plt.show()

### Now we will see which features matter most within each time_id range. This can help in analysing the features and how their relationship with the target changes over time.

In [None]:
# Every column except the first one, plus a bucket label derived from time_id.
df_temp = df.iloc[:, 1:]
df_temp['time_cat'] = df_temp['time_id'].apply(cate_time)

### Time_id range - 0-250

In [None]:
# Rows whose time_id bucket is '0-250'.
df_temp2 = df_temp[df_temp['time_cat']=='0-250']

# Absolute correlation between the target and each feature within this bucket.
cor = {
    f'f_{i}': abs(df_temp2[['target', f'f_{i}']].corr().iloc[0, 1])
    for i in range(300)
}

# Ascending by |correlation|: weakest features first, strongest last.
cor = dict(sorted(cor.items(), key=lambda item: item[1]))

In [None]:
# Highest correlation
fig, ax = plt.subplots(2,3,figsize=(20,10))

# Last six entries of `cor`, strongest first, one panel each (row by row).
strongest = list(cor.items())[::-1][:6]
for axis, (name, value) in zip(ax.ravel(), strongest):
    print(f'target & {name} Correlation is {value}')
    sns.scatterplot(x=df_temp2[name], y=df_temp2['target'], ax=axis, color=colors[0])

plt.suptitle("Distribution of most correlated features with target of time_id between 0-250", fontsize=15)
plt.show()

In [None]:
# Lowest correlation
fig, ax = plt.subplots(2,3,figsize=(20,10))

# First six entries of `cor`, one panel each (row by row).
weakest = list(cor.items())[:6]
for axis, (name, value) in zip(ax.ravel(), weakest):
    print(f'target & {name} Correlation is {value}')
    sns.scatterplot(x=df_temp2[name], y=df_temp2['target'], ax=axis, color=colors[0])

plt.suptitle("Distribution of least correlated features with target of time_id between 0-250", fontsize=15)
plt.show()

### Time_id range - 251-500

In [None]:
# Rows whose time_id bucket is '251-500'.
df_temp2 = df_temp[df_temp['time_cat']=='251-500']

# Absolute correlation between the target and each feature within this bucket.
cor = {
    f'f_{i}': abs(df_temp2[['target', f'f_{i}']].corr().iloc[0, 1])
    for i in range(300)
}

# Ascending by |correlation|: weakest features first, strongest last.
cor = dict(sorted(cor.items(), key=lambda item: item[1]))

In [None]:
# Highest correlation
fig, ax = plt.subplots(2,3,figsize=(20,10))

# Last six entries of `cor`, strongest first, one panel each (row by row).
strongest = list(cor.items())[::-1][:6]
for axis, (name, value) in zip(ax.ravel(), strongest):
    print(f'target & {name} Correlation is {value}')
    sns.scatterplot(x=df_temp2[name], y=df_temp2['target'], ax=axis, color=colors[1])

plt.suptitle("Distribution of most correlated features with target of time_id between 251-500", fontsize=15)
plt.show()

In [None]:
# Lowest correlation
fig, ax = plt.subplots(2,3,figsize=(20,10))

# First six entries of `cor`, one panel each (row by row).
weakest = list(cor.items())[:6]
for axis, (name, value) in zip(ax.ravel(), weakest):
    print(f'target & {name} Correlation is {value}')
    sns.scatterplot(x=df_temp2[name], y=df_temp2['target'], ax=axis, color=colors[1])

plt.suptitle("Distribution of least correlated features with target of time_id between 251-500", fontsize=15)
plt.show()

### Time_id range - 501-750

In [None]:
# Rows whose time_id bucket is '501-750'.
df_temp2 = df_temp[df_temp['time_cat']=='501-750']

# Absolute correlation between the target and each feature within this bucket.
cor = {
    f'f_{i}': abs(df_temp2[['target', f'f_{i}']].corr().iloc[0, 1])
    for i in range(300)
}

# Ascending by |correlation|: weakest features first, strongest last.
cor = dict(sorted(cor.items(), key=lambda item: item[1]))

In [None]:
# Highest correlation
fig, ax = plt.subplots(2,3,figsize=(20,10))

# Last six entries of `cor`, strongest first, one panel each (row by row).
strongest = list(cor.items())[::-1][:6]
for axis, (name, value) in zip(ax.ravel(), strongest):
    print(f'target & {name} Correlation is {value}')
    sns.scatterplot(x=df_temp2[name], y=df_temp2['target'], ax=axis, color=colors[2])

plt.suptitle("Distribution of most correlated features with target of time_id between 501-750", fontsize=15)
plt.show()

In [None]:
# Lowest correlation
fig, ax = plt.subplots(2,3,figsize=(20,10))

# First six entries of `cor`, one panel each (row by row).
weakest = list(cor.items())[:6]
for axis, (name, value) in zip(ax.ravel(), weakest):
    print(f'target & {name} Correlation is {value}')
    sns.scatterplot(x=df_temp2[name], y=df_temp2['target'], ax=axis, color=colors[2])

plt.suptitle("Distribution of least correlated features with target of time_id between 501-750", fontsize=15)
plt.show()

### Time_id range - 751-1000

In [None]:
# Rows whose time_id bucket is '751-1000'.
df_temp2 = df_temp[df_temp['time_cat']=='751-1000']

# Absolute correlation between the target and each feature within this bucket.
cor = {
    f'f_{i}': abs(df_temp2[['target', f'f_{i}']].corr().iloc[0, 1])
    for i in range(300)
}

# Ascending by |correlation|: weakest features first, strongest last.
cor = dict(sorted(cor.items(), key=lambda item: item[1]))

In [None]:
# Highest correlation
fig, ax = plt.subplots(2,3,figsize=(20,10))

# Last six entries of `cor`, strongest first, one panel each (row by row).
strongest = list(cor.items())[::-1][:6]
for axis, (name, value) in zip(ax.ravel(), strongest):
    print(f'target & {name} Correlation is {value}')
    sns.scatterplot(x=df_temp2[name], y=df_temp2['target'], ax=axis, color=colors[3])

plt.suptitle("Distribution of most correlated features with target of time_id between 751-1000", fontsize=15)
plt.show()

In [None]:
# Lowest correlation
fig, ax = plt.subplots(2,3,figsize=(20,10))

# First six entries of `cor`, one panel each (row by row).
weakest = list(cor.items())[:6]
for axis, (name, value) in zip(ax.ravel(), weakest):
    print(f'target & {name} Correlation is {value}')
    sns.scatterplot(x=df_temp2[name], y=df_temp2['target'], ax=axis, color=colors[3])

plt.suptitle("Distribution of least correlated features with target of time_id between 751-1000", fontsize=15)
plt.show()

### Time_id range - 1000+

In [None]:
# Rows whose time_id bucket is '1000+'.
df_temp2 = df_temp[df_temp['time_cat']=='1000+']

# Absolute correlation between the target and each feature within this bucket.
cor = {
    f'f_{i}': abs(df_temp2[['target', f'f_{i}']].corr().iloc[0, 1])
    for i in range(300)
}

# Ascending by |correlation|: weakest features first, strongest last.
cor = dict(sorted(cor.items(), key=lambda item: item[1]))

In [None]:
# Highest correlation
fig, ax = plt.subplots(2,3,figsize=(20,10))

# Last six entries of `cor`, strongest first, one panel each (row by row).
strongest = list(cor.items())[::-1][:6]
for axis, (name, value) in zip(ax.ravel(), strongest):
    print(f'target & {name} Correlation is {value}')
    sns.scatterplot(x=df_temp2[name], y=df_temp2['target'], ax=axis, color=colors[4])

plt.suptitle("Distribution of most correlated features with target of time_id 1000+", fontsize=15)
plt.show()

In [None]:
# Lowest correlation
fig, ax = plt.subplots(2,3,figsize=(20,10))

# First six entries of `cor`, one panel each (row by row).
weakest = list(cor.items())[:6]
for axis, (name, value) in zip(ax.ravel(), weakest):
    print(f'target & {name} Correlation is {value}')
    sns.scatterplot(x=df_temp2[name], y=df_temp2['target'], ax=axis, color=colors[4])

plt.suptitle("Distribution of least correlated features with target of time_id 1000+", fontsize=15)
plt.show()

<a id='4'></a>
# <p style="background-color:#f3ab60;font-family:newtimeroman;color:#662e2e;font-size:130%;text-align:center;border-radius:40px 40px;">BASELINE MODEL</p>

In [None]:
# Memory is tight: clear the whole interactive namespace (-f skips the
# confirmation prompt) before modelling. Everything needed from here on,
# including imports and the data, is re-created in the cells below.
%reset -f

In [None]:
import gc
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.model_selection import train_test_split

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

sns.set(rc={"axes.facecolor":"#212946","figure.facecolor":"#ffffff"})

In [None]:
# Re-load the training data: the namespace (including the earlier `df`) was
# cleared by the %reset above.
df = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')

In [None]:
# Model inputs: every column except the row identifier, the time_id and the label.
X = df.drop(columns=['row_id', 'time_id', 'target'])
y = df["target"]

# shuffle=False keeps the original row order, so the final 10% of the rows
# form the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=False,
    random_state=10,
)

# Drop the names of the full frame and the unsplit arrays, then ask the
# garbage collector to reclaim what it can.
del df, X, y
gc.collect()

In [None]:
# Small, heavily regularised gradient-boosted trees baseline.
model = LGBMRegressor(
        num_leaves=6,
        learning_rate = 0.05,
        n_estimators = 1000,
        min_child_samples = 1000,
        subsample=0.5,
        # LightGBM only applies row subsampling (bagging) when the bagging
        # frequency is non-zero; without it `subsample=0.5` is silently ignored.
        subsample_freq=1,
        metric="rmse"
    )

In [None]:
# Train with early stopping on the validation set. The `early_stopping_rounds`
# and `verbose` keyword arguments of fit() are deprecated (and removed in
# LightGBM 4.0); the equivalent callbacks are used instead:
#   - stop when the validation metric has not improved for 6 rounds
#   - log the validation metric every 10 rounds
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping(stopping_rounds=6), log_evaluation(period=10)],
)

## SHAP (SHapley Additive exPlanations)
### SHAP values are floating-point numbers, one per feature for each row of data. A SHAP value represents the contribution of that feature value to the model's prediction for that row. If the SHAP value is close to zero, the feature value contributes very little to the prediction. If the SHAP value is strongly positive or strongly negative, the feature value contributes greatly to the prediction.

In [None]:
# Build a tree-model explainer around the fitted LightGBM model.
explainer = shap.TreeExplainer(model)

# Compute SHAP values for the validation rows; the plotting cells below
# consume `shap_values` together with `X_val`.
shap_values = explainer.shap_values(X_val)

### We will visualize summary plot to get the idea which feature mattered the most to the model.

In [None]:
# Default summary plot of the validation-set SHAP values on a 20x20 figure.
shap.summary_plot(shap_values, X_val, plot_size=(20, 20))

<b>Note to study summary plot </b> - Horizontal location shows whether the effect of that value caused a higher or lower prediction and color shows whether that feature was high or low for that row of the dataset

### We can also plot a simpler summary bar plot by giving argument plot_type='bar'.

In [None]:
# Same SHAP values as a bar-type summary plot, drawn in a single colour.
shap.summary_plot(shap_values, X_val, plot_type='bar', plot_size=(20, 15), color='#ff0090')

### Lastly, we will look at dependence plots. Dependence plots can be of great use when analyzing feature importance and doing feature selection. A dependence plot shows the SHAP values of the chosen feature and colors each dot according to the value of another feature.<br> Here I will plot dependence plots for the first 9 features.

In [None]:
fig, ax = plt.subplots(3, 3, figsize=(20, 20))
ax = ax.ravel()

# zip() stops once the nine axes are used up, so only the first nine columns
# of X_val get a dependence plot. show=False defers rendering until every
# panel has been drawn.
for axis, x in zip(ax, X_val.columns):
    shap.dependence_plot(x, shap_values, X_val, ax=axis, show=False)

plt.tight_layout()
plt.show()

<a id='5'></a>
# <p style="background-color:#f3ab60;font-family:newtimeroman;color:#662e2e;font-size:130%;text-align:center;border-radius:40px 40px;">SUBMISSION</p>

In [None]:
import ubiquant

# Create the competition environment and its test-batch iterator.
env = ubiquant.make_env()
iter_test = env.iter_test()

# For each batch: predict with the fitted model and hand the filled-in
# prediction frame back to the environment.
for test_df, sample_prediction_df in iter_test:
    # The model is fed the batch without the row_id column.
    test_df.drop(columns=['row_id'], inplace=True)
    sample_prediction_df['target'] = model.predict(test_df)
    env.predict(sample_prediction_df)

## References
* https://www.kaggle.com/robikscube/fast-data-loading-and-low-mem-with-parquet-files
* https://www.kaggle.com/miingkang/check-correlation-baseline-lgbm
* https://www.kaggle.com/edwardcrookenden/eda-and-lgbm-baseline-feature-imp
* https://shap.readthedocs.io/en/latest/index.html

<div class="alert alert-block alert-info">
    <h2 align='center'>THANK YOU!!</h2>
    <h3 align='center'>Please consider upvoting the kernel if you found it useful.</h3>
</div>