# **<center style="font-family:poppins;color:#0456f3;">1. Import Packages 📦</center>**

In [None]:
import random
random.seed(10)
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

## plotting theme applied to every matplotlib figure drawn in this notebook
plt.style.use("fivethirtyeight")

# never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

In [None]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train_df = pd.read_csv('../input/tabular-playground-series-oct-2021/train.csv')
test_df = pd.read_csv('../input/tabular-playground-series-oct-2021/test.csv')

# **<center style="font-family:poppins;color:#0456f3;">2. 📉 Reduce Memory Usage</center>**

This dataset is very large and consumes a lot of memory on Kaggle. Training a model on it as loaded would exceed the 13 GB of RAM allocated to a Kaggle notebook, so the data's memory footprint has to be reduced first.

In [None]:
# function to reduce the memory usage
def reduce_mem_usage(train_data):
    """Downcast each column of ``train_data`` to the smallest dtype that can hold it.

    * Integer columns (signed or unsigned) become the narrowest of
      int8 / int16 / int32 / int64 whose range contains the column's min and max.
    * Float columns become float16 or float32 when their range fits, else float64.
    * Object and pandas string columns become ``category``.
    * Every other dtype (bool, datetime, category, nullable extension types, ...)
      is left untouched.

    The frame is modified in place and also returned. A short report of the
    final size and the relative saving is printed.

    Note: the float downcast is chosen from the value *range* only. float16
    keeps roughly three significant decimal digits, so values are rounded.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Measured inside the function so the report never depends on a variable
    # that some other notebook cell may or may not have defined.
    start_mem = train_data.memory_usage().sum() / 1024**2

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in train_data.columns:
        col_type = train_data[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            train_data[col] = train_data[col].astype('category')
            continue

        # Only plain NumPy integer / float columns have a meaningful numeric
        # range to downcast on; skip everything else instead of coercing it.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = train_data[col].min()
        c_max = train_data[col].max()

        if col_type.kind in 'iu':
            # Bounds are inclusive: a value equal to the dtype limit still fits.
            # If nothing fits (e.g. huge uint64, or an empty column whose
            # min/max is NaN) the column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    train_data[col] = train_data[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    train_data[col] = train_data[col].astype(candidate)
                    break
            else:
                # Range does not fit float32 (or min/max is NaN / infinite).
                train_data[col] = train_data[col].astype(np.float64)

    end_mem = train_data.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return train_data

In [None]:
## memory usage of train_df before memory usage reduction
# memory_usage() gives bytes per column (plus the index); dividing by 1024**2 converts to mebibytes.
start_mem = train_df.memory_usage().sum() / 1024**2
print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

In [None]:
# Downcast train_df and rebind the name to the frame returned by reduce_mem_usage.
train_df = reduce_mem_usage(train_df)

# **<center style="font-family:poppins;color:#0456f3;">3. Basic Information ℹ️ about Data</center>**

In [None]:
# Print the column dtypes, non-null counts and memory footprint of train_df.
train_df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train_df.describe()

In [None]:
# Preview the first five rows.
train_df.head()

# **<center style="font-family:poppins;color:#0456f3;">4. EDA 🔍</center>**

## **<center style="font-family:poppins;color:#0456f3;">4.1 Missing Values</center>**

In [None]:
# total number of NaN cells over every column of the training frame
train_missing = train_df.isna().to_numpy().sum()
print('Missing train data : {:d}'.format(train_missing))

In [None]:
# total number of NaN cells over every column of the test frame
test_missing = test_df.isna().to_numpy().sum()
print('Missing test data : {:d}'.format(test_missing))

Neither train_df nor test_df has any missing values, so there is no 🙅‍♂️ need to worry about data imputation.

In [None]:
# First five rows again, to eyeball which columns hold binary versus continuous values.
train_df.head()

From the above table we can see some features are continuous real numbers and  others are binary.

In [None]:
## Model inputs: every column except the label ('target') and the row identifier ('id').
## The id is only a unique key, so it must not be used for model training.
non_feature_cols = ('target', 'id')
features = [name for name in train_df.columns if name not in non_feature_cols]

# **<center style="font-family:poppins;color:#0456f3;">5. Feature Distribution</center>**

In [None]:
%%time
# Split the features by the dtype they have in train_df:
# int8 columns are treated as categorical, float16 columns as continuous.
feature_dtypes = train_df.dtypes
cat_features = [name for name in features if feature_dtypes[name] == 'int8']
cont_features = [name for name in features if feature_dtypes[name] == 'float16']
print('features obtained')

# Bar chart comparing how many features landed in each group.
bar_positions = [1, 2]
plt.bar(bar_positions, [len(cat_features), len(cont_features)])
plt.xticks(bar_positions, ('Categorical', 'Continuous'))
plt.show()

In [None]:
# Report the size of each feature group built in the previous cell.
print('Categorical Features : {:d}'.format(len(cat_features)))
print('Continuous Features : {:d}'.format(len(cont_features)))

## **<center style="font-family:poppins;color:#0456f3;">5.1 Distribution of Categorical Data</center>**

In [None]:
# Histogram of every categorical feature. All of cat_features is plotted:
# a fixed positional slice would silently omit columns whenever the list is
# longer than the slice.
train_df[cat_features].hist(figsize=(32, 32), sharey=True)
plt.show()

## **<center style="font-family:poppins;color:#0456f3;">5.2 Distribution of Continuous Data</center>**

In [None]:
# Continuous features, first chunk: positional columns 0-78 of cont_features,
# drawn on a shared y-axis.
train_df[cont_features].iloc[:,:79].hist(figsize=(32, 32),sharey=True)
plt.tight_layout()
plt.show()

In [None]:
# Continuous features, second chunk: positional columns 79-158 of cont_features
# (the slice end, 159, is exclusive).
train_df[cont_features].iloc[:,79:159].hist(figsize=(32, 32),sharey=True)
plt.tight_layout()
plt.show()

In [None]:
# Continuous features, last chunk: positional column 159 onward.
# The start index is 159 because the preceding 79:159 slice is end-exclusive;
# starting at 160 would leave column 159 out of every plot.
train_df[cont_features].iloc[:, 159:].hist(figsize=(32, 32), sharey=True)
plt.tight_layout()
plt.show()

# **<center style="font-family:poppins;color:#0456f3;">6. Distribution of the Target 🎯</center>**

In [None]:
# Share of rows per target class. value_counts() orders the classes by
# frequency (most common first), not by class label.
target_df = train_df.target.value_counts() / len(train_df)
labels = ['1','0']  # NOTE(review): not referenced anywhere else in the visible notebook

target_df

In [None]:
# Bar heights and tick labels are both taken from target_df, so every bar is
# labelled with the class it actually represents, whatever order
# value_counts() returned the classes in.
bar_positions = list(range(1, len(target_df) + 1))
plt.bar(bar_positions, target_df.values)
plt.xticks(bar_positions, target_df.index.astype(str))
plt.xlabel('Target class')
plt.ylabel('Proportion of rows')
plt.show()

The two target classes **[0,1]** are almost equally represented.

# **<center style="font-family:poppins;color:#0456f3;">7. Training and Validation Data Set Split</center>**

In [None]:
## Separate the label from the model inputs before creating the validation set;
## 'id' is dropped because it is only a row identifier
y = train_df['target'].copy()
X = train_df.drop(['target', 'id'], axis=1)

In [None]:
# stratify - making sure classes are evenly represented across train and validation set
# random_state pins the shuffle so the same rows land in each split on every run;
# random.seed() does not help here because scikit-learn draws from NumPy's RNG.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, stratify=y, test_size=0.1, random_state=10
)
input_shape = [X_train.shape[1]]  # number of feature columns; not used by the LightGBM cells below

# **<center style="font-family:poppins;color:#0456f3;">8. Define Light GBM Model</center>**

Here, I train the LightGBM model on the ***X_train*** data, evaluate it on the validation set, and use ***early stopping*** with a patience of 100 rounds.

In [None]:
from lightgbm import early_stopping

# Held-out rows used to monitor binary log-loss while boosting.
eval_set = [(X_valid[features], y_valid)]
lgbm_model = LGBMClassifier(objective="binary")
print('LGBM parameters:\n',lgbm_model.get_params())

# Fit on the training split only. Fitting on the full X / y would include the
# validation rows in training, so the validation metric (and therefore early
# stopping) would be measured on data the model has already seen.
# Early stopping is passed as a callback, which replaces the
# early_stopping_rounds fit argument that LightGBM 4 removed.
# NOTE(review): with the default n_estimators (100) a patience of 100 rounds
# can never stop training early; raise n_estimators for it to take effect.
lgbm_model.fit(X_train[features], y_train,
               eval_set=eval_set,
               eval_metric="binary_logloss",
               callbacks=[early_stopping(stopping_rounds=100)])

# **<center style="font-family:poppins;color:#0456f3;">9. Feature Importance</center>**

Let's visualize the importance of features

In [None]:
import seaborn as sns

# Pair each importance score with its feature name (ascending by score, then name),
# then rank from most to least important for plotting.
importance_pairs = sorted(zip(lgbm_model.feature_importances_, X_train[features].columns))
feature_imp = pd.DataFrame(importance_pairs, columns=['Value', 'Feature'])
ranked_imp = feature_imp.sort_values(by="Value", ascending=False)

# Tall figure: one horizontal bar per feature.
plt.figure(figsize=(16, 44), tight_layout=True)
sns.barplot(x="Value", y="Feature", data=ranked_imp)
plt.title('LightGBM Features')
plt.tight_layout()
plt.show()

**f58** and **f69** are the features with the highest importance, and a large number of features have no importance in this model.

In [None]:
# Column 1 of predict_proba is the probability of the second entry of
# lgbm_model.classes_, which is used as the predicted target score.
positive_proba = lgbm_model.predict_proba(test_df[features])[:, 1]

# Submission layout: id first, then the predicted probability.
sub = pd.DataFrame({'id': test_df["id"], 'target': positive_proba})
sub.head()

In [None]:
## saving the submission file: the header row is written by default, the DataFrame index is omitted
sub.to_csv('./submission_01.csv', index=False)


# **<center style="font-family:poppins;color:#0456f3;">10. Conclusion 📔</center>**



1. This data is big  with  ***1 million rows*** and ***287*** columns.
2. The target is ***highly balanced***.
3. There are ***no missing values***  in train and test data.
4. There are 45 categorical features and 240 continuous features. 
5. Didn't do ***feature engineering*** for my first run.
6. Trained a LightGBM model with no hyperparameter tuning for the first run.
7. ***f58*** has the highest feature importance.
8. A large number of columns have no importance in this model.
9. Got Score of ***0.84842***.

## **<center style="font-family:poppins;color:#0456f3;">10.1 Future Plans 📅</center>**

1. Will use the feature importances plot for selection of the features  for  future runs.
2. Do feature engineering to improve the performance of model.
3. Try XGBoost, CatBoost and neural networks and compare the performance of the models.


## **<center style="font-family:poppins;color:#0456f3;">10.2 References</center>**

1. https://www.kaggle.com/dwin183287/tps-september-2021-eda
2. https://www.kaggle.com/subinium/tps-oct-simple-eda
3. https://www.kaggle.com/questions-and-answers/148011


<center style="font-family:cursive; font-size:18px; color:#0456f3;">Thank you 🙏 for reading. If you have any feedback or find anything wrong, please let me know. I hope you enjoyed it. Happy Learning 🙋‍♂️</center>