In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 🚀 Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
pd.set_option('display.max_columns', None)

# 🚀 Loading Train and Test dataset

## 🛰️ Loading train dataset

In [None]:
# Read train.csv into a DataFrame and preview its first five rows.
train_csv_path = '../input/tabular-playground-series-may-2021/train.csv'
train = pd.read_csv(filepath_or_buffer=train_csv_path)
train.head(n=5)

## 🛰️ Loading test dataset

In [None]:
# Read test.csv into a DataFrame and preview its first five rows.
test_csv_path = '../input/tabular-playground-series-may-2021/test.csv'
test = pd.read_csv(filepath_or_buffer=test_csv_path)
test.head(n=5)

## 🛰️ Loading sample submission dataset

In [None]:
# Read the sample submission file (filled in with predictions later on)
# and preview its first five rows.
sample_submission_path = '../input/tabular-playground-series-may-2021/sample_submission.csv'
sample_submission = pd.read_csv(filepath_or_buffer=sample_submission_path)
sample_submission.head(n=5)

# 🚀 Data Preprocessing

## 🛰️ Checking for null values in train dataset

In [None]:
# Number of missing values per column of the training frame.
train.isna().sum()

## 🛰️ Checking for null values in test dataset

In [None]:
# Number of missing values per column of the test frame.
test.isna().sum()

## 🛰️ Check data information

In [None]:
# Print a summary of the training frame: column dtypes, non-null counts and memory usage.
train.info()

## 🛰️ Check train data shape

In [None]:
# (rows, columns) of the training frame.
train.shape

## 🛰️ Check test data shape

In [None]:
# (rows, columns) of the test frame.
test.shape

## 🛰️ Drop 'id' column from train dataset

In [None]:
# Remove the 'id' column from the training frame.
# Reassigning instead of using inplace=True, together with errors='ignore',
# makes the cell safe to re-run: a second run no longer raises KeyError
# because 'id' has already been removed.
train = train.drop(columns='id', errors='ignore')

## 🛰️ Drop 'id' column from test dataset

In [None]:
# Remove the 'id' column from the test frame.
# Reassigning instead of using inplace=True, together with errors='ignore',
# makes the cell safe to re-run: a second run no longer raises KeyError
# because 'id' has already been removed.
test = test.drop(columns='id', errors='ignore')

## 🛰️ Convert all feature types from int to object for the train dataset

In [None]:
# astype returns a new object instead of converting in place, so the result
# must be assigned back; the previous per-column loop discarded it and left
# every dtype unchanged. Converting the whole frame at once also removes the
# need for the loop.
# NOTE(review): object-dtype features may be slower for the downstream model
# to ingest - confirm the conversion is still wanted.
train = train.astype('object')

## 🛰️ Convert all feature types from int to object for the test dataset

In [None]:
# astype returns a new object instead of converting in place, so the result
# must be assigned back; the previous per-column loop discarded it and left
# every dtype unchanged. Converting the whole frame at once also removes the
# need for the loop.
# NOTE(review): object-dtype features may be slower for the downstream model
# to ingest - confirm the conversion is still wanted.
test = test.astype('object')

# 🚀 Exploratory Data Analysis (EDA)

## 🛰️ Verify the values under each feature

In [None]:
# Print the frequency table of every column in the training frame.
for _column_name, column_values in train.items():
    print(column_values.value_counts())

## 🛰️ Check the data distribution

In [None]:
# Draw one count plot per column of the training frame.
for column_name in train.columns:
    fig, ax = plt.subplots()
    # Pass the Series explicitly as `x`: newer seaborn releases interpret the
    # first positional argument as `data`, not as the variable to count.
    sns.countplot(x=train[column_name], ax=ax)
    # Title each figure so it can be identified when skimming the output.
    ax.set_title(str(column_name))
    plt.show()
    # Release the figure so that looping over many columns does not pile up memory.
    plt.close(fig)

## 🛰️ Separating dependent and independent variables

In [None]:
# The first 50 columns are the independent variables; everything from
# column 50 onwards is treated as the dependent variable.
n_feature_columns = 50
X = train.iloc[:, :n_feature_columns]

# Flatten the remaining column(s) into a 1-D NumPy array.
y = train.iloc[:, n_feature_columns:].to_numpy().ravel()

## 🛰️ Label Encoding target variable

In [None]:
# Fit the encoder on the target values, then replace them with integer codes.
# The fitted encoder stays available in `label`.
label = LabelEncoder().fit(y)
y = label.transform(y)

## 🛰️ Splitting the data into training and validation

In [None]:
# Hold out 20% of the rows for validation, with a fixed seed for reproducibility.
# stratify=y keeps the class proportions the same in both parts, in line with
# the stratified folds used for cross-validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Five stratified folds, taken in the existing row order (no shuffling).
skfold = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

# 🚀 Building Model Pipeline

In [None]:
# CatBoost classifier configured for four target classes.
cat_model = CatBoostClassifier(classes_count=4)

# The classifier is the only active step; the scaling step stays disabled.
pipe = Pipeline(steps=[
    #('scale',StandardScaler()),
    ('cat', cat_model),
])

In [None]:
# Score the pipeline on each of the stratified folds of the training split,
# then report the mean of the fold scores.
cross_score = cross_val_score(estimator=pipe, X=X_train, y=y_train, cv=skfold)
print(cross_score.mean())

# 🚀 Hyperparameter Tuning

In [None]:
# parameters = {   'cat__depth'         : [4,5,6,7,8,9, 10],
#                  'cat__learning_rate' : [0.01,0.02,0.03,0.04],
#                  'cat__iterations'    : [10, 20,30,40,50,60,70,80,90,100]
#                  }

In [None]:
# cat_search = GridSearchCV(pipe, parameters)

In [None]:
# cat_search.fit(X_train,y_train)

## 🛰️ Fitting CatBoost and predicting class probabilities on the validation set

In [None]:
# Fit the pipeline on the training part of the split.
pipe.fit(X=X_train, y=y_train)

In [None]:
# Predicted class labels for the held-out validation rows.
predictions = pipe.predict(X=X_test)
print(predictions)

In [None]:
# Per-class probabilities for the held-out validation rows.
probability_predictions = pipe.predict_proba(X=X_test)
probability_predictions

## 🛰️ Predicting probabilities on test dataset using Catboost algorithm

In [None]:
# Per-class probabilities for the competition test frame.
# This rebinds `probability_predictions`, replacing the validation-set values.
probability_predictions = pipe.predict_proba(X=test)
probability_predictions

In [None]:
# Overwrite columns 1 to 4 of the submission frame with the predicted
# probabilities; column 0 is left untouched.
probability_columns = slice(1, 5)
sample_submission.iloc[:, probability_columns] = probability_predictions
sample_submission

In [None]:
# Write the filled-in submission to the working directory, without the index column.
sample_submission.to_csv(path_or_buf='submission_cat.csv', index=False)