In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory,
# in the order os.walk visits them.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<div class="text_cell_render border-box-sizing rendered_html">
<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:skyblue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">
<h1 style="text-align: center;
           padding: 10px;
              color:white">
Tabular Playground Series - May 2022 : Baseline with AutoML💥
</h1>
</div>

<img src="https://storage.googleapis.com/kaggle-competitions/kaggle/25226/logos/header.png?t=2021-01-27-17-34-31" alt="">

<div class="text_cell_render border-box-sizing rendered_html">
<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:skyblue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">
<h1 style="text-align: center;
           padding: 10px;
              color:white">
Data Preprocessing
</h1>
</div>

In [None]:
# Read the competition's train and test tables.
pd_train, pd_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-may-2022/train.csv',
        '../input/tabular-playground-series-may-2022/test.csv',
    )
)

# Positional slicing: the first column is skipped in both tables (presumably
# a row id -- confirm), and the last column of the training table is the target.
x_test = pd_test.iloc[:, 1:]
y_train = pd_train.iloc[:, -1]
x_train = pd_train.iloc[:, 1:-1]

In [None]:
# Print a summary of the raw training frame (columns, dtypes, non-null counts).
pd_train.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence library warnings for the rest of the session (kept from the
# original cell; note that this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Pairwise correlation of the numeric/boolean features only. The string
# column `f_27` cannot be correlated: older pandas skipped it silently,
# pandas >= 2.0 raises on it, so it is filtered out explicitly.
corr = x_train.select_dtypes(include=['number', 'bool']).corr()

# Hide the upper triangle (diagonal included) so each pair is drawn once.
# The builtin `bool` replaces the `np.bool` alias, which was removed in
# NumPy 1.24 and raises AttributeError there.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corr, square=True, center=0,
            linewidths=0.2, cmap='coolwarm',
            mask=mask, ax=ax)

ax.set_title(' Correlation Matrix ', loc='left')
plt.show()

<div class="text_cell_render border-box-sizing rendered_html">
<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:skyblue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">
<h1 style="text-align: center;
           padding: 10px;
              color:white">
Feature Engineering
</h1>
</div>

Feature engineering focuses mainly on `f_27`, a string column that is split into one new column per character.

In [None]:
# Preview: split every `f_27` string on the empty pattern, expanding the
# pieces into one result column each.
x_train['f_27'].str.split('',expand=True)

In [None]:
def split(data, column="f_27", n_chars=10):
    """Replace a string column with one column per character position.

    Characters are taken by position (``.str[i]``) instead of splitting on an
    empty pattern, so the result does not depend on how a given pandas
    version or string backend interprets ``str.split('')``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``. It is not modified.
    column : str, default "f_27"
        Name of the string column to expand.
    n_chars : int, default 10
        Number of leading character positions to extract. Rows whose string
        is shorter than a position get a missing value in that column.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``column`` removed and the columns
        ``{column}_0`` ... ``{column}_{n_chars - 1}`` appended on the right,
        sharing the index of ``data``.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``data``.
    """
    strings = data[column]
    # One new column per character position, aligned on the original index.
    df_split = pd.DataFrame(
        {f'{column}_{i}': strings.str[i] for i in range(n_chars)},
        index=data.index,
    )
    # Drop the raw string column first so the new columns end up last.
    return pd.concat([data.drop(columns=[column]), df_split], axis=1)

In [None]:
# Expand `f_27` into per-character columns for both feature tables.
new_x_train, new_x_test = (split(features) for features in (x_train, x_test))

In [None]:
# Lift the column display limit so every engineered column is visible,
# then show the expanded training features.
pd.options.display.max_columns = None
new_x_train

In [None]:
# For every object-typed column, list the distinct values seen in train and
# in test and report whether the two sets coincide.
for col in new_x_train:
    if new_x_train[col].dtype != 'object':
        continue
    train_values = sorted(new_x_train[col].unique())
    test_values = sorted(new_x_test[col].unique())
    print("train data {} : ".format(col), train_values)
    print("test data {} : ".format(col), test_values)
    print("Same unique values? : ", train_values == test_values, '\n')

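Transform each single-character string into an integer derived from its ASCII code: `ord(x) - 64`, so that `'A'` becomes 1, `'B'` becomes 2, and so on.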

In [None]:
# Columns to encode are decided from the training frame's dtypes; the same
# columns are then encoded in both frames as `ord(char) - 64`.
object_columns = [name for name in new_x_train if new_x_train[name].dtype == 'object']
for name in object_columns:
    for frame in (new_x_train, new_x_test):
        frame[name] = [ord(char) - 64 for char in frame[name]]

<div class="text_cell_render border-box-sizing rendered_html">
<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:skyblue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">
<h1 style="text-align: center;
           padding: 10px;
              color:white">
Visualizing with 3D t-SNE
</h1>
</div>

In [None]:
from sklearn.manifold import TSNE
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objects as go

# Only the first rows are embedded and plotted.
N_TSNE_SAMPLES = 10000
tsne_features = new_x_train[:N_TSNE_SAMPLES]

# The iteration count is left at the library default of 1000, which is the
# value the cell used to pass explicitly. Its keyword was renamed from
# `n_iter` to `max_iter` in scikit-learn 1.5 and the old name later removed,
# so spelling it out fails on one side of that change or the other.
tsne = TSNE(random_state=1004, n_components=3, verbose=0,
            perplexity=40).fit_transform(tsne_features)

# String labels make plotly colour the points by discrete class.
y_train_label = y_train[:N_TSNE_SAMPLES].astype(str)
fig = px.scatter_3d(tsne_features, x=tsne[:, 0],
                    y=tsne[:, 1],
                    z=tsne[:, 2],
                    color=y_train_label)
fig.update_layout(
    margin=dict(l=20, r=20, t=20, b=20)
)
fig.update_traces(marker_size=2)
iplot(fig)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the same transformation to both feature sets.
scaler = StandardScaler().fit(new_x_train)
new_x_train, new_x_test = scaler.transform(new_x_train), scaler.transform(new_x_test)

In [None]:
# Wrap the scaled training features in a DataFrame purely for display.
pd.DataFrame(new_x_train)

<div class="text_cell_render border-box-sizing rendered_html">
<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:skyblue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">
<h1 style="text-align: center;
           padding: 10px;
              color:white">
AutoML
</h1>
</div>

In [None]:
!pip install -q -U git+https://github.com/mljar/mljar-supervised.git@master

In [None]:
from supervised import AutoML

# Single hold-out split: 80% of the rows for training, shuffled and stratified.
holdout_validation = {
    "validation_type": "split",
    "train_ratio": 0.8,
    "shuffle": True,
    "stratify": True,
}

# All search settings gathered in one place before building the AutoML object.
automl_settings = dict(
    total_time_limit=180 * 60,
    mode="Compete",
    eval_metric="auc",
    algorithms=['Random Forest', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network'],
    ml_task='binary_classification',
    train_ensemble=True,
    n_jobs=-1,
    validation_strategy=holdout_validation,
    random_state=1004,
)
automl = AutoML(**automl_settings)

automl.fit(new_x_train, y_train)

In [None]:
# Display the report produced by the fitted AutoML object.
automl.report()

<div class="text_cell_render border-box-sizing rendered_html">
<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:skyblue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">
<h1 style="text-align: center;
           padding: 10px;
              color:white">
Predict for Test Dataset
</h1>
</div>

In [None]:
# Predict on the scaled test features and preview the first rows of the result.
pred = automl.predict_all(new_x_test)
pred.head()

In [None]:
submit = pd.read_csv('../input/tabular-playground-series-may-2022/sample_submission.csv')

# Exactly one prediction per submission row is required; fail loudly otherwise.
assert len(pred) == len(submit), "prediction / submission row count mismatch"

# The model is tuned for AUC, a ranking metric, so the positive-class
# probability is submitted rather than the thresholded `label` column, which
# would collapse the ranking to two values.
# `.to_numpy()` assigns by position rather than by index alignment.
submit['target'] = pred['prediction_1'].to_numpy()
submit.to_csv('submission.csv', index=False, header=True)