In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import probplot

import warnings

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Read the three competition CSV files in one pass: training rows, test rows,
# and the sample submission used later as the submission template.
train,test,submit=(
    pd.read_csv(f'/kaggle/input/../input/tabular-playground-series-mar-2021/{stem}.csv')
    for stem in ('train','test','sample_submission')
)

In [None]:
def datainfo(df):
    """Summarise every column of ``df`` in a small overview table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the fields ``name``, ``nunique``
        (distinct non-null values), ``dtype``, ``missing`` (count of NaN
        entries) and ``value:5`` (the first five distinct values).
    """
    records=[]
    for name in df.columns:
        series=df[name]
        records.append((
            name,
            series.nunique(),
            series.dtype,
            series.isna().sum(),
            series.unique()[:5],
        ))
    return pd.DataFrame(records,columns=['name','nunique','dtype','missing','value:5'])
datainfo(train)

In [None]:
# Split the feature names by substring: names containing 'cat' versus
# names containing 'cont'.
cat_cols=list(filter(lambda name:'cat' in name,train.columns))
num_cols=list(filter(lambda name:'cont' in name,train.columns))

In [None]:
# Bar chart of value frequencies for every categorical feature.
# The grid height is derived from the number of features instead of being
# fixed at 4 rows, so the cell neither overflows the grid nor depends on an
# exact feature count. Each row keeps the original 4-inch height.
n_grid_cols=5
n_grid_rows=max(1,-(-len(cat_cols)//n_grid_cols))  # ceiling division
fig,ax=plt.subplots(n_grid_rows,n_grid_cols,figsize=(20,4*n_grid_rows),dpi=200)
panels=ax.ravel()  # row-major, same order as ax[idx//5][idx%5]

for idx,col in enumerate(cat_cols):
    temp=train[col].value_counts()

    panels[idx].bar(x=temp.index,height=temp)
    panels[idx].set_title(f'{col} distribution')

# Panels beyond the last feature would otherwise render as empty frames.
for unused in panels[len(cat_cols):]:
    unused.set_visible(False)
plt.show()

In [None]:
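# Parametric models rest on distributional assumptions, e.g. linear models such as linear regression.
# Non-parametric models make no such assumptions, e.g. random forest.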

In [None]:
# Two consecutive panels per continuous feature, filled in row-major order:
# first a kernel density estimate, then a scipy probplot of the same values.
fig,ax=plt.subplots(6,4,figsize=(16,24),dpi=200)
panels=ax.ravel()

for pos,col in enumerate(num_cols):
    values=train[col]
    kde_ax=panels[2*pos]
    qq_ax=panels[2*pos+1]

    sns.kdeplot(values,ax=kde_ax)
    kde_ax.set_title(f'{col} distribution')

    probplot(values,plot=qq_ax)
    qq_ax.set_title(f'{col} qqplot')

plt.tight_layout()
plt.show()

In [None]:
# Number of training rows for each value of the target column.
sns.catplot(x='target',data=train,kind='count');

# feature engineering

In [None]:
from sklearn.mixture import GaussianMixture

In [None]:
def gmm_class(feature,n):
    """Add a Gaussian-mixture cluster label for one column to ``train`` and ``test``.

    A ``GaussianMixture`` with ``n`` components (``random_state=71``) is fitted
    on the training values of ``feature`` only; the fitted model then labels
    both the training and the test values.

    Parameters
    ----------
    feature : str
        Column name present in the module-level ``train`` and ``test`` frames.
    n : int
        Number of mixture components.

    Returns
    -------
    None
        The result is written in place as the column ``f'{feature}_class'``
        on both frames.
    """
    label_col=f'{feature}_class'
    train_values=train[feature].values.reshape(-1,1)  # single feature as a column vector

    mixture=GaussianMixture(n_components=n,random_state=71)
    mixture.fit(train_values)

    train[label_col]=mixture.predict(train_values)
    test[label_col]=mixture.predict(test[feature].values.reshape(-1,1))
    

# Number of mixture components used for each continuous feature.
for feature,n_components in (
    ('cont0',4),
    ('cont1',7),
    ('cont2',6),
    ('cont3',3),
    ('cont4',6),
    ('cont5',10),
    ('cont6',6),
    ('cont7',4),
    ('cont8',4),
    ('cont9',6),
    ('cont10',5),
):
    gmm_class(feature,n_components)

# model - benchmark

In [None]:
import category_encoders as ce 

for col in cat_cols:
    # Author's note: a fresh encoder is created for every column because a
    # reused instance seemed to keep accumulating state.
    encoder=ce.OrdinalEncoder()
    encoder.fit(train[col])

    # Mapping is learned from train only, then applied to train and test in that order.
    for frame in (train,test):
        frame[col]=encoder.transform(frame[col])

In [None]:
from sklearn.model_selection import train_test_split

# class_cols=[col for col in train.columns if 'class' in col]

# Separate the label from the features; 'id' is dropped from the feature matrix.
X,y=train.drop(columns=['id','target']),train['target']

# Hold out 20% of the rows for validation with a fixed seed.
X_tr,X_val,y_tr,y_val=train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=71,
)

In [None]:
import xgboost as xgb

# Baseline classifier: library defaults apart from the fixed seed.
base_params=dict(use_label_encoder=False,random_state=71)
model=xgb.XGBClassifier(**base_params)

# Log loss is evaluated on both the training split and the hold-out split.
# NOTE(review): newer XGBoost releases may expect `eval_metric` on the
# constructor rather than on fit() - confirm against the installed version.
eval_sets=[(X_tr,y_tr),(X_val,y_val)]
model.fit(
    X_tr,
    y_tr,
    eval_set=eval_sets,
    eval_metric='logloss',
    verbose=True,
)

In [None]:
from sklearn.metrics import log_loss

# log_loss takes (y_true, y_pred) and expects y_pred to be probabilities.
# The earlier call passed hard class labels and had the two arguments swapped,
# so the printed number was not the model's log loss.
pred=model.predict_proba(X_val)[:,1]  # probability of class 1
print(f'score : {log_loss(y_val,pred)}')

In [None]:
# Build the feature matrix under a new name instead of overwriting `test`:
# reassigning `test` made this cell fail with KeyError('id') when re-run.
test_features=test.drop('id',axis=1)

# Submit the class-1 probability instead of hard 0/1 labels.
# NOTE(review): the competition is understood to be scored on ROC AUC, which
# rewards calibrated ranking - confirm against the competition's evaluation page.
pred=model.predict_proba(test_features)[:,1]

submit['target']=pred
submit.head()

In [None]:
# submit.to_csv('all_gmmclass_submission.csv',index=False)