In [2]:
from pycaret.classification import *
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings(action='ignore')

In [4]:
train_err  = pd.read_csv('train_err_data.csv')
id_error = train_err[['user_id','errtype']].values
error = np.zeros((15000,42))
for person_idx, err in tqdm(id_error):
    # person_idx - 10000 위치에 person_idx, errtype에 해당하는 error값을 +1
    error[person_idx - 10000,err - 1] += 1

train_prob = pd.read_csv('train_problem_data.csv')
problem = np.zeros(15000)
problem[train_prob.user_id.unique()-10000] = 1 

train = pd.DataFrame(data=error)
train['problem'] = problem
del error, problem

clf = setup(data = train, target = 'problem') 

Unnamed: 0,Description,Value
0,session_id,7533
1,Target,problem
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(15000, 43)"
5,Missing Values,False
6,Numeric Features,42
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [5]:
best_5 = compare_models(sort = '', n_select = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.7761,0.8002,0.4641,0.7743,0.58,0.4401,0.4671,1.428
lightgbm,Light Gradient Boosting Machine,0.7761,0.7993,0.4947,0.7481,0.5953,0.4493,0.4678,0.353
et,Extra Trees Classifier,0.775,0.7957,0.4764,0.7589,0.5852,0.4417,0.4643,0.875
rf,Random Forest Classifier,0.7705,0.7976,0.4638,0.7538,0.5738,0.4288,0.4527,0.932
ada,Ada Boost Classifier,0.7692,0.7874,0.4578,0.7538,0.5693,0.4242,0.449,0.377
xgboost,Extreme Gradient Boosting,0.7642,0.784,0.5142,0.6995,0.5922,0.4317,0.4422,1.293
lr,Logistic Regression,0.7483,0.7362,0.3526,0.7658,0.4824,0.3449,0.3904,2.156
qda,Quadratic Discriminant Analysis,0.748,0.7356,0.3735,0.7423,0.4968,0.3523,0.3894,0.043
lda,Linear Discriminant Analysis,0.7456,0.7388,0.3183,0.7963,0.4544,0.3262,0.3848,0.062
ridge,Ridge Classifier,0.7449,0.0,0.3108,0.8036,0.448,0.3219,0.3835,0.032


In [6]:
blended = blend_models(estimator_list = best_5, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.78,0.8093,0.4649,0.7869,0.5845,0.4481,0.4768
1,0.7743,0.8054,0.4921,0.743,0.5921,0.4448,0.4629
2,0.7867,0.8179,0.4757,0.8043,0.5978,0.4654,0.4951
3,0.7871,0.8139,0.4986,0.7843,0.6096,0.4731,0.496
4,0.7694,0.787,0.4635,0.7483,0.5724,0.4263,0.4492
Mean,0.7795,0.8067,0.479,0.7734,0.5913,0.4515,0.476
SD,0.0069,0.0107,0.0142,0.0237,0.0125,0.0164,0.0182


In [7]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7856,0.8091,0.4764,0.8009,0.5974,0.4639,0.4929


In [8]:
final_model = finalize_model(blended)

In [10]:
test_err  = pd.read_csv('test_err_data.csv')
id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((14999,42))
for person_idx, err in tqdm(id_error):
    test_x[person_idx - 30000,err - 1] += 1
test_x = test_x.reshape(test_x.shape[0],-1)
test = pd.DataFrame(data=test_x)

100%|███████████████████████████████████████████████████████████████████| 16532648/16532648 [03:04<00:00, 89616.20it/s]


In [12]:
predictions = predict_model(final_model, data = test)

In [13]:
x = []
for i in range(len(predictions['Score'])):
  if predictions['Label'][i] =='1.0':
    x.append(predictions['Score'][i])
  else:
    x.append(1-predictions['Score'][i])

In [15]:
sample_submssion = pd.read_csv('sample_submission.csv')
sample_submssion['problem'] = x
sample_submssion.to_csv("AutoML.csv", index = False)