In [3]:
import pycaret
from pycaret.classification import *
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('train_ver_2.csv')
test = pd.read_csv('test_ver_2.csv')

In [4]:
train, val = train_test_split(train, test_size=0.2, random_state=42)

In [5]:
train.shape

(1412, 21)

In [6]:
train['Response'].value_counts()

Response
0    1204
1     208
Name: count, dtype: int64

In [7]:
val['Response'].value_counts()

Response
0    297
1     57
Name: count, dtype: int64

In [12]:
test['Response'].value_counts()

Response
0    376
1     66
Name: count, dtype: int64

In [13]:
train_smote = setup(data = train, target = 'Response', fix_imbalance = True)

Unnamed: 0,Description,Value
0,Session id,2791
1,Target,Response
2,Target type,Binary
3,Original data shape,"(1412, 21)"
4,Transformed data shape,"(2108, 29)"
5,Transformed train set shape,"(1684, 29)"
6,Transformed test set shape,"(424, 29)"
7,Numeric features,18
8,Categorical features,2
9,Preprocess,True


In [14]:
get_config('y_transformed').value_counts()

Response
0    1204
1     904
Name: count, dtype: int64

In [15]:
over_sam_train = pd.concat([get_config('X_transformed'), get_config('y_transformed')], axis=1)
over_sam_train.head()

Unnamed: 0,Education_Graduation,Education_2n Cycle,Education_Master,Education_PhD,Education_Basic,Marital_Status_Married,Marital_Status_Single,Marital_Status_Together,Marital_Status_Widow,Marital_Status_Divorced,...,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Age,Response
1274,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,8.0,24.0,3.0,2.0,1.0,3.0,6.0,0.0,55.0,0
1733,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,8.0,1.0,1.0,0.0,3.0,6.0,0.0,43.0,0
1728,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,144.0,96.0,1.0,3.0,4.0,10.0,1.0,0.0,32.0,1
1511,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,7.0,57.0,1.0,9.0,2.0,9.0,5.0,0.0,51.0,0
1094,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,6.0,29.0,3.0,10.0,4.0,5.0,8.0,0.0,67.0,0


In [16]:
val_smote = setup(data = val, target = 'Response', fix_imbalance = False)

Unnamed: 0,Description,Value
0,Session id,7584
1,Target,Response
2,Target type,Binary
3,Original data shape,"(354, 21)"
4,Transformed data shape,"(354, 29)"
5,Transformed train set shape,"(247, 29)"
6,Transformed test set shape,"(107, 29)"
7,Numeric features,18
8,Categorical features,2
9,Preprocess,True


In [17]:
get_config('y_transformed').value_counts()

Response
0    297
1     57
Name: count, dtype: int64

In [18]:
over_sam_val = pd.concat([get_config('X_transformed'), get_config('y_transformed')], axis=1)
over_sam_val.head()

Unnamed: 0,Education_PhD,Education_Master,Education_Graduation,Education_Basic,Education_2n Cycle,Marital_Status_Together,Marital_Status_Married,Marital_Status_Divorced,Marital_Status_Single,Marital_Status_Widow,...,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Age,Response
123,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,2.0,15.0,4.0,3.0,0.0,4.0,4.0,0.0,65.0,0
76,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,144.0,1.0,8.0,5.0,4.0,7.0,0.0,54.0,1
1313,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,92.0,15.0,1.0,4.0,6.0,13.0,2.0,0.0,52.0,0
1023,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,38.0,172.0,1.0,4.0,10.0,5.0,3.0,0.0,35.0,0
1269,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,123.0,114.0,2.0,4.0,7.0,13.0,2.0,0.0,56.0,0


In [19]:
test_smote = setup(data = test, target = 'Response', fix_imbalance = False)

Unnamed: 0,Description,Value
0,Session id,7072
1,Target,Response
2,Target type,Binary
3,Original data shape,"(442, 21)"
4,Transformed data shape,"(442, 29)"
5,Transformed train set shape,"(309, 29)"
6,Transformed test set shape,"(133, 29)"
7,Numeric features,18
8,Categorical features,2
9,Preprocess,True


In [20]:
get_config('y_transformed').value_counts()

Response
0    376
1     66
Name: count, dtype: int64

In [21]:
over_sam_test = pd.concat([get_config('X_transformed'), get_config('y_transformed')], axis=1)
over_sam_test.head()

Unnamed: 0,Education_PhD,Education_Graduation,Education_Master,Education_2n Cycle,Education_Basic,Marital_Status_Together,Marital_Status_Single,Marital_Status_Married,Marital_Status_Widow,Marital_Status_Divorced,...,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Age,Response
308,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,47.0,0
385,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,49.0,49.0,1.0,6.0,7.0,12.0,3.0,0.0,76.0,0
38,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,20.0,50.0,2.0,3.0,5.0,7.0,7.0,0.0,47.0,1
270,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,102.0,102.0,10.0,4.0,6.0,9.0,7.0,0.0,53.0,0
61,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,82.0,71.0,1.0,9.0,3.0,13.0,4.0,0.0,76.0,0


In [23]:
over_sam_train['Response'].value_counts(),over_sam_val['Response'].value_counts(), over_sam_test['Response'].value_counts()

(Response
 0    1204
 1     904
 Name: count, dtype: int64,
 Response
 0    297
 1     57
 Name: count, dtype: int64,
 Response
 0    376
 1     66
 Name: count, dtype: int64)

In [24]:
over_sam_train.to_csv('train_SMOTE.csv', index=False)
over_sam_val.to_csv('val_SMOTE.csv', index=False)
over_sam_test.to_csv('test_SMOTE.csv', index=False)