In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, Normalizer
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score,classification_report
import seaborn as sns
import matplotlib.pyplot as plt


#### Importing my dataset

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [3]:
train.head(10)

Unnamed: 0,customer_id,age,job,marital,education,default,housing,loan,contact,month,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,subscribed
0,customer_id_39075,31,admin.,married,university.degree,no,no,no,cellular,dec,...,3,999,1,failure,-2.97,46.3565,-23.1,1.711,5023.5,0
1,customer_id_34855,31,technician,single,university.degree,no,no,no,telephone,may,...,4,999,0,nonexistent,-1.77,46.4465,-32.34,2.252,5099.1,0
2,customer_id_7107,47,blue-collar,married,basic.6y,unknown,yes,no,telephone,may,...,2,999,0,nonexistent,1.13,46.997,-25.48,5.862,5191.0,0
3,customer_id_31614,36,services,married,university.degree,no,no,no,cellular,may,...,1,999,1,failure,-1.77,46.4465,-32.34,2.329,5099.1,0
4,customer_id_34878,34,admin.,single,high.school,no,no,no,cellular,may,...,9,999,0,nonexistent,-1.77,46.4465,-32.34,2.252,5099.1,0
5,customer_id_24606,48,entrepreneur,married,university.degree,no,yes,yes,cellular,nov,...,2,999,1,failure,-0.07,46.6,-29.4,5.193,5195.8,0
6,customer_id_13094,34,blue-collar,single,basic.4y,unknown,no,no,cellular,jul,...,2,999,0,nonexistent,1.43,46.959,-29.89,5.964,5228.1,0
7,customer_id_36912,34,technician,single,professional.course,no,no,no,cellular,jun,...,1,999,0,nonexistent,-2.87,46.4815,-28.56,2.217,5076.2,0
8,customer_id_27834,39,unemployed,single,university.degree,no,yes,no,cellular,mar,...,1,999,0,nonexistent,-1.77,46.4215,-35.0,2.642,5099.1,0
9,customer_id_9302,40,technician,single,professional.course,no,unknown,unknown,telephone,jun,...,3,999,0,nonexistent,1.43,47.2325,-29.26,5.969,5228.1,0


#### Counting values of train and test

In [4]:
train.subscribed.value_counts()

0    25580
1     3251
Name: subscribed, dtype: int64

In [5]:
train.shape

(28831, 22)

In [6]:
test.shape

(12357, 21)

#### getting number columns and categorical columns

In [7]:
no_cols = train.describe().columns

In [8]:
len(no_cols)

11

In [9]:
all_columns  =  test.columns
cat_col = [col for col in all_columns if col not in no_cols]

In [10]:
del cat_col[0]

#### Converting categorical columns

In [11]:
for cat in cat_col:
    lb = LabelEncoder().fit(train[cat])
    train[cat] = lb.transform(train[cat])
    test[cat] = lb.transform(test[cat])

#### Dropping unuseful data columns

In [12]:
X = train.drop('customer_id', axis = 1)
#X = X.drop('day_of_week', axis = 1)
test = test.drop('customer_id', axis = 1)
#test = test.drop('day_of_week', axis = 1)

In [13]:
X.tail(10)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,subscribed
28821,33,0,2,6,0,0,2,0,7,4,...,2,3,6,2,-1.07,47.3835,-35.56,2.05,4963.6,0
28822,35,1,1,1,0,2,0,0,3,3,...,1,999,0,1,1.43,46.959,-29.89,5.963,5228.1,0
28823,45,4,1,6,0,0,0,0,1,4,...,2,999,0,1,1.43,46.722,-25.27,5.966,5228.1,0
28824,60,10,0,5,0,0,2,0,1,2,...,1,999,0,1,-2.87,46.1005,-21.98,1.885,5076.2,0
28825,38,4,1,7,1,2,0,0,3,2,...,1,999,0,1,1.43,46.959,-29.89,5.964,5228.1,0
28826,60,5,1,5,1,0,0,1,6,3,...,2,999,0,1,1.13,46.997,-25.48,5.859,5191.0,0
28827,39,4,1,6,0,0,0,1,4,2,...,1,999,0,1,1.43,47.2325,-29.26,5.963,5228.1,0
28828,37,0,1,3,0,2,0,0,8,2,...,1,4,1,2,-3.37,46.2155,-18.83,1.756,5017.5,1
28829,42,4,1,6,0,2,0,1,6,4,...,2,999,0,1,1.13,46.997,-25.48,5.858,5191.0,0
28830,31,0,2,6,0,2,0,0,3,1,...,2,999,0,1,1.43,46.959,-29.89,5.962,5228.1,0


#### Scaling train dataset

In [14]:
stad = MinMaxScaler()
stad.fit(X)
new_train = stad.transform(X)

#### Dropping target column

In [15]:
y_target = X['subscribed']

In [16]:
new_X = X.drop('subscribed', axis = 1)

#### Cross validation

In [18]:
crs = KFold(n_splits = 3)

In [19]:
crs.get_n_splits(X = new_X, y= y_target)

3

In [20]:
crs.split(new_X, y_target)

<generator object _BaseKFold.split at 0x0000023CCFCB9D60>

In [21]:
new_X.shape[0]

28831

In [22]:
arr = np.zeros(new_X.shape[0])

In [23]:
arr_main = np.zeros(test.shape[0])

In [24]:
test.shape

(12357, 20)

In [25]:
p_arr = np.zeros((3, 12357))

In [None]:
from scipy import stats
for i, (train_index, test_index) in enumerate(crs.split(new_X, y_target)):
    X_train = new_X.loc[train_index]
    new_target = y_target.loc[train_index]
    X_test = new_X.loc[test_index]
    Y_test = y_target.loc[test_index]
    est = GradientBoostingClassifier(max_depth =4, random_state=42, learning_rate = 0.1)
    estim = BaggingClassifier(base_estimator= est, random_state=42)
    estim.fit(X_train, new_target)
    pred = estim.predict(X_test)
    arr[test_index] = pred
    print(f1_score(Y_test,pred,average='weighted'))
    pred_main = estim.predict(test)
    p_arr[i] = pred_main
print('Average val score:> ', f1_score(y_target,arr, average='weighted'))   
arr_main = stats.mode(p_arr, axis=0)[0][0]

0.9100830382975175
0.9116504679531913


#### Submission

In [64]:
sample_submission.subscribed = arr_main

In [65]:
sample_submission.subscribed.value_counts()

0.0    11249
1.0     1108
Name: subscribed, dtype: int64

In [67]:
sample_submission.to_csv('bagging_cv.csv',index = False)