In [1]:
import numpy as np 
import pandas as pd 

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
df_train.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [4]:
df_train.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [5]:
import statsmodels.formula.api as smf

In [6]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

In [7]:
df_train_log = df_train.copy()
df_train_log['y_0'] = df_train_log['y'].map({'yes':1, 'no':0})

column = ['job', 'marital', 'education', 'default','housing','loan', 'contact', 'day', 'month','campaign', 'pdays','previous', 'poutcome','y']

df_train_log_c = df_train_log.drop(columns=column)

df_train_log_c.head()

Unnamed: 0,age,balance,duration,y_0
0,58,2143,261,0
1,44,29,151,0
2,33,2,76,0
3,47,1506,92,0
4,33,1,198,0


In [8]:
formula = 'y_0 ~' + '+'.join([col for col in df_train_log_c.columns if col !='y_0'])

logistic_regression = smf.logit(formula, data=df_train_log_c)
logistic_regression_result = logistic_regression.fit()
print(logistic_regression_result.summary())

Optimization terminated successfully.
         Current function value: 0.302937
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                    y_0   No. Observations:                45211
Model:                          Logit   Df Residuals:                    45207
Method:                           MLE   Df Model:                            3
Date:                Sat, 16 Dec 2023   Pseudo R-squ.:                  0.1605
Time:                        21:46:33   Log-Likelihood:                -13696.
converged:                       True   LL-Null:                       -16315.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -3.5539      0.066    -53.471      0.000      -3.684      -3.424
age            0.0073      0.

In [9]:
df_test_log = df_test.copy()
df_test_log['pred_y_0'] = logistic_regression_result.predict(df_test_log)

In [10]:
y_pred = df_test_log['pred_y_0'] > 0.5
y = df_test_log['y'].map({'yes':True, 'no':False})

In [11]:
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y,y_pred)
f1_score = (2 * precision * recall) / (precision + recall)
auc = roc_auc_score(y, y_pred)

print(f'Accuracy: {accuracy:.3f} Precision: {precision:.3f} Recall: {recall:.3f} f1_score: {f1_score:.3f} AUC: {auc:.3f}')

Accuracy: 0.889 Precision: 0.561 Recall: 0.167 f1_score: 0.257 AUC: 0.575


In [12]:
df_train_rf = df_train.copy()
df_train_rf['y_0'] = df_train_rf['y'].map({'yes':1, 'no':0})
X = df_train_rf.drop(['y', 'y_0'], axis=1)
X = pd.get_dummies(X)
y = df_train_rf['y_0']


In [22]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
from sklearn.ensemble import RandomForestClassifier
drf = RandomForestClassifier(n_estimators = 100)
drf.fit(X_train, y_train)

In [24]:
y_pred = drf.predict(X_val)

In [25]:
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1_score = (2 * precision * recall) / (precision + recall)
auc = roc_auc_score(y_val, y_pred)

print(f'Accuracy: {accuracy:.3f} Precision: {precision:.3f} Recall: {recall:.3f} f1_score: {f1_score:.3f} AUC: {auc:.3f}')

Accuracy: 0.905 Precision: 0.674 Recall: 0.419 f1_score: 0.517 AUC: 0.696


In [26]:
df_test_rf = df_test.copy()
df_test_rf['y_0'] = df_test_rf['y'].map({'yes':1, 'no':0})
X_test = df_test_rf.drop(['y', 'y_0'], axis=1)
X_test = pd.get_dummies(X_test)
y_test = df_test_rf['y'].map({'yes':1, 'no':0})

In [33]:
y_pred = drf.predict(X_test)
print(y_pred.sum(), len(y_pred))

492 4521


In [28]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1_score = (2 * precision * recall) / (precision + recall)
auc = roc_auc_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.3f} Precision: {precision:.3f} Recall: {recall:.3f} f1_score: {f1_score:.3f} AUC: {auc:.3f}')


Accuracy: 0.982 Precision: 0.947 Recall: 0.894 f1_score: 0.920 AUC: 0.944
