# Importing required libraries

In [1]:
import pandas as pd

# Loading the dataset

In [2]:
!pip install ucimlrepo



In [3]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
bank_marketing = fetch_ucirepo(id=222) 
  
# data (as pandas dataframes) 
X = bank_marketing.data.features 
y = bank_marketing.data.targets 
  
# metadata 
print(bank_marketing.metadata) 

{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'title': 'A data-driven approach to predict the success of bank telemarketing'

In [4]:
# variable information 
print(bank_marketing.variables) 

           name     role         type      demographic  \
0           age  Feature      Integer              Age   
1           job  Feature  Categorical       Occupation   
2       marital  Feature  Categorical   Marital Status   
3     education  Feature  Categorical  Education Level   
4       default  Feature       Binary             None   
5       balance  Feature      Integer             None   
6       housing  Feature       Binary             None   
7          loan  Feature       Binary             None   
8       contact  Feature  Categorical             None   
9   day_of_week  Feature         Date             None   
10        month  Feature         Date             None   
11     duration  Feature      Integer             None   
12     campaign  Feature      Integer             None   
13        pdays  Feature      Integer             None   
14     previous  Feature      Integer             None   
15     poutcome  Feature  Categorical             None   
16            

In [5]:
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,


In [6]:
X.insert(15,'Subscribed',y)

# Null Handling
2 types of null values:
1. MCAR - missing completely at random(such rows/columns are dropped)
2. Non MCAR - the missing values are not random(techniques like column-mean or column-most_frequent are used to impute the missing. best way is to train an ml model to predict the missing values)

In [7]:
# since contact values are MCAR, the column can be dropped
X = X.drop('contact',axis=1)

In [None]:
# But poutcome is non MCAR hence an XGBClassifier will be used to predict the null values

# X = K(poutcome is not null) + Z(poutcome is null)

A classifier will be trained on the K dataset and it will predict Z poutcome

In [8]:
K = X[X['poutcome'].notnull()]
K['poutcome'].unique()

array(['failure', 'other', 'success'], dtype=object)

In [9]:
req = K.drop(['poutcome','Subscribed'],axis=1)

In [10]:
req.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day_of_week,month,duration,campaign,pdays,previous
24060,33,admin.,married,tertiary,no,882,no,no,21,oct,39,1,151,3
24062,42,admin.,single,secondary,no,-247,yes,yes,21,oct,519,1,166,1
24064,33,services,married,secondary,no,3444,yes,no,21,oct,144,1,91,4
24072,36,management,married,tertiary,no,2415,yes,no,22,oct,73,1,86,4
24077,36,management,married,tertiary,no,0,yes,no,23,oct,140,1,143,3


In [11]:
K.loc[K['poutcome']=='success','poutcome']=2
K.loc[K['poutcome']=='failure','poutcome']=0
K.loc[K['poutcome']=='other','poutcome']=1

In [12]:
K.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day_of_week,month,duration,campaign,pdays,previous,Subscribed,poutcome
24060,33,admin.,married,tertiary,no,882,no,no,21,oct,39,1,151,3,no,0
24062,42,admin.,single,secondary,no,-247,yes,yes,21,oct,519,1,166,1,yes,1
24064,33,services,married,secondary,no,3444,yes,no,21,oct,144,1,91,4,yes,0
24072,36,management,married,tertiary,no,2415,yes,no,22,oct,73,1,86,4,no,1
24077,36,management,married,tertiary,no,0,yes,no,23,oct,140,1,143,3,yes,0


In [13]:
req['job'] = req['job'].astype("category")
req['marital'] = req['marital'].astype("category")
req['education'] = req['education'].astype("category")
req['housing'] = req['housing'].astype("category")
req['loan'] = req['loan'].astype("category")
req['default'] = req['default'].astype("category")
req['month'] = req['month'].astype("category")

In [14]:
import xgboost as xgb
pout_predictor = xgb.XGBClassifier(enable_categorical=True,tree_method = 'hist', objective="multi:softprob")
pout_predictor.fit(req,K['poutcome'])

In [15]:
Z = X[X['poutcome'].isna()]
Z = Z.drop('poutcome',axis = 1)

In [16]:
Z['job'] = Z['job'].astype("category")
Z['marital'] = Z['marital'].astype("category")
Z['education'] = Z['education'].astype("category")
Z['housing'] = Z['housing'].astype("category")
Z['loan'] = Z['loan'].astype("category")
Z['default'] = Z['default'].astype("category")
Z['month'] = Z['month'].astype("category")

In [17]:
Z.shape

(36959, 15)

In [18]:
i = pout_predictor.predict(Z.drop('Subscribed',axis=1))
i

array([2, 2, 0, ..., 2, 2, 0], dtype=int64)

In [19]:
Z.insert(14,'poutcome',i)

In [20]:
Z.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day_of_week,month,duration,campaign,pdays,previous,poutcome,Subscribed
0,58,management,married,tertiary,no,2143,yes,no,5,may,261,1,-1,0,2,no
1,44,technician,single,secondary,no,29,yes,no,5,may,151,1,-1,0,2,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,5,may,76,1,-1,0,0,no
3,47,blue-collar,married,,no,1506,yes,no,5,may,92,1,-1,0,2,no
4,33,,single,,no,1,no,no,5,may,198,1,-1,0,0,no


In [21]:
Z.loc[Z['poutcome']==2,'poutcome']='success'
Z.loc[Z['poutcome']==0,'poutcome']='failure'
Z.loc[Z['poutcome']==1,'poutcome']='other'

In [22]:
Z.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day_of_week,month,duration,campaign,pdays,previous,poutcome,Subscribed
0,58,management,married,tertiary,no,2143,yes,no,5,may,261,1,-1,0,success,no
1,44,technician,single,secondary,no,29,yes,no,5,may,151,1,-1,0,success,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,5,may,76,1,-1,0,failure,no
3,47,blue-collar,married,,no,1506,yes,no,5,may,92,1,-1,0,success,no
4,33,,single,,no,1,no,no,5,may,198,1,-1,0,failure,no


In [23]:
K.loc[K['poutcome']==2,'poutcome']='success'
K.loc[K['poutcome']==0,'poutcome']='failure'
K.loc[K['poutcome']==1,'poutcome']='other'

In [24]:
K.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day_of_week,month,duration,campaign,pdays,previous,Subscribed,poutcome
24060,33,admin.,married,tertiary,no,882,no,no,21,oct,39,1,151,3,no,failure
24062,42,admin.,single,secondary,no,-247,yes,yes,21,oct,519,1,166,1,yes,other
24064,33,services,married,secondary,no,3444,yes,no,21,oct,144,1,91,4,yes,failure
24072,36,management,married,tertiary,no,2415,yes,no,22,oct,73,1,86,4,no,other
24077,36,management,married,tertiary,no,0,yes,no,23,oct,140,1,143,3,yes,failure


In [25]:
X_req = pd.concat([Z,K])
X_req

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day_of_week,month,duration,campaign,pdays,previous,poutcome,Subscribed
0,58,management,married,tertiary,no,2143,yes,no,5,may,261,1,-1,0,success,no
1,44,technician,single,secondary,no,29,yes,no,5,may,151,1,-1,0,success,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,5,may,76,1,-1,0,failure,no
3,47,blue-collar,married,,no,1506,yes,no,5,may,92,1,-1,0,success,no
4,33,,single,,no,1,no,no,5,may,198,1,-1,0,failure,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45199,34,blue-collar,single,secondary,no,1475,yes,no,16,nov,1166,3,530,12,other,no
45201,53,management,married,tertiary,no,583,no,no,17,nov,226,1,184,4,success,yes
45204,73,retired,married,secondary,no,2850,no,no,17,nov,300,1,40,8,failure,yes
45208,72,retired,married,secondary,no,5715,no,no,17,nov,1127,5,184,3,success,yes


In [26]:
X_req.sort_index(axis=1)
X_req

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day_of_week,month,duration,campaign,pdays,previous,poutcome,Subscribed
0,58,management,married,tertiary,no,2143,yes,no,5,may,261,1,-1,0,success,no
1,44,technician,single,secondary,no,29,yes,no,5,may,151,1,-1,0,success,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,5,may,76,1,-1,0,failure,no
3,47,blue-collar,married,,no,1506,yes,no,5,may,92,1,-1,0,success,no
4,33,,single,,no,1,no,no,5,may,198,1,-1,0,failure,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45199,34,blue-collar,single,secondary,no,1475,yes,no,16,nov,1166,3,530,12,other,no
45201,53,management,married,tertiary,no,583,no,no,17,nov,226,1,184,4,success,yes
45204,73,retired,married,secondary,no,2850,no,no,17,nov,300,1,40,8,failure,yes
45208,72,retired,married,secondary,no,5715,no,no,17,nov,1127,5,184,3,success,yes


In [27]:
X_req.isna().sum()

age               0
job             288
marital           0
education      1857
default           0
balance           0
housing           0
loan              0
day_of_week       0
month             0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
Subscribed        0
dtype: int64

In [28]:
from sklearn.impute import SimpleImputer
categorical_features = ['job','education']
categorical_imputer = SimpleImputer(strategy='most_frequent')
X_req[categorical_features] = categorical_imputer.fit_transform(X_req[categorical_features])

In [29]:
X_req.loc[X_req['Subscribed']=='yes','Subscribed']= 1
X_req.loc[X_req['Subscribed']=='no','Subscribed']= 0


In [30]:
X_req['job'] = X_req['job'].astype("category")
X_req['marital'] = X_req['marital'].astype("category")
X_req['education'] = X_req['education'].astype("category")
X_req['housing'] = X_req['housing'].astype("category")
X_req['loan'] = X_req['loan'].astype("category")
X_req['default'] = X_req['default'].astype("category")
X_req['month'] = X_req['month'].astype("category")
X_req['poutcome'] = X_req['poutcome'].astype("category")

In [31]:
from sklearn.model_selection import train_test_split

# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X_req.drop('Subscribed',axis=1), X_req['Subscribed'], train_size=0.8, random_state=42)

In [32]:
import xgboost as xgb
classifier_rf = xgb.XGBClassifier(enable_categorical=True,tree_method = 'hist', objective="binary:logistic")
classifier_rf.fit(X_train, y_train)

In [33]:
prediction = classifier_rf.predict(X_test)
prediction.shape

(9043,)

In [34]:
df = pd.DataFrame({
    'Actual': y_test,
    'Predicted':prediction
}
)

In [35]:
df

Unnamed: 0,Actual,Predicted
3776,0,0
9928,0,0
38047,0,0
35317,0,0
15738,0,0
...,...,...
13353,0,0
30441,0,0
5654,0,0
3779,0,0


In [36]:
df['Actual'] = df['Actual'].astype('int64')
df['Predicted'] = df['Predicted'].astype('int64')

In [37]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
print(f"acuracy of the model is : {accuracy_score(df['Actual'], df['Predicted'])*100:.2f}%")

acuracy of the model is : 90.56%


In [38]:
import pickle

# save the iris classification model as a pickle file
model_pkl_file = "model.pkl"  

with open(model_pkl_file, 'wb') as file:  
    pickle.dump(classifier_rf, file)
