In [None]:
import pandas as pd
from flaml import AutoML
from sklearn.metrics import f1_score

In [None]:
# Load the bank-marketing dataset; cells containing the literal string
# 'unknown' are parsed as missing values (NaN).
df = pd.read_csv('bank-additional-full.csv', na_values='unknown')
# Preview the first rows.
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,is_telephone_contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,y
0,56,housemaid,married,basic.4y,no,no,no,yes,may,mon,261,1,999,0,nonexistent,no
1,57,services,married,high.school,,no,no,yes,may,mon,149,1,999,0,nonexistent,no
2,37,services,married,high.school,no,yes,no,yes,may,mon,226,1,999,0,nonexistent,no
3,40,admin.,married,basic.6y,no,no,no,yes,may,mon,151,1,999,0,nonexistent,no
4,56,services,married,high.school,no,no,yes,yes,may,mon,307,1,999,0,nonexistent,no


In [None]:
# Number of missing values in each column.
df.isna().sum()

age                        0
job                      330
marital                   80
education               1731
default                 8597
housing                  990
loan                     990
is_telephone_contact       0
month                      0
day_of_week                0
duration                   0
campaign                   0
pdays                      0
previous                   0
poutcome                   0
y                          0
dtype: int64

In [None]:
# TO-DO: fill `categorical_cols` and fill NA values in the categorical columns of `df`.
# String-valued feature columns of `df`.
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan',
                    'is_telephone_contact', 'month', 'day_of_week', 'poutcome']

# Impute each categorical column with its most frequent value (mode).
# The fill is driven by `categorical_cols` rather than a second hand-written
# list, so the two cannot drift apart; columns without NaNs are left unchanged.
df = df.fillna({col: df[col].mode()[0] for col in categorical_cols})
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,is_telephone_contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,y
0,56,housemaid,married,basic.4y,no,no,no,yes,may,mon,261,1,999,0,nonexistent,no
1,57,services,married,high.school,no,no,no,yes,may,mon,149,1,999,0,nonexistent,no
2,37,services,married,high.school,no,yes,no,yes,may,mon,226,1,999,0,nonexistent,no
3,40,admin.,married,basic.6y,no,no,no,yes,may,mon,151,1,999,0,nonexistent,no
4,56,services,married,high.school,no,no,yes,yes,may,mon,307,1,999,0,nonexistent,no


In [None]:
# Check dtypes and non-null counts after imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   age                   41188 non-null  int64 
 1   job                   41188 non-null  object
 2   marital               41188 non-null  object
 3   education             41188 non-null  object
 4   default               41188 non-null  object
 5   housing               41188 non-null  object
 6   loan                  41188 non-null  object
 7   is_telephone_contact  41188 non-null  object
 8   month                 41188 non-null  object
 9   day_of_week           41188 non-null  object
 10  duration              41188 non-null  int64 
 11  campaign              41188 non-null  int64 
 12  pdays                 41188 non-null  int64 
 13  previous              41188 non-null  int64 
 14  poutcome              41188 non-null  object
 15  y                     41188 non-null

In [None]:
# Feature groups; each group is encoded differently in the cells below.
continuous_cols = ['age', 'duration', 'campaign', 'pdays', 'previous']  # numeric, used as-is
binary_cols = ['default', 'housing', 'loan', 'is_telephone_contact']    # yes/no flags
nominal_cols = ['job', 'marital']                                       # unordered categories
ordinal_cols = ['education', 'month', 'day_of_week', 'poutcome']        # mapped to integer ranks

In [None]:
# Independent copy of the yes/no columns, so encoding them does not touch `df`.
binary_df = df.loc[:, binary_cols].copy()

In [None]:
# Encode the yes/no flags as integers: 'no' -> 0, 'yes' -> 1.
# The source values are read from `df` (not from `binary_df` itself) so the
# cell is idempotent: re-running it does not map already-encoded 0/1 to NaN.
yes_no_codes = {'no': 0, 'yes': 1}
for flag_col in ['default', 'housing', 'loan', 'is_telephone_contact']:
    binary_df[flag_col] = df[flag_col].map(yes_no_codes)


In [None]:
# Cast every binary flag to int, including `is_telephone_contact`, which the
# per-column version skipped. The cast also acts as a sanity check: a value
# that the yes/no mapping did not cover is NaN and cannot be converted to int.
binary_df = binary_df.astype(int)

In [None]:
# Preview the encoded binary features.
binary_df.head()

Unnamed: 0,default,housing,loan,is_telephone_contact
0,0,0,0,1
1,0,0,0,1
2,0,1,0,1
3,0,0,0,1
4,0,0,1,1


In [None]:
# TO-DO: create `nominal_df` according to above explanations
# One-hot encode the nominal (unordered) features: one indicator column per category.
# `dtype=int` pins 0/1 integer indicators; without it, pandas >= 2.0 returns
# boolean columns, which would not match the 0/1 table this notebook shows.
nominal_df = pd.get_dummies(df[nominal_cols], dtype=int)
nominal_df.head()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,marital_divorced,marital_married,marital_single
0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2,0,0,0,0,0,0,0,1,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,1,0,0,0,0,1,0


In [None]:
# Independent copy of the ordered-category columns, so encoding them does not touch `df`.
ordinal_df = df.loc[:, ordinal_cols].copy()

In [None]:
# Ordered categories -> integer ranks.
poutcome_rank = {'failure': -1, 'nonexistent': 0, 'success': 1}
education_rank = {
    'illiterate': 0,
    'basic.4y': 1,
    'basic.6y': 2,
    'basic.9y': 3,
    'high.school': 4,
    'professional.course': 5,
    'university.degree': 6,
}
# Calendar ranks are generated from ordered lists so the order cannot be
# mistyped (the hand-written month map had 'feb' before 'jan').
month_rank = {name: pos for pos, name in enumerate(
    ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'])}
day_rank = {name: pos for pos, name in enumerate(
    ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'])}

# The source values are read from `df` (not from `ordinal_df` itself) so the
# cell is idempotent: re-running it does not map already-encoded ranks to NaN.
ordinal_df['poutcome'] = df['poutcome'].map(poutcome_rank)
ordinal_df['education'] = df['education'].map(education_rank)
ordinal_df['month'] = df['month'].map(month_rank)
ordinal_df['day_of_week'] = df['day_of_week'].map(day_rank)

In [None]:
# TO-DO: create `ordinal_df` according to above explanations
# Preview the encoded ordinal features.
ordinal_df.head()

Unnamed: 0,education,month,day_of_week,poutcome
0,1,4,0,0
1,4,4,0,0
2,4,4,0,0
3,2,4,0,0
4,4,4,0,0


In [None]:
# Assemble the model matrix: continuous features followed by the binary,
# one-hot and ordinal encodings, aligned column-wise on the shared row index.
feature_parts = [df[continuous_cols], binary_df, nominal_df, ordinal_df]
new_df = pd.concat(feature_parts, axis=1)
new_df

Unnamed: 0,age,duration,campaign,pdays,previous,default,housing,loan,is_telephone_contact,job_admin.,...,job_student,job_technician,job_unemployed,marital_divorced,marital_married,marital_single,education,month,day_of_week,poutcome
0,56,261,1,999,0,0,0,0,1,0,...,0,0,0,0,1,0,1,4,0,0
1,57,149,1,999,0,0,0,0,1,0,...,0,0,0,0,1,0,4,4,0,0
2,37,226,1,999,0,0,1,0,1,0,...,0,0,0,0,1,0,4,4,0,0
3,40,151,1,999,0,0,0,0,1,1,...,0,0,0,0,1,0,2,4,0,0
4,56,307,1,999,0,0,0,1,1,0,...,0,0,0,0,1,0,4,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,334,1,999,0,0,1,0,0,0,...,0,0,0,0,1,0,5,10,4,0
41184,46,383,1,999,0,0,0,0,0,0,...,0,0,0,0,1,0,5,10,4,0
41185,56,189,2,999,0,0,1,0,0,0,...,0,0,0,0,1,0,6,10,4,0
41186,44,442,1,999,0,0,0,0,0,0,...,0,1,0,0,1,0,5,10,4,0


In [None]:
# preprocessing

from sklearn.preprocessing import StandardScaler

# Columns standardised to zero mean and unit variance.
scaled_cols = ['age', 'duration', 'campaign', 'pdays', 'education',
               'month', 'day_of_week']

# Fit one scaler over all listed columns in a single call. StandardScaler
# works column-wise, so the scaled values match a per-column loop, but the
# fitted `scaler` now covers every scaled column instead of only the last one.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/validation/test split further down, so test-set statistics leak into
# the features; consider fitting on the training split only.
scaler = StandardScaler()
new_df[scaled_cols] = scaler.fit_transform(new_df[scaled_cols])

In [None]:
target_variable = 'y'

# Hold out the evaluation rows *before* fitting. Fitting on all of `df` and
# then scoring on a sample of the same rows measures memorisation, not
# generalisation.
df_test = df.sample(frac=0.3, random_state=313)
df_train = df.drop(index=df_test.index)

model = AutoML(task='classification', time_budget=120, verbose=0)
model.fit(df_train.drop(target_variable, axis=1), df_train[target_variable])

y_pred = model.predict(df_test.drop(target_variable, axis=1))

# F1 of the positive class ('yes'), reported as a percentage.
f1score = f1_score(df_test[target_variable], y_pred, pos_label='yes')*100
print(f'performance of model is {f1score} %')

INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


performance of model is 58.457711442786064 %


In [None]:
# Target as a 1-D Series. A one-column DataFrame (`df[['y']]`) makes
# scikit-learn estimators warn that a column-vector y was passed to `fit`.
y = df['y']

In [None]:
# Feature matrix for the models below (an alias of `new_df`, not a copy).
x = new_df

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the splits, and therefore the reported scores, are
# reproducible on Restart & Run All.
split_seed = 313

# 20% of the rows are held out as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=split_seed)
# ... then 25% of the remainder (20% of all rows) becomes the validation set,
# leaving 60% of all rows for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, shuffle=True, random_state=split_seed)

In [None]:
# modeling
from sklearn.neighbors import KNeighborsClassifier

# 3 nearest neighbours; p=2 selects Euclidean distance, and weights='distance'
# makes closer neighbours count more in the vote.
knn_params = {'n_neighbors': 3, 'p': 2, 'weights': 'distance'}
model = KNeighborsClassifier(**knn_params).fit(X_train, y_train)

# Predictions for the validation split.
y_pred = model.predict(X_val)

  return self._fit(X, y)
Exception ignored on calling ctypes callback function: <function ThreadpoolController._find_libraries_with_dl_iterate_phdr.<locals>.match_library_callback at 0x7fdc4524e550>
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/threadpoolctl.py", line 584, in match_library_callback
    self._make_controller_from_path(filepath)
  File "/usr/local/lib/python3.8/dist-packages/threadpoolctl.py", line 725, in _make_controller_from_path
    lib_controller = lib_controller_class(
  File "/usr/local/lib/python3.8/dist-packages/threadpoolctl.py", line 842, in __init__
    super().__init__(**kwargs)
  File "/usr/local/lib/python3.8/dist-packages/threadpoolctl.py", line 810, in __init__
    self._dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
  File "/usr/lib/python3.8/ctypes/__init__.py", line 373, in __init__
    self._handle = _dlopen(self._name, mode)
OSError: /usr/local/lib/python3.8/dist-packages/numpy.libs/libopenblasp-r0-2d23e62b.3.1

In [None]:
# Evaluate the model on the validation split (`y_pred` holds the predictions
# for `X_val`). `average='weighted'` averages the per-class F1 scores weighted
# by class support.
# `f1_score` is already imported in the first cell of the notebook.
prediction = f1_score(y_true=y_val, y_pred=y_pred, average='weighted')
prediction

0.8841364846185017

In [None]:
# Evaluate the model on the test data (weighted-average F1, as for validation).
y_pred = model.predict(X_test)
prediction = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
prediction

0.8833311947496569