In [30]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib

In [2]:
# Read train.csv (path is relative to the working directory) into a DataFrame.
df = pd.read_csv("train.csv")

In [3]:
# Peek at the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,subscribed
0,26110,56,admin.,married,unknown,no,1933,no,no,telephone,19,nov,44,2,-1,0,unknown,no
1,40576,31,unknown,married,secondary,no,3,no,no,cellular,20,jul,91,2,-1,0,unknown,no
2,15320,27,services,married,secondary,no,891,yes,no,cellular,18,jul,240,1,-1,0,unknown,no
3,43962,57,management,divorced,tertiary,no,3287,no,no,cellular,22,jun,867,1,84,3,success,yes
4,29842,31,technician,married,secondary,no,119,yes,no,cellular,4,feb,380,1,-1,0,unknown,no


In [4]:
def encode(df,column,d):
    """Replace a column's values with integer category codes, in place.

    The column is converted to a pandas categorical once; the resulting
    ``code -> label`` mapping is stored in ``d[column]`` so the encoding can
    be reversed later, and the column in ``df`` is overwritten with the codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place.
    column : str
        Name of the column to encode.
    d : dict
        Registry of mappings; ``d[column]`` is set (or overwritten) with a
        ``{code: label}`` dict for this column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, to allow
        ``df = encode(df, ...)``.

    Notes
    -----
    pandas assigns code ``-1`` to missing values; that code has no entry in
    the stored mapping.
    """
    categorical = df[column].astype('category')
    # Keep the code -> label lookup so the integers stay interpretable.
    d[column] = dict(enumerate(categorical.cat.categories))
    # Reuse the categorical built above instead of converting the column twice.
    df[column] = categorical.cat.codes
    return df

In [5]:
# Show the available column names.
df.columns

Index(['ID', 'age', 'job', 'marital', 'education', 'default', 'balance',
       'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign',
       'pdays', 'previous', 'poutcome', 'subscribed'],
      dtype='object')

In [6]:
# Text-valued columns that get replaced by integer category codes.
cate = [
    'job',
    'marital',
    'education',
    'contact',
    'month',
    'default',
    'housing',
    'loan',
    'subscribed',
]

In [7]:
# `d` collects one {code: label} mapping per encoded column.
d = {}
# encode() overwrites each listed column with integer codes and returns the
# same frame, so `df` is mutated in place here.
# NOTE(review): re-running this cell on an already-encoded frame would rebuild
# `d` from the integer codes and lose the original labels - reload the data first.
for c in cate:
    df = encode(df,c,d)
# 'poutcome' is not in `cate`, so it is still text in the preview below.
df.head()

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,subscribed
0,26110,56,0,1,3,0,1933,0,0,1,19,9,44,2,-1,0,unknown,0
1,40576,31,11,1,1,0,3,0,0,0,20,5,91,2,-1,0,unknown,0
2,15320,27,7,1,1,0,891,1,0,0,18,5,240,1,-1,0,unknown,0
3,43962,57,4,0,2,0,3287,0,0,0,22,6,867,1,84,3,success,1
4,29842,31,9,1,1,0,119,1,0,0,4,3,380,1,-1,0,unknown,0


In [8]:
# Show the code -> label mapping collected for each encoded column.
print(d)

{'job': {0: 'admin.', 1: 'blue-collar', 2: 'entrepreneur', 3: 'housemaid', 4: 'management', 5: 'retired', 6: 'self-employed', 7: 'services', 8: 'student', 9: 'technician', 10: 'unemployed', 11: 'unknown'}, 'marital': {0: 'divorced', 1: 'married', 2: 'single'}, 'education': {0: 'primary', 1: 'secondary', 2: 'tertiary', 3: 'unknown'}, 'contact': {0: 'cellular', 1: 'telephone', 2: 'unknown'}, 'month': {0: 'apr', 1: 'aug', 2: 'dec', 3: 'feb', 4: 'jan', 5: 'jul', 6: 'jun', 7: 'mar', 8: 'may', 9: 'nov', 10: 'oct', 11: 'sep'}, 'default': {0: 'no', 1: 'yes'}, 'housing': {0: 'no', 1: 'yes'}, 'loan': {0: 'no', 1: 'yes'}, 'subscribed': {0: 'no', 1: 'yes'}}


In [9]:
# Drop rows whose job, education or contact was recorded as 'unknown'.
# The integer code for 'unknown' is looked up in the mapping `d` built during
# encoding instead of being hard-coded (previously 11, 3 and 2). Those numbers
# come from the alphabetical order of the categories, so they would silently
# point at a different label if the set of categories in the data changed.
for col in ('job', 'education', 'contact'):
    unknown_codes = [code for code, label in d[col].items() if label == 'unknown']
    # A column without an 'unknown' label yields an empty list and keeps every row.
    df = df[~df[col].isin(unknown_codes)]

In [10]:
# Check the frame after the 'unknown' rows were filtered out.
df.head()

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,subscribed
2,15320,27,7,1,1,0,891,1,0,0,18,5,240,1,-1,0,unknown,0
3,43962,57,4,0,2,0,3287,0,0,0,22,6,867,1,84,3,success,1
4,29842,31,9,1,1,0,119,1,0,0,4,3,380,1,-1,0,unknown,0
5,29390,33,4,2,2,0,0,1,0,0,2,3,116,3,-1,0,unknown,0
6,40444,56,5,1,1,0,1044,0,0,1,3,5,353,2,-1,0,unknown,1


In [12]:
# Columns used as model inputs.
# NOTE(review): if `duration` is only known once a contact has ended, it would
# not be available at prediction time - confirm before relying on it.
features = [
    'age',
    'job',
    'marital',
    'education',
    'default',
    'balance',
    'housing',
    'loan',
    'duration',
]

In [13]:
# Target column, kept as a one-element list so selecting it yields a DataFrame.
prediction = [
    'subscribed',
]

In [14]:
# Split the cleaned frame into the feature matrix and the single-column target frame.
X, Y = df[features], df[prediction]

In [15]:
# Preview the feature matrix.
X.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,duration
2,27,7,1,1,0,891,1,0,240
3,57,4,0,2,0,3287,0,0,867
4,31,9,1,1,0,119,1,0,380
5,33,4,2,2,0,0,1,0,116
6,56,5,1,1,0,1044,0,0,353


In [16]:
# Preview the target frame.
Y.head()

Unnamed: 0,subscribed
2,0
3,1
4,0
5,0
6,1


In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Fit a logistic regression with scikit-learn's default settings.
# NOTE(review): the features are unscaled (e.g. `balance`, `duration`), which can
# keep the default solver from converging within its default iteration budget -
# check for a ConvergenceWarning and consider scaling or a larger `max_iter`.
model = LogisticRegression()
# y_train is a single-column DataFrame; flatten it to the 1-D label array fit() expects.
train_labels = y_train.to_numpy().ravel()
model.fit(X_train, train_labels)

In [24]:
# Predict class labels for the held-out test rows.
y_pred = model.predict(X_test)

In [26]:
# Spot-check a slice of the predictions (positions 2 through 12).
print(y_pred[2:13])

[0 0 0 0 0 1 0 0 0 0 0]


In [28]:
# Flatten the single-column label frame to 1-D, then score the predictions.
y_true = y_test.to_numpy().ravel()
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8613172541743971


In [31]:
# Persist the fitted model to disk; joblib.dump returns the list of file names
# it wrote, which the cell displays.
# NOTE(review): the score embedded in the file name is hard-coded and will not
# track `accuracy` if the model is retrained.
model_path = 'logistic_regression_model_0.8613172.pkl'
joblib.dump(model, model_path)

['logistic_regression_model_0.8613172.pkl']