In [34]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [35]:
df1 = pd.read_csv('train.csv',sep=";")
df2 = pd.read_csv('test.csv',sep=";")

In [36]:
# Combining both train Test Datasets
df = pd.concat([df1,df2],ignore_index=True)

In [37]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [38]:
df.shape

(49732, 17)

In [39]:
# Find null values in dataset
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [40]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,49732.0,49732.0,49732.0,49732.0,49732.0,49732.0,49732.0
mean,40.957472,1367.761562,15.816315,258.690179,2.766549,40.15863,0.576892
std,10.615008,3041.608766,8.31568,257.743149,3.099075,100.127123,2.254838
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1431.0,21.0,320.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [41]:
# Checking data types
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [42]:
x = df.drop(['y'],axis = 1)
y =df.y

In [43]:
y.head()

0    no
1    no
2    no
3    no
4    no
Name: y, dtype: object

In [44]:
# Store all categorical (text) column into dataframe
categorical_columns = df.select_dtypes(include=['object']).columns

In [45]:
#Import labelencoder for converting string to number.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [46]:
#Converting Categorical columns in Numeric for training M.L. model
for col in categorical_columns:
    df[col]=le.fit_transform(df[col])

In [47]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3,0
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3,0
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3,0
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3,0
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3,0


In [48]:
#Define independent variable into x and dependent into y.

#Independents variables

x1= df.drop(['y'],axis=1)
x1.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3


In [49]:
#Dependent variable
y1=df.y
y1.head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int32

In [50]:
#Find best parameters using hyper parameter tuning

In [51]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [52]:
# Find the best parameters.
model_params = {
  
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [0,1, 5, 10]
        }
    }
}

In [53]:
scores = []

for model_name, mp in model_params.items():

    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False) 
    clf.fit(x1, y1)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })


In [54]:
df1 = pd.DataFrame(scores)
df1

Unnamed: 0,model,best_score,best_params
0,random_forest,0.840584,{'n_estimators': 10}


In [55]:
# Create a Pipeline to Encode Categorical Features Numerically and Train a Model

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder

# Define the pipeline
clf = Pipeline([
    ('encodef', OneHotEncoder()),  # Encoding categorical features
    ('mod', RandomForestRegressor(n_estimators=10))  # Random Forest model
])

In [56]:
clf.fit(x,y1)

In [57]:
clf.score(x,y1)

0.8773006098042211

#### Our model achieves an accuracy of 87%.

In [58]:
columns = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 
           'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']

new_data_points = [

    [59, 'admin.', 'married', 'secondary', 'no', 2343, 'yes', 'no', 'unknown', 5, 'may', 1042, 1, -1, 0, 'unknown']
]

input = pd.DataFrame(new_data_points, columns=columns)

In [59]:
# Test the model based on above input.

prediction= clf.predict(input)[0]

In [60]:
probability_percentage = prediction * 100
print("The probability of this lead converting into a customer is :",probability_percentage,'%')

The probability of this lead converting into a customer is : 70.0 %


In [73]:
columns = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 
           'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']

new_data_points = [

    [35, 'unknown', 'married', 'unknown', 'no', 2500, 'yes', 'no', 'unknown', 5, 'may', 1048, 1, 1, 1, 'unknown']
]

input = pd.DataFrame(new_data_points, columns=columns)

In [74]:
# Test the model based on above input.

prediction= clf.predict(input)[0]

In [75]:
probability_percentage = prediction * 100
print("The probability of this lead converting into a customer is :",probability_percentage,'%')

The probability of this lead converting into a customer is : 10.0 %


In [76]:
columns = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 
           'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']

new_data_points = [

    [35, 'unknown', 'married', 'unknown', 'no', 2500, 'yes', 'no', 'unknown', 5, 'may', 1048, 1, 1, 1, 'unknown']
]

input = pd.DataFrame(new_data_points, columns=columns)

In [77]:
# Test the model based on above input.

prediction= clf.predict(input)[0]

In [78]:
probability_percentage = prediction * 100
print("The probability of this lead converting into a customer is :",probability_percentage,'%')

The probability of this lead converting into a customer is : 10.0 %
