In [7]:

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

csv = 'bank.csv'
data = pd.read_csv(csv, delimiter=',')

data.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [8]:
# Encode categorical variables
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'deposit']
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Replace 'unknown' values with 'other' in relevant columns
data['job'] = data['job'].replace(0, 12)  # Assuming 0 is 'unknown' in the job column
data['education'] = data['education'].replace(0, 3)  # Assuming 0 is 'unknown' in the education column

# Display the first few rows of the preprocessed dataset
data.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,12,1,1,0,2343,1,0,2,5,8,1042,1,-1,0,3,1
1,56,12,1,1,0,45,0,0,2,5,8,1467,1,-1,0,3,1
2,41,9,1,1,0,1270,1,0,2,5,8,1389,1,-1,0,3,1
3,55,7,1,1,0,2476,1,0,2,5,8,579,1,-1,0,3,1
4,54,12,1,2,0,184,0,0,2,5,8,673,2,-1,0,3,1


In [9]:
# Split data into features (X) and target (y)
X = data.drop(columns=['deposit'])
y = data['deposit']

# Apply SMOTE to handle class imbalance (if needed)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)


In [10]:
# Define the parameter grid for Decision Tree
param_grid = {
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'criterion': ['gini', 'entropy']
}

# Perform grid search for hyperparameter tuning
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

best_model = grid_search.best_estimator_


Best parameters found:  {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 20}


In [11]:

best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)


Accuracy: 0.82

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.81      0.81      1742
           1       0.82      0.82      0.82      1782

    accuracy                           0.82      3524
   macro avg       0.82      0.82      0.82      3524
weighted avg       0.82      0.82      0.82      3524



In [12]:

y_pred = best_model.predict(X_test)

predictions = pd.DataFrame(X_test, columns=X.columns)
predictions['Predicted Deposit'] = y_pred
predictions['Predicted Deposit'] = predictions['Predicted Deposit'].map({0: 'No', 1: 'Yes'})  # Map to 'Yes'/'No'

print(predictions.head())


       age  job  marital  education  default  balance  housing  loan  contact  \
11470   44    7        0          1        0     4335        0     0        0   
8742    44    2        1          1        0      860        1     1        0   
1277    44    4        1          3        0      558        0     0        0   
9112    47   10        1          1        1        0        0     0        0   
10162   34    1        1          1        0     1257        1     0        2   

       day  month  duration  campaign  pdays  previous  poutcome  \
11470   21      8       525         1     -1         0         3   
8742     6      8        96         2    295         5         0   
1277    19      3       268         5     -1         0         3   
9112    18      9        81         2     -1         0         3   
10162   26      8       109         7     -1         0         3   

      Predicted Deposit  
11470               Yes  
8742                 No  
1277                 No  


In [18]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def predict_purchase():
    categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

    label_encoders = {
        'job': LabelEncoder(),
        'marital': LabelEncoder(),
        'education': LabelEncoder(),
        'default': LabelEncoder(),
        'housing': LabelEncoder(),
        'loan': LabelEncoder(),
        'contact': LabelEncoder(),
        'month': LabelEncoder(),
        'poutcome': LabelEncoder()
    }

    input_data = {
        'age': int(input("Enter age: ")),
        'balance': float(input("Enter balance: ")),
        'day': int(input("Enter day of the month (1-31): ")),
        'duration': int(input("Enter duration of the call: ")),
        'campaign': int(input("Enter the number of contacts performed: ")),
        'pdays': int(input("Enter number of days since last contact: ")),
        'previous': int(input("Enter number of contacts before this campaign: ")),
        'job': input("Enter job (e.g., 'admin.', 'blue-collar', 'student', etc.): "),
        'marital': input("Enter marital status (e.g., 'married', 'single', 'divorced'): "),
        'education': input("Enter education (e.g., 'basic.4y', 'high.school', 'university.degree', etc.): "),
        'default': input("Enter default status (yes/no): "),
        'housing': input("Enter housing loan status (yes/no): "),
        'loan': input("Enter personal loan status (yes/no): "),
        'contact': input("Enter contact communication type (e.g., 'cellular', 'telephone', 'unknown'): "),
        'month': input("Enter month of last contact (e.g., 'jan', 'feb', 'mar', etc.): "),
        'poutcome': input("Enter outcome of the previous marketing campaign (e.g., 'failure', 'nonexistent', 'success'): "),
    }

    for col in categorical_columns:
        input_data[col] = label_encoders[col].fit_transform([input_data[col]])[0]

    input_df = pd.DataFrame([input_data], columns=X_train.columns)
    prediction = best_model.predict(pd.DataFrame([input_data], columns=X_train.columns))

    if prediction[0] == 1:
        print("Predictionn: The customer will subscribe to the product/service (Yes).")
    else:

        print("Prediction: The customer will not subscribe to the product/service (No).")

predict_purchase()


Enter age: 20
Enter balance: 5000
Enter day of the month (1-31): 14
Enter duration of the call: 200
Enter the number of contacts performed: 120
Enter number of days since last contact: 30
Enter number of contacts before this campaign: 12
Enter job (e.g., 'admin.', 'blue-collar', 'student', etc.): student
Enter marital status (e.g., 'married', 'single', 'divorced'): singlle
Enter education (e.g., 'basic.4y', 'high.school', 'university.degree', etc.): university.degree
Enter default status (yes/no): no
Enter housing loan status (yes/no): no
Enter personal loan status (yes/no): no
Enter contact communication type (e.g., 'cellular', 'telephone', 'unknown'): telephone
Enter month of last contact (e.g., 'jan', 'feb', 'mar', etc.): mar
Enter outcome of the previous marketing campaign (e.g., 'failure', 'nonexistent', 'success'): success
Prediction: The customer will subscribe to the product/service (Yes).
