In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised while the
# models are fitted (e.g. convergence or undefined-metric warnings) will not be
# shown either - confirm that is intended.
warnings.filterwarnings("ignore")


In [2]:
# Seed NumPy's global random state so the synthetic data is reproducible
np.random.seed(0)

def generate_data(num_customers=1000):
    """Build a synthetic customer table with ``num_customers`` rows.

    Every random value is drawn from NumPy's global random state, so the
    result depends on the seed set before the call and on the order of the
    draws below.

    Args:
        num_customers: Number of rows (customers) to generate.

    Returns:
        A DataFrame with the columns 'Customer ID' (1..num_customers),
        'Age' (integers from 18 to 80 inclusive), 'Gender', 'Location',
        'Income' (normal draw with mean 50000 and standard deviation 20000,
        truncated to int) and 'Products Purchased/Interested In'.
    """
    gender_options = ['Male', 'Female']
    location_options = ['Urban', 'Suburban', 'Rural']
    product_options = ['Savings Account', 'Credit Card', 'Mortgage', 'Investment Products']

    # Keep the draws in this exact order: it fixes which values a given seed produces.
    age_values = np.random.randint(18, 81, size=num_customers)  # upper bound is exclusive
    gender_values = np.random.choice(gender_options, size=num_customers)
    location_values = np.random.choice(location_options, size=num_customers)
    income_values = np.random.normal(50000, 20000, size=num_customers).astype(int)
    product_values = np.random.choice(product_options, size=num_customers)

    columns = {
        'Customer ID': np.arange(1, num_customers + 1),
        'Age': age_values,
        'Gender': gender_values,
        'Location': location_values,
        'Income': income_values,
        'Products Purchased/Interested In': product_values,
    }
    return pd.DataFrame(columns)

In [3]:
#Augmenting the original dataset with `factor` additional synthetic datasets of the same size
def augment_data(original_data, factor=5):
    """Append ``factor`` freshly generated synthetic batches to ``original_data``.

    Each batch comes from ``generate_data`` and has as many rows as
    ``original_data``, so the result holds ``(factor + 1) * len(original_data)``
    rows: the original rows followed by the synthetic ones.

    ``generate_data`` numbers every batch starting at 1, which would repeat
    the same 'Customer ID' values in each batch. The synthetic batches are
    therefore renumbered to continue after the largest numeric ID found in
    ``original_data`` (or from 1 when there is none).

    Args:
        original_data: DataFrame to extend. It is not modified.
        factor: Number of synthetic batches to append.

    Returns:
        A new DataFrame with a fresh 0..N-1 index.
    """
    id_column = 'Customer ID'
    batch_size = len(original_data)

    # Work out where the synthetic IDs should start.
    next_id = 1
    if id_column in original_data.columns and batch_size > 0:
        highest_id = pd.to_numeric(original_data[id_column], errors='coerce').max()
        if pd.notna(highest_id):
            next_id = int(highest_id) + 1

    frames = [original_data]
    for _ in range(factor):
        batch = generate_data(batch_size)
        # Replace the 1..n IDs of the batch with a range that no earlier frame uses
        batch[id_column] = range(next_id, next_id + batch_size)
        next_id += batch_size
        frames.append(batch)

    #Returns augmented dataset containing original and synthetic data
    return pd.concat(frames, ignore_index=True)

In [4]:
def preprocess_data(data):
    """Return a copy of ``data`` with 'Gender' and 'Location' encoded as integers.

    Encoding: Male -> 0, Female -> 1; Urban -> 0, Suburban -> 1, Rural -> 2.
    Values that are not keys of these mappings (including values that are
    already encoded) become NaN.

    Args:
        data: DataFrame containing 'Gender' and 'Location' columns.

    Returns:
        A new DataFrame; the caller's ``data`` is left untouched.
    """
    gender_codes = {'Male': 0, 'Female': 1}
    location_codes = {'Urban': 0, 'Suburban': 1, 'Rural': 2}

    # Encode a copy so the caller's frame still holds the raw categories afterwards
    encoded = data.copy()
    #Converting categorical variables into numerical representations
    encoded['Gender'] = encoded['Gender'].map(gender_codes)
    encoded['Location'] = encoded['Location'].map(location_codes)
    return encoded

In [5]:
def train_and_evaluate_model(X_train, X_test, y_train, y_test, model, param_grid):
    """Tune ``model`` by grid search, print its test-set metrics and return it.

    Args:
        X_train, y_train: Training features and labels used for the grid search.
        X_test, y_test: Held-out features and labels used for the printed report.
        model: Estimator to tune.
        param_grid: Dictionary specifying the hyperparameter grid for grid search.

    Returns:
        The best estimator found by the search.
    """
    # Grid search scored by accuracy with 3-fold cross validation
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    # Evaluate the winning configuration on the held-out split
    tuned_model = search.best_estimator_
    predictions = tuned_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)

    print(f"Best Hyperparameters: {search.best_params_}")
    print(f"Accuracy: {test_accuracy:.3f}")
    report = classification_report(y_test, predictions)
    print(f"Classification Report:\n{report}\n")

    return tuned_model

In [6]:
def predict_new_customer(model):
    """Prompt for one customer's details and print the model's product prediction.

    Asks for age, gender, location and income on standard input. Surrounding
    whitespace is ignored, and gender/location are matched case-insensitively.
    If any value is invalid, a message is printed and no prediction is made.

    Args:
        model: Fitted estimator whose ``predict`` accepts a one-row DataFrame
            with the columns 'Age', 'Gender', 'Location' and 'Income'.

    Returns:
        None. The prediction is printed, not returned.
    """
    age_str = input("Enter customer's age: ").strip()
    gender = input("Enter customer's gender (Male/Female): ").strip()
    location = input("Enter customer's location (Urban/Suburban/Rural): ").strip()
    income_str = input("Enter customer's income: ").strip()

    # A non-numeric age used to raise an uncaught ValueError; report it instead
    try:
        age = int(age_str)
    except ValueError:
        print("Invalid input for age.")
        return

    # isdecimal() only accepts characters int() can parse (isdigit() also accepts e.g. superscripts)
    if not income_str.isdecimal():
        print("Invalid input for income.")
        return
    income = int(income_str)

    gender_map = {'Male': 0, 'Female': 1}
    location_map = {'Urban': 0, 'Suburban': 1, 'Rural': 2}
    gender_numeric = gender_map.get(gender.capitalize(), -1)
    location_numeric = location_map.get(location.capitalize(), -1)

    if gender_numeric == -1 or location_numeric == -1:
        print("Invalid input for gender or location.")
        return

    # One-row frame with the same encoding as the preprocessed training data
    new_customer_data = pd.DataFrame({
        'Age': [age],
        'Gender': [gender_numeric],
        'Location': [location_numeric],
        'Income': [income]
    })

    predicted_products = model.predict(new_customer_data)
    print("Predicted Products for New Customer:", predicted_products)

#FUTURE STEPS
#Invalid input currently aborts the prediction with a message.
#To improve it, the user could be re-prompted until a valid value is entered.

In [7]:
# Generate and augment data
# generate_data() is called with its default size (num_customers=1000) and
# augment_data() with its default factor (5), which appends that many extra
# synthetic batches to the base frame.
data = generate_data()
augmented_data = augment_data(data)

In [8]:
# Save the augmented dataset to bank_dataset.csv (path relative to the working directory).
# index=False keeps the DataFrame index out of the file.
augmented_data.to_csv("bank_dataset.csv", index=False)

In [9]:
# Load the dataset back from bank_dataset.csv.
# This rebinds `data`: from here on it refers to the frame read from the CSV,
# not to the frame returned by generate_data().
data = pd.read_csv("bank_dataset.csv")

In [10]:
# Data preprocessing: encode 'Gender' and 'Location' as integers.
# NOTE: `data` is rebound to the encoded frame, so running this cell a second
# time without reloading the CSV maps the already-encoded values to NaN.
data = preprocess_data(data)

In [11]:
# Target (y) is the product column; features (X) are every remaining column
# except the customer identifier.
y = data['Products Purchased/Interested In']
X = data.drop(
    columns=[
        'Customer ID',
        'Products Purchased/Interested In',
    ]
)

In [12]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Candidate classifiers, each paired with the hyperparameter grid searched for it:
# display name -> (estimator, param_grid)
models = {
    'Logistic Regression': (
        LogisticRegression(),
        {'C': [0.1, 1, 10]},
    ),
    'Random Forest': (
        RandomForestClassifier(random_state=42),
        {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]},
    ),
    'Gradient Boosting': (
        GradientBoostingClassifier(random_state=42),
        {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 10]},
    ),
    'Support Vector Machine': (
        SVC(random_state=42),
        {'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
    ),
}

In [14]:
# Tune and evaluate each candidate in turn, then ask for one customer's details
# and predict with the tuned estimator (this step waits for keyboard input).
for name in models:
    model, param_grid = models[name]
    print(f"\n{20*'='} {name} {20*'='}")
    best_model = train_and_evaluate_model(
        X_train, X_test, y_train, y_test, model, param_grid
    )
    predict_new_customer(best_model)


Best Hyperparameters: {'C': 0.1}
Accuracy: 0.242
Classification Report:
                     precision    recall  f1-score   support

        Credit Card       0.00      0.00      0.00       291
Investment Products       0.17      0.00      0.01       307
           Mortgage       0.00      0.00      0.00       311
    Savings Account       0.24      0.99      0.39       291

           accuracy                           0.24      1200
          macro avg       0.10      0.25      0.10      1200
       weighted avg       0.10      0.24      0.10      1200


Enter customer's age: 25
Enter customer's gender (Male/Female): male
Enter customer's location (Urban/Suburban/Rural): rural
Enter customer's income: 50000
Predicted Products for New Customer: ['Savings Account']

Best Hyperparameters: {'max_depth': 10, 'n_estimators': 200}
Accuracy: 0.239
Classification Report:
                     precision    recall  f1-score   support

        Credit Card       0.23      0.20      0.21       29