In [4]:
import numpy as np
import logging
import pandas as pd
import sys
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report
from scipy.stats import chi2_contingency
import mlflow
import mlflow.pytorch
from sklearn.model_selection import train_test_split

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

# Custom imports (assuming these paths are correct and the scripts exist)
sys.path.append("/Users/rohit.jishtu/Documents/GitHub/NewMachine/Projects/MLOps/CompleteML/ModelBuild")
from LoadData import *
from Model import *


sys.path.append("/Users/rohit.jishtu/Documents/GitHub/NewMachine/Projects/StatsFunctions")
from Corr import *
from Stats import *




# Step 1: Load Data-------------------------------------------------------------------
file_path='../../ML Projects/Project 7 - Bank Marketing Data/bank/bank.csv'
def load_data(file_path):
    try:
        data = pd.read_csv(file_path, sep=';')
        if data is None:
            raise ValueError("Failed to load data.")
        logger.info(f"Data loaded successfully with shape: {data.shape}")
        return data
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        sys.exit(1)

# Load the data
data = load_data(file_path)

# Step 2: Define Target Column------------------------------------------------------------------------------------------------------------------------------------
data['Target'] = data['y'].apply(lambda x: 1 if x == 'yes' else 0)
logger.info(f"Target distribution:\n{data['Target'].value_counts() / data.shape[0]}")

# Step 3: Run Basic Statistics------------------------------------------------------------------------------------------------------------------------------------
def calculate_basic_stats(data, data_type):
    return CalBasicStats(data, data_type)

outdf_numeric = calculate_basic_stats(data, 'numeric')
outdf_nonnumeric = calculate_basic_stats(data, 'Non-numeric')


print(f'\n# Step 3: Run Basic Statistics COMPLETE\n')
# Step 4: Feature Selection------------------------------------------------------------------------------------------------------------------------------------

keys=['y']
Selected =[]


outdf_nonnumeric=outdf_nonnumeric[(outdf_nonnumeric['UniqueCounts']>1)]
Columns2=outdf_nonnumeric['ColumnName'].to_list()


# 1. categorical Feature Selction Stratergy 
selected_cat_features = []
for feature in Columns2:
    contingency_table = pd.crosstab(data[feature], data['y'])
    chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)
    if p_value < 0.05:
        selected_cat_features.append(feature)


# 2 . Numerical Feature Selection Category 

outdf_numeric=outdf_numeric[(outdf_numeric['%Zero']<40) & (outdf_numeric['%Missing'] < 30)]
Selected.extend(outdf_numeric['ColumnName'].to_list())

#  Correlation 
sys.path.append("/Users/rohit.jishtu/Documents/GitHub/NewMachine/Projects/StatsFunctions/")
from Corr import * 

Selected.append('Target')
CorrelatedList,CorrelationDF=CorrAttributesList(data[Selected],'Target',0.02)
CorrelatedList=CorrelatedList.to_list()


# remove all feature from Selected list which are not in Corelation list 
for column in Selected:
    if column not in CorrelatedList:
        print(f'{column=} is not crossing correlation limit')
        Selected.remove(column)


# print(data.columns)
# print(Selected)

# Modifying data based on selected.
TrainingData=data[Selected]
for column in selected_cat_features:
    dummy_variables = pd.get_dummies(data[column], drop_first=True)
    prefix = str(column+'_')
    dummy_variables = dummy_variables.astype(int)
    dummy_variables = dummy_variables.add_prefix(prefix)
    TrainingData = pd.concat([TrainingData , dummy_variables], axis=1)


print(TrainingData.columns)  # Verify the merged data






# Step 3: Run Basic Statistics COMPLETE

column='day' is not crossing correlation limit
Index(['age', 'balance', 'duration', 'campaign', 'pdays', 'Target',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_married', 'marital_single', 'education_secondary',
       'education_tertiary', 'education_unknown', 'housing_yes', 'loan_yes',
       'contact_telephone', 'contact_unknown', 'month_aug', 'month_dec',
       'month_feb', 'month_jan', 'month_jul', 'month_jun', 'month_mar',
       'month_may', 'month_nov', 'month_oct', 'month_sep', 'poutcome_other',
       'poutcome_success', 'poutcome_unknown', 'y_yes'],
      dtype='object')


In [7]:

# Step 5: Model Building and Training------------------------------------------------------------------------------------------------------------------------------------

# Prepare the features and target
Targets = ['Target','y','y_yes']
FeatureList= [x for x in TrainingData.columns if x not in Targets]
X=TrainingData[FeatureList] 
Y=data['Target'] 


In [8]:
X

Unnamed: 0,age,balance,duration,campaign,pdays,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,30,1787,79,1,-1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,33,4789,220,1,339,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,35,1350,185,1,330,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,30,1476,199,4,-1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
4,59,0,226,1,-1,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,-333,329,5,-1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4517,57,-3313,153,1,-1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4518,57,295,151,11,-1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4519,28,1137,129,4,211,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [None]:

X = X.fillna(0)

X = X.to_numpy(dtype=np.float32)  # Convert features to NumPy array
Y = Y.to_numpy(dtype=np.float32)