In [3]:
import pandas as pd

import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score



In [4]:
# Load the data
train_df = pd.read_csv(r"C:\Users\plahare\Downloads\Numora_Demo\archive (2)\train.csv", sep=";")
test_df = pd.read_csv(r"C:\Users\plahare\Downloads\Numora_Demo\archive (2)\test.csv", sep=";")


In [5]:
train_df.rename(columns={'y': 'deposit'}, inplace=True)
test_df.rename(columns={'y': 'deposit'}, inplace=True)


In [6]:
## check the null values
train_df.isnull().sum()


age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

In [7]:
categorial_to_onehot = ['job','marital','contact','poutcome']
categorial_to_labelencoder = ["education","housing","deposit","default","loan","month"]


In [8]:
new_df = train_df.copy()
# Encode categorical features using OneHotEncoder
# convert categorical data into one-hot encoded representation
for i in categorial_to_onehot:
    onehot_encoder = OneHotEncoder(sparse_output=False)
    onehot_encoded = onehot_encoder.fit_transform(new_df[[i]])
    onehot_encoded_df = pd.DataFrame(onehot_encoded, columns=onehot_encoder.get_feature_names_out([i]))
    
    new_df = pd.concat([new_df, onehot_encoded_df], axis=1)

train_df = new_df.copy()


In [9]:
# Encode categorical features using LabelEncoder
# convert categorical data into numerical labels
encoder = LabelEncoder()
for categorial_label in categorial_to_labelencoder:
    train_df[categorial_label] = encoder.fit_transform(train_df[categorial_label])



In [10]:
# List of columns to drop
columns_to_drop = ['job', 'marital', 'contact', 'poutcome']

# Create new_data by dropping the specified columns
train_df = train_df.drop(columns_to_drop, axis=1)



In [11]:
# Encode categorical features using OneHotEncoder
# convert categorical data into one-hot encoded representation
for i in categorial_to_onehot:
    onehot_encoder = OneHotEncoder(sparse_output=False)
    onehot_encoded = onehot_encoder.fit_transform(test_df[[i]])
    onehot_encoded_df = pd.DataFrame(onehot_encoded, columns=onehot_encoder.get_feature_names_out([i]))
    
    test_df = pd.concat([test_df, onehot_encoded_df], axis=1)



In [12]:
# Encode categorical features using LabelEncoder
# convert categorical data into numerical labels
encoder = LabelEncoder()
for categorial_label in categorial_to_labelencoder:
    test_df[categorial_label] = encoder.fit_transform(test_df[categorial_label])



In [13]:
# List of columns to drop
columns_to_drop = ['job', 'marital', 'contact', 'poutcome']

# Create new_data by dropping the specified columns
test_df = test_df.drop(columns_to_drop, axis=1)



In [14]:
# List of numerical features
numerical_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']


In [15]:
from scipy.stats import zscore
# CREAT FUNCTION
def remove_outliers(data, numerical_features, threshold=3):

    z_scores = data[numerical_features].apply(zscore)
    outlier_indices = (z_scores > threshold).any(axis=1)
    cleaned_df = data[~outlier_indices]

    return cleaned_df

# Usage example:
cleaned_df = remove_outliers(train_df, numerical_features)
print("Shape of original  dataset:", train_df.shape)
print("Shape of cleaned dataset:", cleaned_df.shape)
prc = ((train_df.shape[0] - cleaned_df.shape[0]) /train_df.shape[0]) *100
print(f'we loss: {round(prc,2)}% from the data')


Shape of original  dataset: (45211, 35)
Shape of cleaned dataset: (40210, 35)
we loss: 11.06% from the data


In [16]:
# Extract features and target variable
X = train_df[['age', 'education', 'default', 'balance', 'housing', 'loan', 'day',
       'month', 'duration', 'campaign', 'pdays', 'previous',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'contact_cellular', 'contact_telephone', 'contact_unknown',
       'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown']]
y = train_df['deposit'].values


In [17]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)


In [18]:
# Standardize numerical features
scaler = StandardScaler()
X_train_scaled= scaler.fit_transform(X_train[numerical_features])
X_test_scaled= scaler.transform(X_test[numerical_features])


In [19]:
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest model
random_forest_model = RandomForestClassifier()

# Train the model
random_forest_model.fit(X_train_scaled, y_train)

# Predict on the test data
y_pred_random_forest = random_forest_model.predict(X_test_scaled)

# Calculate accuracy
accuracy_random_forest = accuracy_score(y_test, y_pred_random_forest)
print(f'Random Forest Accuracy: {accuracy_random_forest:.2f}')


Random Forest Accuracy: 0.90


In [20]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize Gradient Boosting Classifier model
gradient_boosting_model = GradientBoostingClassifier()

# Train the model
gradient_boosting_model.fit(X_train_scaled, y_train)

# Predict on the test data
y_pred_gradient_boosting = gradient_boosting_model.predict(X_test_scaled)

# Calculate accuracy
accuracy_gradient_boosting = accuracy_score(y_test, y_pred_gradient_boosting)
print(f'Gradient Boosting Classifier Accuracy: {accuracy_gradient_boosting:.2f}')


Gradient Boosting Classifier Accuracy: 0.90


In [26]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from scipy.stats import zscore

# Load the data
train_df = pd.read_csv(r"C:\Users\plahare\Downloads\Numora_Demo\archive (2)\train.csv", sep=";")
train_df.rename(columns={'y': 'deposit'}, inplace=True)

# Define categorical features
categorical_to_onehot = ['job', 'marital', 'contact', 'poutcome']
categorical_to_labelencoder = ["education", "housing", "deposit", "default", "loan", "month"]

# Encode categorical features using OneHotEncoder
new_df = train_df.copy()
for i in categorical_to_onehot:
    onehot_encoder = OneHotEncoder(sparse_output=False)
    onehot_encoded = onehot_encoder.fit_transform(new_df[[i]])
    onehot_encoded_df = pd.DataFrame(onehot_encoded, columns=onehot_encoder.get_feature_names_out([i]))
    new_df = pd.concat([new_df, onehot_encoded_df], axis=1)

train_df = new_df.copy()

# Encode categorical features using LabelEncoder
encoder = LabelEncoder()
for categorical_label in categorical_to_labelencoder:
    train_df[categorical_label] = encoder.fit_transform(train_df[categorical_label])

# List of columns to drop
columns_to_drop = ['job', 'marital', 'contact', 'poutcome']

# Create new_data by dropping the specified columns
train_df = train_df.drop(columns_to_drop, axis=1)

# Define numerical features
numerical_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Function to remove outliers
def remove_outliers(data, numerical_features, threshold=3):
    z_scores = data[numerical_features].apply(zscore)
    outlier_indices = (z_scores > threshold).any(axis=1)
    cleaned_df = data[~outlier_indices]
    return cleaned_df

# Usage example:
cleaned_df = remove_outliers(train_df, numerical_features)
print("Shape of original dataset:", train_df.shape)
print("Shape of cleaned dataset:", cleaned_df.shape)
prc = ((train_df.shape[0] - cleaned_df.shape[0]) /train_df.shape[0]) *100
print(f'we loss: {round(prc,2)}% from the data')

# Extract features and target variable
X = cleaned_df.drop('deposit', axis=1)
y = cleaned_df['deposit'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numerical_features])
X_test_scaled = scaler.transform(X_test[numerical_features])

# Initialize Random Forest model
random_forest_model = RandomForestClassifier()

# Train the Random Forest model
random_forest_model.fit(X_train_scaled, y_train)

# Save Random Forest model with MLflow
with mlflow.start_run() as run:
    mlflow.sklearn.log_model(random_forest_model, "random_forest_model")
    random_forest_model_uri = f"runs:/{run.info.run_id}/random_forest_model"

# Save Gradient Boosting model with MLflow
with mlflow.start_run() as run:
    mlflow.sklearn.log_model(gradient_boosting_model, "gradient_boosting_model")
    gradient_boosting_model_uri = f"runs:/{run.info.run_id}/gradient_boosting_model"


Shape of original dataset: (45211, 35)
Shape of cleaned dataset: (40210, 35)
we loss: 11.06% from the data


In [29]:
import mlflow
import pandas as pd
import datetime
import os

# Get current username
model_creator = os.getlogin()

# Get MLflow run creation time
creation_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# Start main MLflow run
with mlflow.start_run() as main_run:
    # Load Random Forest model
    random_forest_model = mlflow.sklearn.load_model(random_forest_model_uri)
    
    # Start nested run for Random Forest model
    with mlflow.start_run(run_name="Random Forest", nested=True):
        # Log metadata for Random Forest model
        mlflow.log_param("ModelName", "Random Forest")
        mlflow.log_param("ModelType", "Classification")
        mlflow.log_param("ModelCreationDate", creation_time)
        mlflow.log_param("ModelCreator", model_creator)
        mlflow.log_param("ModelLatestVersion", "1")  # Change this to the actual version
        mlflow.log_param("ModelParameters", str(random_forest_model.get_params()))
        mlflow.log_param("ModelEnvironment", "Production")
    
    # Load Gradient Boosting model
    gradient_boosting_model = mlflow.sklearn.load_model(gradient_boosting_model_uri)
    
    # Start nested run for Gradient Boosting model
    with mlflow.start_run(run_name="Gradient Boosting", nested=True):
        # Log metadata for Gradient Boosting model
        mlflow.log_param("ModelName", "Gradient Boosting")
        mlflow.log_param("ModelType", "Classification")
        mlflow.log_param("ModelCreationDate", creation_time)
        mlflow.log_param("ModelCreator", model_creator)
        mlflow.log_param("ModelLatestVersion", "1")  # Change this to the actual version
        mlflow.log_param("ModelParameters", str(gradient_boosting_model.get_params()))
        mlflow.log_param("ModelEnvironment", "Production")

    # Extract metadata and create DataFrame
    metadata = {
        'ModelName': ["Random Forest", "Gradient Boosting"],
        'ModelType': ["Classification", "Classification"],
        'ModelCreationDate': [creation_time, creation_time],
        'ModelCreator': [model_creator, model_creator],
        'ModelLatestVersion': ["1", "1"],  # Change this to the actual version
        'ModelParameters': [random_forest_model.get_params(), gradient_boosting_model.get_params()],
        'ModelEnvironment': ["Production", "Production"]
    }
    metadata_df = pd.DataFrame(metadata)

    # Save metadata to CSV
    metadata_df.to_csv("model_metadata_updated.csv", index=False)
