In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import mlflow
import mlflow.pyfunc


In [None]:
data = pd.read_csv(r"D:\emi_prediction_app\emi_prediction_dataset.csv",nrows=1000)

# data=data.head(200000)
numerical_cols = []
categorical_cols = []

for col in data.columns:
    if data[col].dtype in ['int64', 'float64']:
        numerical_cols.append(col)
    else:
        categorical_cols.append(col)


In [3]:
for col in ['age','monthly_salary','bank_balance']:
    if col in categorical_cols:
        categorical_cols.remove(col)
        numerical_cols.append(col)


In [4]:
for col in numerical_cols:
    data[col] = data[col].astype(str).str.extract(r'(\d+\.?\d*)')
    data[col] = data[col].astype(float)
    data[col].fillna(data[col].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting va

In [5]:
for col in categorical_cols:
    data[col].fillna(data[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


In [6]:
education_order = ['High School','Graduate','Post Graduate','Professional']
oe = OrdinalEncoder(categories=[education_order])
data['education'] = oe.fit_transform(data[['education']])


In [7]:
data['gender'].unique()

array(['Female', 'Male', 'female', 'male', 'M', 'MALE', 'F', 'FEMALE'],
      dtype=object)

In [8]:
data['gender']=data['gender'].map({'Female':'F','Male':'M','female':'F','male':'M','MALE':'M','F':'F','FEMALE':'F'})

In [9]:
nominal_cols = categorical_cols.copy()
nominal_cols.remove('education')

data = pd.get_dummies(data, columns=nominal_cols, drop_first=True)


In [10]:
def classify(row):
    if row['emi_eligibility_High_Risk'] == 1:
        return 'High_Risk'
    elif row['emi_eligibility_Not_Eligible'] == 1:
        return 'Not_Eligible'
    else:
        return 'Eligible'

data['emi_eligibility'] = data.apply(classify, axis=1)


In [11]:
for col in data.columns:
    if data[col].dtype=='bool':
        data[col]=data[col].astype('int')

In [12]:
y_class = data['emi_eligibility']
y_reg = data['max_monthly_emi']

X = data.drop(columns=['emi_eligibility','emi_eligibility_High_Risk',
                       'emi_eligibility_Not_Eligible','max_monthly_emi'])


In [13]:
data.head()

Unnamed: 0,age,education,monthly_salary,years_of_employment,monthly_rent,family_size,dependents,school_fees,college_fees,travel_expenses,...,house_type_Own,house_type_Rented,existing_loans_Yes,emi_scenario_Education EMI,emi_scenario_Home Appliances EMI,emi_scenario_Personal Loan EMI,emi_scenario_Vehicle EMI,emi_eligibility_High_Risk,emi_eligibility_Not_Eligible,emi_eligibility
0,38.0,3.0,82600.0,0.9,20000.0,3.0,2.0,0.0,0.0,7200.0,...,0,1,1,0,0,1,0,0,1,Not_Eligible
1,38.0,1.0,21500.0,7.0,0.0,2.0,1.0,5100.0,0.0,1400.0,...,0,0,1,0,0,0,0,0,1,Not_Eligible
2,38.0,3.0,86100.0,5.8,0.0,4.0,3.0,0.0,0.0,10200.0,...,1,0,0,1,0,0,0,0,0,Eligible
3,58.0,0.0,66800.0,2.2,0.0,5.0,4.0,11400.0,0.0,6200.0,...,1,0,0,0,0,0,1,0,0,Eligible
4,48.0,3.0,57300.0,3.4,0.0,4.0,3.0,9400.0,21300.0,3600.0,...,0,0,0,0,1,0,0,0,1,Not_Eligible


In [14]:
data.columns

Index(['age', 'education', 'monthly_salary', 'years_of_employment',
       'monthly_rent', 'family_size', 'dependents', 'school_fees',
       'college_fees', 'travel_expenses', 'groceries_utilities',
       'other_monthly_expenses', 'current_emi_amount', 'credit_score',
       'bank_balance', 'emergency_fund', 'requested_amount',
       'requested_tenure', 'max_monthly_emi', 'gender_M',
       'marital_status_Single', 'employment_type_Private',
       'employment_type_Self-employed', 'company_type_MNC',
       'company_type_Mid-size', 'company_type_Small', 'company_type_Startup',
       'house_type_Own', 'house_type_Rented', 'existing_loans_Yes',
       'emi_scenario_Education EMI', 'emi_scenario_Home Appliances EMI',
       'emi_scenario_Personal Loan EMI', 'emi_scenario_Vehicle EMI',
       'emi_eligibility_High_Risk', 'emi_eligibility_Not_Eligible',
       'emi_eligibility'],
      dtype='object')

Classification

In [15]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_class_encoded = le.fit_transform(y_class)


In [31]:
y_class

0      Not_Eligible
1      Not_Eligible
2          Eligible
3          Eligible
4      Not_Eligible
           ...     
995    Not_Eligible
996        Eligible
997    Not_Eligible
998    Not_Eligible
999       High_Risk
Name: emi_eligibility, Length: 1000, dtype: object

In [30]:
y_class_encoded

array([2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0,
       2, 2, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2,
       2, 1, 2, 0, 2, 2, 0, 2, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2,
       2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0, 2, 0, 2,
       2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 1, 2,
       2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2,
       2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2,
       2, 2, 2, 2, 1, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2,
       0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0,
       2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0,

In [16]:
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X, y_class_encoded)


Classification

In [17]:
X_train_c, X_test_c, y_train_c, y_test_c =train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


Regression

In [18]:
X_train_r, X_test_r, y_train_r, y_test_r =train_test_split(X, y_reg, test_size=0.2, random_state=42)


In [19]:
numerical_cols_c = [col for col in X_train_c.columns if X_train_c[col].dtype == 'float64']


In [20]:
numerical_cols_r = [col for col in X_train_r.columns if X_train_r[col].dtype == 'float64']


In [21]:
# scaling_col=[]
# for col in X.columns:
#     if X[col].dtype=='float64':
#         scaling_col.append(col)

In [22]:
scaler_c = StandardScaler()
X_train_c[numerical_cols_c] = scaler_c.fit_transform(X_train_c[numerical_cols_c])
X_test_c[numerical_cols_c] = scaler_c.transform(X_test_c[numerical_cols_c])

In [23]:
scaler_r = StandardScaler()
X_train_r[numerical_cols_r] = scaler_r.fit_transform(X_train_r[numerical_cols_r])
X_test_r[numerical_cols_r] = scaler_r.transform(X_test_r[numerical_cols_r])


In [32]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_absolute_error
import mlflow.sklearn

In [None]:
# ============================
# Classification MODEL
# ============================
mlflow.end_run()
# ----- LOGISTIC REGRESSION -----
with mlflow.start_run(run_name="Logistic_Regression"):
    log_clf = LogisticRegression(max_iter=500)
    log_clf.fit(X_train_c, y_train_c)
    y_pred = log_clf.predict(X_test_c)
    acc=accuracy_score(y_test_c, y_pred)
    f1=f1_score(y_test_c, y_pred, average='weighted')
    print("Accuracy:",acc)
    print("F1-score:",f1)
    mlflow.log_metric("accuracy", accuracy_score(y_test_c, y_pred))
    mlflow.log_metric("f1", f1_score(y_test_c, y_pred, average='weighted'))
    mlflow.sklearn.log_model(log_clf, "logistic_model", registered_model_name="Logistic_Classification_Model")




Accuracy: 0.9207708779443254
F1-score: 0.9210597771688036


Registered model 'Logistic_Classification_Model' already exists. Creating a new version of this model...
Created version '2' of model 'Logistic_Classification_Model'.


In [36]:
mlflow.end_run()
with mlflow.start_run(run_name="XGB_Classifier"):
    xgb_clf = XGBClassifier(n_estimators=200, use_label_encoder=False, eval_metric='mlogloss')
    xgb_clf.fit(X_train_c, y_train_c)
    y_pred = xgb_clf.predict(X_test_c)
    acc= accuracy_score(y_test_c, y_pred)
    f1= f1_score(y_test_c, y_pred, average='weighted')
    print("Accuracy:",acc)
    print("F1-score:",f1)
    mlflow.log_metric("accuracy", accuracy_score(y_test_c, y_pred))
    mlflow.log_metric("f1", f1_score(y_test_c, y_pred, average='weighted'))
    mlflow.sklearn.log_model(xgb_clf, "xgb_classifier_model", registered_model_name="XGB_Classification_Model")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.9700214132762313
F1-score: 0.9700122757291482


Registered model 'XGB_Classification_Model' already exists. Creating a new version of this model...
Created version '2' of model 'XGB_Classification_Model'.


In [None]:


from sklearn.ensemble import RandomForestClassifier
import mlflow
import mlflow.sklearn
mlflow.end_run()
mlflow.set_experiment("emi_prediction_classification1")

with mlflow.start_run(run_name="RF_Classifier"):

    clf = RandomForestClassifier(n_estimators=200, random_state=42)
    clf.fit(X_train_c, y_train_c)

    y_pred_c = clf.predict(X_test_c)

    acc = accuracy_score(y_test_c, y_pred_c)
    f1 = f1_score(y_test_c, y_pred_c, average='weighted')

    print("Accuracy:", acc)
    print("F1 Score:", f1)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)

    # Save model
    mlflow.sklearn.log_model(
        clf,
        "classification_model",
        registered_model_name="EMI_Classification_Model"
    )




Accuracy: 0.9507494646680942
F1 Score: 0.9508398412250978


Registered model 'EMI_Classification_Model' already exists. Creating a new version of this model...
Created version '3' of model 'EMI_Classification_Model'.


In [39]:
import joblib

In [40]:
joblib.dump(clf,"clf_model.pkl")
# print("clf model pickled")

['clf_model.pkl']

In [27]:
# ============================
# Regression MODEL
# ============================

from sklearn.ensemble import RandomForestRegressor
import mlflow
import mlflow.sklearn

mlflow.set_experiment("emi_prediction_regression1")

with mlflow.start_run(run_name="RF_Regression"):

    reg = RandomForestRegressor(n_estimators=200, random_state=42)
    reg.fit(X_train_r, y_train_r)

    y_pred_r = reg.predict(X_test_r)

    r2 = r2_score(y_test_r, y_pred_r)
    mae = mean_absolute_error(y_test_r, y_pred_r)

    print("R2 Score:", r2)
    print("MAE:", mae)

    mlflow.log_metric("r2_score", r2)
    mlflow.log_metric("mae", mae)

    # Save regression model
    mlflow.sklearn.log_model(
        reg,
        "regression_model",
        registered_model_name="EMI_Regression_Model"
    )




R2 Score: 0.7902869499943763
MAE: 2171.660957


Registered model 'EMI_Regression_Model' already exists. Creating a new version of this model...
Created version '3' of model 'EMI_Regression_Model'.


In [41]:
joblib.dump(reg, "emi_reg_model.pkl")

['emi_reg_model.pkl']

In [37]:
mlflow.end_run()
with mlflow.start_run(run_name="Linear_Regression"):
    lin_reg = LinearRegression()
    lin_reg.fit(X_train_r, y_train_r)
    y_pred = lin_reg.predict(X_test_r)
    print("r2", r2_score(y_test_r, y_pred))
    print("mae", mean_absolute_error(y_test_r, y_pred))
    mlflow.log_metric("r2", r2_score(y_test_r, y_pred))
    mlflow.log_metric("mae", mean_absolute_error(y_test_r, y_pred))
    mlflow.sklearn.log_model(lin_reg, "linear_model", registered_model_name="Linear_Regression_Model")




r2 0.6844485754291001
mae 2966.62863560372


Successfully registered model 'Linear_Regression_Model'.
Created version '1' of model 'Linear_Regression_Model'.


In [38]:
mlflow.end_run()
with mlflow.start_run(run_name="XGB_Regressor"):
    xgb_reg = XGBRegressor(n_estimators=200)
    xgb_reg.fit(X_train_r, y_train_r)
    y_pred = xgb_reg.predict(X_test_r)
    print("r2", r2_score(y_test_r, y_pred))
    print("mae", mean_absolute_error(y_test_r, y_pred))
    mlflow.log_metric("r2", r2_score(y_test_r, y_pred))
    mlflow.log_metric("mae", mean_absolute_error(y_test_r, y_pred))
    mlflow.sklearn.log_model(xgb_reg, "xgb_regressor_model", registered_model_name="XGB_Regression_Model")



r2 0.8362091734809063
mae 1912.448798762512


Successfully registered model 'XGB_Regression_Model'.
Created version '1' of model 'XGB_Regression_Model'.


In [28]:
X.columns

Index(['age', 'education', 'monthly_salary', 'years_of_employment',
       'monthly_rent', 'family_size', 'dependents', 'school_fees',
       'college_fees', 'travel_expenses', 'groceries_utilities',
       'other_monthly_expenses', 'current_emi_amount', 'credit_score',
       'bank_balance', 'emergency_fund', 'requested_amount',
       'requested_tenure', 'gender_M', 'marital_status_Single',
       'employment_type_Private', 'employment_type_Self-employed',
       'company_type_MNC', 'company_type_Mid-size', 'company_type_Small',
       'company_type_Startup', 'house_type_Own', 'house_type_Rented',
       'existing_loans_Yes', 'emi_scenario_Education EMI',
       'emi_scenario_Home Appliances EMI', 'emi_scenario_Personal Loan EMI',
       'emi_scenario_Vehicle EMI'],
      dtype='object')

In [29]:
# import streamlit as st
# import pandas as pd
# import numpy as np
# import mlflow.pyfunc

# st.title("EMI Eligibility Prediction App")

# # Load both models
# clf_model = mlflow.pyfunc.load_model("models:/EMI_Classification_Model/1")
# # reg_model = mlflow.pyfunc.load_model("runs:/058c8021ff0c47c7ae9d06ca92a0defd/regression_model")

# # User input form
# age = st.number_input("Age", 18, 80)
# salary = st.number_input("Monthly Salary", 1000, 500000)
# bank_balance = st.number_input("Bank Balance", 0, 10000000)
# education = st.selectbox("Education", ["High School", "Graduate", "Post Graduate", "Professional"])
# gender = st.selectbox("Gender", ["M", "F"])

# # Convert manually (same as training!)
# edu_map = {'High School':0, 'Graduate':1, 'Post Graduate':2, 'Professional':3}

# input_dict = {
#     "age": age,
#     "monthly_salary": salary,
#     "bank_balance": bank_balance,
#     "education": edu_map[education],
#     "gender_M": 1 if gender=="M" else 0
# }

# # Convert to DataFrame
# df = pd.DataFrame([input_dict])

# if st.button("Check Eligibility"):
#     pred_class = clf_model.predict(df)[0]
#     # pred_emi = reg_model.predict(df)[0]

#     st.subheader("Prediction Result:")
#     st.write("**Eligibility:**", pred_class)
#     # st.write("**Max EMI Allowed:** ₹", int(pred_emi))
