In [1]:
# Load the libraries
import pandas as pd #To work with the dataset
import numpy as np #Math library
import seaborn as sns #Plotting library that uses matplotlib in the background
import matplotlib.pyplot as plt #To adjust plot parameters for seaborn figures

# Import the data; index_col=0 uses the first CSV column as the row index
df_credit = pd.read_csv("german_credit_data.csv",index_col=0)

In [2]:
# Preview the first rows of the raw pandas DataFrame
df_credit.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [3]:
# Check for missing values, column dtypes and the overall shape of the data.
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that only appends a stray "None" to the output).
df_credit.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 85.9+ KB
None


In [4]:
# Number of distinct values per column, followed by a look at the first rows
for summary in (df_credit.nunique(), df_credit.head()):
    print(summary)

Age                  53
Sex                   2
Job                   4
Housing               3
Saving accounts       4
Checking account      3
Credit amount       921
Duration             33
Purpose               8
Risk                  2
dtype: int64
   Age     Sex  Job Housing Saving accounts Checking account  Credit amount  \
0   67    male    2     own             NaN           little           1169   
1   22  female    2     own          little         moderate           5951   
2   49    male    1     own          little              NaN           2096   
3   45    male    2    free          little           little           7882   
4   53    male    2    free          little           little           4870   

   Duration              Purpose  Risk  
0         6             radio/TV  good  
1        48             radio/TV   bad  
2        12            education  good  
3        42  furniture/equipment  good  
4        24                  car   bad  


In [5]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=fe142a13876a648bae8a870b3e762ef38728f695b8bd9c59bebd41dc75251633
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [6]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col

In [7]:
# Load the libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, countDistinct, lit

# Start (or reuse) the Spark session used for the preprocessing steps
spark = (
    SparkSession.builder
    .appName("CreditDataProcessing")
    .getOrCreate()
)

# Read the CSV with its header row and let Spark infer the column types
df_credit = spark.read.csv("german_credit_data.csv", header=True, inferSchema=True)

# Lower-case every column name
lowercase_columns = [col(name).alias(name.lower()) for name in df_credit.columns]
df_credit = df_credit.select(lowercase_columns)

# Give the multi-word columns snake_case names (no spaces)
snake_case_names = {
    "saving accounts": "savings_account",
    "checking account": "checking_account",
    "credit amount": "credit_amount",
}
for old_name, new_name in snake_case_names.items():
    df_credit = df_credit.withColumnRenamed(old_name, new_name)

In [8]:
# On the Spark DataFrame, head() without an argument returns only the first Row
df_credit.head()

Row(_c0=0, age=67, sex='male', job=2, housing='own', savings_account='NA', checking_account='little', credit_amount=1169, duration=6, purpose='radio/TV', risk='good')

In [9]:
# Show the inferred data type of every column
df_credit.printSchema()

# Look up a single column's type through its schema field
checking_account_field = df_credit.schema["checking_account"]
checking_account_data_type = checking_account_field.dataType
print("Data type of checking_account column:", checking_account_data_type)

root
 |-- _c0: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- job: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- savings_account: string (nullable = true)
 |-- checking_account: string (nullable = true)
 |-- credit_amount: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- purpose: string (nullable = true)
 |-- risk: string (nullable = true)

Data type of checking_account column: StringType()


In [10]:
from pyspark.sql.functions import col, count, when, lit

# Missing entries arrive from the CSV as the literal string "NA" in these columns.
MISSING_TOKEN = "NA"


def most_frequent_value(df, column, missing_token=MISSING_TOKEN):
    """Return the most common value of `column`, ignoring the missing-value token.

    Excluding the token before ranking removes the need to guess at which position
    of the frequency table ("[0]" vs "[1]") the first real category sits.

    Args:
        df: Spark DataFrame to inspect.
        column: Name of the string column to summarise.
        missing_token: String that marks a missing value in `column`.

    Returns:
        The most frequent non-missing value; ties are broken alphabetically.

    Raises:
        ValueError: If the column holds no non-missing values.
    """
    top_row = (
        df.where(col(column) != missing_token)
        .groupBy(column)
        .agg(count("*").alias("count"))
        .orderBy(col("count").desc(), col(column).asc())
        .first()
    )
    if top_row is None:
        raise ValueError(f"Column {column!r} has no non-missing values to impute from")
    return top_row[column]


# Mode of each column, computed over the non-missing entries only
most_freq_savings = most_frequent_value(df_credit, "savings_account")
most_freq_checking = most_frequent_value(df_credit, "checking_account")

# Replace the "NA" placeholder in each column with that column's mode
for column_name, fill_value in (
    ("savings_account", most_freq_savings),
    ("checking_account", most_freq_checking),
):
    df_credit = df_credit.withColumn(
        column_name,
        when(col(column_name) == MISSING_TOKEN, lit(fill_value)).otherwise(col(column_name)),
    )

In [11]:
# Print the leading rows as a text table to check that the "NA" placeholders are gone
df_credit.show()

+---+---+------+---+-------+---------------+----------------+-------------+--------+-------------------+----+
|_c0|age|   sex|job|housing|savings_account|checking_account|credit_amount|duration|            purpose|risk|
+---+---+------+---+-------+---------------+----------------+-------------+--------+-------------------+----+
|  0| 67|  male|  2|    own|         little|          little|         1169|       6|           radio/TV|good|
|  1| 22|female|  2|    own|         little|        moderate|         5951|      48|           radio/TV| bad|
|  2| 49|  male|  1|    own|         little|          little|         2096|      12|          education|good|
|  3| 45|  male|  2|   free|         little|          little|         7882|      42|furniture/equipment|good|
|  4| 53|  male|  2|   free|         little|          little|         4870|      24|                car| bad|
|  5| 35|  male|  1|   free|         little|          little|         9055|      36|          education|good|
|  6| 53| 

In [12]:
# Derive two categorical features by bucketing numeric columns.
# Column.between() includes both of its bounds.
amount = col("credit_amount")
credit_amount_bucket = (
    when(amount < 5000, "low")
    .when(amount.between(5000, 10000), "medium")
    .otherwise("high")
)

age = col("age")
age_bucket = (
    when(age < 30, "young")
    .when(age.between(30, 60), "middle-aged")
    .otherwise("senior")
)

df_credit = (
    df_credit
    .withColumn("credit_amount_range", credit_amount_bucket)
    .withColumn("age_group", age_bucket)
)

In [13]:
# Print the leading rows to check the new credit_amount_range and age_group columns
df_credit.show()

+---+---+------+---+-------+---------------+----------------+-------------+--------+-------------------+----+-------------------+-----------+
|_c0|age|   sex|job|housing|savings_account|checking_account|credit_amount|duration|            purpose|risk|credit_amount_range|  age_group|
+---+---+------+---+-------+---------------+----------------+-------------+--------+-------------------+----+-------------------+-----------+
|  0| 67|  male|  2|    own|         little|          little|         1169|       6|           radio/TV|good|                low|     senior|
|  1| 22|female|  2|    own|         little|        moderate|         5951|      48|           radio/TV| bad|             medium|      young|
|  2| 49|  male|  1|    own|         little|          little|         2096|      12|          education|good|                low|middle-aged|
|  3| 45|  male|  2|   free|         little|          little|         7882|      42|furniture/equipment|good|             medium|middle-aged|
|  4| 

In [14]:
# ML flow
!pip install mlflow --quiet
!pip install pyngrok --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.2/20.2 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.6/147.6 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.2/128.2 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.4/84.4 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━

In [15]:
# Invoke the MLflow CLI without arguments; it prints its usage text, which shows
# the command is available after the install.
!mlflow

Usage: mlflow [OPTIONS] COMMAND [ARGS]...

Options:
  --version  Show the version and exit.
  --help     Show this message and exit.

Commands:
  artifacts    Upload, list, and download artifacts from an MLflow...
  db           Commands for managing an MLflow tracking database.
  deployments  Deploy MLflow models to custom targets.
  doctor       Prints out useful information for debugging issues with MLflow.
  experiments  Manage experiments.
  gc           Permanently delete runs in the `deleted` lifecycle stage.
  models       Deploy MLflow models locally.
  recipes      Run MLflow Recipes and inspect recipe results.
  run          Run an MLflow project from the given URI.
  runs         Manage runs.
  sagemaker    Serve models on SageMaker.
  server       Run the MLflow tracking server.


In [16]:
import mlflow
import mlflow.sklearn

# Select the MLflow experiment that the runs started later in this notebook are recorded under.
mlflow.set_experiment('CreditRiskModel_Final')

2024/05/18 06:04:07 INFO mlflow.tracking.fluent: Experiment with name 'CreditRiskModel_Final' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///content/mlruns/939845687885488354', creation_time=1716012247374, experiment_id='939845687885488354', last_update_time=1716012247374, lifecycle_stage='active', name='CreditRiskModel_Final', tags={}>

In [17]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, fbeta_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
import mlflow
import mlflow.sklearn
import sys
import os

In [18]:
def eval_metrics(actual, pred):
    """Score predicted labels against the true labels.

    Args:
        actual: Ground-truth class labels.
        pred: Predicted class labels, aligned with ``actual``.

    Returns:
        Tuple ``(accuracy, confusion, f_beta)`` holding the results of
        ``accuracy_score``, ``confusion_matrix`` and ``fbeta_score`` with
        ``beta=2``, in that order.
    """
    return (
        accuracy_score(actual, pred),
        confusion_matrix(actual, pred),
        fbeta_score(actual, pred, beta=2),
    )

In [19]:
# Bring the Spark DataFrame into pandas for the scikit-learn part of the notebook
df_credit = df_credit.toPandas()
# Preview only: the result is displayed but not assigned, so df_credit still
# contains `_c0` after this cell.
df_credit.drop(columns=['_c0'])


Unnamed: 0,age,sex,job,housing,savings_account,checking_account,credit_amount,duration,purpose,risk,credit_amount_range,age_group
0,67,male,2,own,little,little,1169,6,radio/TV,good,low,senior
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,medium,young
2,49,male,1,own,little,little,2096,12,education,good,low,middle-aged
3,45,male,2,free,little,little,7882,42,furniture/equipment,good,medium,middle-aged
4,53,male,2,free,little,little,4870,24,car,bad,low,middle-aged
...,...,...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,little,1736,12,furniture/equipment,good,low,middle-aged
996,40,male,3,own,little,little,3857,30,car,good,low,middle-aged
997,38,male,2,own,little,little,804,12,radio/TV,good,low,middle-aged
998,23,male,2,free,little,little,1845,45,radio/TV,bad,low,young


In [20]:
# Drop the leftover CSV index column. errors='ignore' keeps the cell idempotent:
# re-running it after `_c0` is already gone no longer raises KeyError.
df_credit = df_credit.drop(columns=['_c0'], errors='ignore')

In [21]:
def one_hot_encoder(df, nan_as_category = False):
    """One-hot encode every object-dtype column of ``df``.

    The first level of each encoded column is dropped (``drop_first=True``).

    Args:
        df: Input DataFrame; it is not modified.
        nan_as_category: Forwarded to ``pd.get_dummies`` as ``dummy_na``; when
            True an extra indicator column for NaN is produced.

    Returns:
        Tuple ``(encoded, new_columns)``: the encoded DataFrame and the list of
        column names in it that were not present in ``df``.
    """
    columns_before = list(df.columns)

    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)

    encoded = pd.get_dummies(
        df,
        columns=object_columns,
        dummy_na=nan_as_category,
        drop_first=True,
    )
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

In [22]:
# Preview the pandas DataFrame after `_c0` has been dropped
df_credit.head()

Unnamed: 0,age,sex,job,housing,savings_account,checking_account,credit_amount,duration,purpose,risk,credit_amount_range,age_group
0,67,male,2,own,little,little,1169,6,radio/TV,good,low,senior
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,medium,young
2,49,male,1,own,little,little,2096,12,education,good,low,middle-aged
3,45,male,2,free,little,little,7882,42,furniture/equipment,good,medium,middle-aged
4,53,male,2,free,little,little,4870,24,car,bad,low,middle-aged


In [23]:
# One-hot encode the categorical features. Each entry maps a source column to the
# prefix used for its dummy columns; the first level of every feature is dropped.
# `risk` is the target and is deliberately not dummy-encoded here.
dummy_prefixes = {
    "purpose": "purpose",
    "sex": "sex",
    "housing": "housing",
    "savings_account": "savings",
    "checking_account": "check",
    "age_group": "age_cat",
}

for source_column, prefix in dummy_prefixes.items():
    dummies = pd.get_dummies(df_credit[source_column], drop_first=True, prefix=prefix)
    # Index-on-index merge appends the dummy columns next to the existing ones
    df_credit = df_credit.merge(dummies, left_index=True, right_index=True)

In [24]:
# One-hot encode the credit amount bucket (first level dropped) and append it by index
cred_amt_dummies = pd.get_dummies(
    df_credit["credit_amount_range"],
    drop_first=True,
    prefix='cred_amt',
)
df_credit = df_credit.merge(cred_amt_dummies, left_index=True, right_index=True)

In [25]:
# Display the frame with the dummy columns appended (source columns still present)
df_credit

Unnamed: 0,age,sex,job,housing,savings_account,checking_account,credit_amount,duration,purpose,risk,...,housing_rent,savings_moderate,savings_quite rich,savings_rich,check_moderate,check_rich,age_cat_senior,age_cat_young,cred_amt_low,cred_amt_medium
0,67,male,2,own,little,little,1169,6,radio/TV,good,...,False,False,False,False,False,False,True,False,True,False
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,...,False,False,False,False,True,False,False,True,False,True
2,49,male,1,own,little,little,2096,12,education,good,...,False,False,False,False,False,False,False,False,True,False
3,45,male,2,free,little,little,7882,42,furniture/equipment,good,...,False,False,False,False,False,False,False,False,False,True
4,53,male,2,free,little,little,4870,24,car,bad,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,little,1736,12,furniture/equipment,good,...,False,False,False,False,False,False,False,False,True,False
996,40,male,3,own,little,little,3857,30,car,good,...,False,False,False,False,False,False,False,False,True,False
997,38,male,2,own,little,little,804,12,radio/TV,good,...,False,False,False,False,False,False,False,False,True,False
998,23,male,2,free,little,little,1845,45,radio/TV,bad,...,False,False,False,False,False,False,False,True,True,False


In [26]:
# Remove the original categorical columns; their dummy-encoded versions replace them
df_credit = df_credit.drop(
    columns=[
        "savings_account",
        "checking_account",
        "purpose",
        "sex",
        "housing",
        "age_group",
    ]
)


In [27]:
# Remove the credit amount bucket column; the cred_amt_* dummies replace it
df_credit = df_credit.drop(columns=['credit_amount_range'])

In [28]:
# Display the frame after the original categorical columns have been removed
df_credit

Unnamed: 0,age,job,credit_amount,duration,risk,purpose_car,purpose_domestic appliances,purpose_education,purpose_furniture/equipment,purpose_radio/TV,...,housing_rent,savings_moderate,savings_quite rich,savings_rich,check_moderate,check_rich,age_cat_senior,age_cat_young,cred_amt_low,cred_amt_medium
0,67,2,1169,6,good,False,False,False,False,True,...,False,False,False,False,False,False,True,False,True,False
1,22,2,5951,48,bad,False,False,False,False,True,...,False,False,False,False,True,False,False,True,False,True
2,49,1,2096,12,good,False,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False
3,45,2,7882,42,good,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,True
4,53,2,4870,24,bad,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,31,1,1736,12,good,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
996,40,3,3857,30,good,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
997,38,2,804,12,good,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
998,23,2,1845,45,bad,False,False,False,False,True,...,False,False,False,False,False,False,False,True,True,False


In [29]:
# Encode the target as integers: 'bad' -> 1, 'good' -> 0.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df_credit['risk']: that chained in-place call acts
# on a temporary Series under pandas Copy-on-Write and leaves df_credit unchanged.
# The dtype guard keeps the cell safe to re-run; an unexpected label becomes NaN
# and makes astype(int) fail loudly instead of slipping through.
risk_codes = {'bad': 1, 'good': 0}
if not pd.api.types.is_integer_dtype(df_credit['risk']):
    df_credit['risk'] = df_credit['risk'].map(risk_codes).astype(int)

In [30]:
# Display the frame to confirm that `risk` now holds 0/1
df_credit

Unnamed: 0,age,job,credit_amount,duration,risk,purpose_car,purpose_domestic appliances,purpose_education,purpose_furniture/equipment,purpose_radio/TV,...,housing_rent,savings_moderate,savings_quite rich,savings_rich,check_moderate,check_rich,age_cat_senior,age_cat_young,cred_amt_low,cred_amt_medium
0,67,2,1169,6,0,False,False,False,False,True,...,False,False,False,False,False,False,True,False,True,False
1,22,2,5951,48,1,False,False,False,False,True,...,False,False,False,False,True,False,False,True,False,True
2,49,1,2096,12,0,False,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False
3,45,2,7882,42,0,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,True
4,53,2,4870,24,1,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,31,1,1736,12,0,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
996,40,3,3857,30,0,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
997,38,2,804,12,0,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
998,23,2,1845,45,1,False,False,False,False,True,...,False,False,False,False,False,False,False,True,True,False


In [31]:
# Separate the target (`risk`) from the feature matrix
target_column = 'risk'
y = df_credit[target_column]
X = df_credit.drop(columns=[target_column])

In [32]:
# Import necessary libraries
import mlflow
import mlflow.sklearn
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


# One seed for the split and for every stochastic estimator, so the metrics logged
# to MLflow can be reproduced on a re-run.
RANDOM_STATE = 42

# Hold out 30% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# Candidate models as (run name, estimator) pairs; add more entries here as needed
models = [
    ("RandomForest", RandomForestClassifier(random_state=RANDOM_STATE)),
    ("LogisticRegression", LogisticRegression(solver='lbfgs', max_iter=1000)),
    ("GradientBoosting", GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ("DecisionTree", DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ("LDA", LinearDiscriminantAnalysis()),
    ("XGB", XGBClassifier(random_state=RANDOM_STATE)),
]


In [33]:

# Train every candidate model in its own MLflow run and record its test-set scores.
for model_name, model in models:
    # The context manager ends the run when the block exits (also on error),
    # so no explicit mlflow.end_run() is needed afterwards.
    with mlflow.start_run(run_name=model_name):
        # Record which estimator produced this run so runs can be filtered by it
        mlflow.log_param("model", model_name)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # `conf` (the confusion matrix) is kept for interactive inspection;
        # only the scalar metrics are logged.
        acc, conf, betaf = eval_metrics(y_test, y_pred)
        metrics = {
            "accuracy": acc,
            "betaf": betaf,
        }
        mlflow.log_metrics(metrics)
        print(f'Metrics: {metrics}')

        # Store the fitted estimator as a run artifact
        mlflow.sklearn.log_model(model, model_name)


Metrics: {'accuracy': 0.7033333333333334, 'betaf': 0.2375}




Metrics: {'accuracy': 0.7033333333333334, 'betaf': 0.270935960591133}
Metrics: {'accuracy': 0.6833333333333333, 'betaf': 0.2669902912621359}
Metrics: {'accuracy': 0.6033333333333334, 'betaf': 0.3879310344827586}
Metrics: {'accuracy': 0.69, 'betaf': 0.23514851485148514}
Metrics: {'accuracy': 0.6666666666666666, 'betaf': 0.30588235294117644}


In [34]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score


In [35]:
# Random-forest hyperparameter values to explore
param_grid = dict(
    n_estimators=list(range(5, 31, 5)),
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [36]:
# Show the candidate values for the number of trees
param_grid['n_estimators']

[5, 10, 15, 20, 25, 30]

In [37]:
# Random forest built with the library's default hyperparameters
rf1 = RandomForestClassifier()


In [38]:
# Call get_params() to display the hyperparameters; without the parentheses the
# cell only shows the bound method object.
rf1.get_params()

In [39]:



from itertools import product

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# One parent run groups the search; every hyperparameter combination gets a nested run.
# NOTE(review): candidates are scored on X_test, so the best score found here is an
# optimistic estimate — consider cross-validating on the training split instead.
with mlflow.start_run(run_name='rf-tuning-final'):
    # product() walks the grid in the same order as four nested for-loops would
    for n_est, max_d, min_samp_s, min_samp_l in product(
        param_grid['n_estimators'],
        param_grid['max_depth'],
        param_grid['min_samples_split'],
        param_grid['min_samples_leaf'],
    ):
        run_name = f"random_forest_{n_est}_{max_d}_{min_samp_s}_{min_samp_l}"
        with mlflow.start_run(run_name=run_name, nested=True):
            candidate = {
                'n_estimators': n_est,
                'max_depth': max_d,
                'min_samples_split': min_samp_s,
                'min_samples_leaf': min_samp_l,
            }
            rf = RandomForestClassifier(random_state=42, **candidate)
            rf.fit(X_train, y_train)

            # Predict on the test set and evaluate
            y_pred = rf.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)

            # Log the hyperparameters together with the metric so runs can be
            # compared and filtered in the MLflow UI, not only told apart by name.
            mlflow.log_params(candidate)
            mlflow.log_metric("accuracy", accuracy)

            # Log the fitted model as a run artifact
            mlflow.sklearn.log_model(rf, "random_forest")

            params = rf.get_params()
            print(params)
            print(f"Test Accuracy: {accuracy}")

print("Model training and hyperparameter tuning complete. Results logged in MLflow.")


{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 5, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
Test Accuracy: 0.6933333333333334
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 5, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
Test Accuracy: 0.6766666666666666
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_s

In [40]:
import os
from getpass import getpass

from  pyngrok import ngrok

# Close any tunnel left over from a previous run of this cell
ngrok.kill()

# Never commit the auth token to the notebook: read it from the environment and
# fall back to an interactive prompt. Any token that was previously hard-coded
# in this cell should be treated as leaked and revoked in the ngrok dashboard.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Expose local port 5000 through an HTTPS tunnel and print the public URL
ngrok_tunnel=ngrok.connect(addr="5000", proto="http", bind_tls=True)
print("MLflow Tracking UI:", ngrok_tunnel.public_url)





MLflow Tracking UI: https://8508-34-27-60-5.ngrok-free.app


In [None]:
# Start the MLflow tracking UI. Per the log below it listens on 127.0.0.1:5000,
# the port the ngrok tunnel created earlier points at.
# NOTE(review): this appears to keep the cell busy while the server runs
# (execution count is None) — interrupt the kernel to stop it.
!mlflow ui

[2024-05-18 06:12:25 +0000] [5695] [INFO] Starting gunicorn 22.0.0
[2024-05-18 06:12:25 +0000] [5695] [INFO] Listening at: http://127.0.0.1:5000 (5695)
[2024-05-18 06:12:25 +0000] [5695] [INFO] Using worker: sync
[2024-05-18 06:12:25 +0000] [5700] [INFO] Booting worker with pid: 5700
[2024-05-18 06:12:25 +0000] [5701] [INFO] Booting worker with pid: 5701
[2024-05-18 06:12:25 +0000] [5702] [INFO] Booting worker with pid: 5702
[2024-05-18 06:12:25 +0000] [5703] [INFO] Booting worker with pid: 5703
