In [2]:
import pandas as pd

# Read the cleaned CSV directly from its raw GitHub URL into a DataFrame.
dataset_url = (
    "https://raw.githubusercontent.com/ScottLeng/AI-Project/refs/heads/main/clean_dataset.csv"
)
df = pd.read_csv(filepath_or_buffer=dataset_url)

In [3]:
# Preview the first ten rows to sanity-check the columns and their values.
df.head(n=10)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
0,1,30.83,0.0,1,1,Industrials,White,1.25,1,1,1,0,ByBirth,202,0,1
1,0,58.67,4.46,1,1,Materials,Black,3.04,1,1,6,0,ByBirth,43,560,1
2,0,24.5,0.5,1,1,Materials,Black,1.5,1,0,0,0,ByBirth,280,824,1
3,1,27.83,1.54,1,1,Industrials,White,3.75,1,1,5,1,ByBirth,100,3,1
4,1,20.17,5.625,1,1,Industrials,White,1.71,1,0,0,0,ByOtherMeans,120,0,1
5,1,32.08,4.0,1,1,CommunicationServices,White,2.5,1,0,0,1,ByBirth,360,0,1
6,1,33.17,1.04,1,1,Transport,Black,6.5,1,0,0,1,ByBirth,164,31285,1
7,0,22.92,11.585,1,1,InformationTechnology,White,0.04,1,0,0,0,ByBirth,80,1349,1
8,1,54.42,0.5,0,0,Financials,Black,3.96,1,0,0,0,ByBirth,180,314,1
9,1,42.5,4.915,0,0,Industrials,White,3.165,1,0,0,1,ByBirth,52,1442,1


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Handle missing values: numeric columns get their median first; whatever is
# still missing afterwards (the non-numeric columns) gets the column's mode.
df = df.fillna(df.median(numeric_only=True))
df = df.fillna(df.mode().iloc[0])

# Encode categorical (object-dtype) features as integer codes, keeping every
# fitted encoder so the codes can be mapped back to the original labels later.
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# pandas names a CSV column with an empty header "Unnamed: N". Here
# "Unnamed: 0" simply repeats the row number, so it is an identifier rather
# than a feature; leaving it in X would let the model learn from the file's
# row ordering instead of from the applicant attributes.
index_like_cols = [col for col in df.columns if str(col).startswith('Unnamed:')]

# Split dataset into features (X) and target (y); 'Approved' is the target.
X = df.drop(columns=['Approved', *index_like_cols])
y = df['Approved']

# Split into training & test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features: fit the scaler on the training split only and reuse those
# statistics for the test split, so no test-set information reaches training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [10]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# Train a random forest on the prepared split, evaluate it on the test set,
# and record parameters, metrics and the fitted model as one MLflow run.
mlflow.set_experiment("MLflow Demo")

# Single source of truth for this run's hyperparameters: the same dict builds
# the estimator and is logged, so the logged values cannot drift from the
# model that was actually trained.
rf_params = {"n_estimators": 100, "random_state": 42}

with mlflow.start_run():
    # Train model
    model = RandomForestClassifier(**rf_params)
    model.fit(X_train, y_train)

    # Predictions on the held-out test split
    y_pred = model.predict(X_test)

    # Metrics (F1 is averaged per class, weighted by class support)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Log parameters and metrics
    for param_name, param_value in rf_params.items():
        mlflow.log_param(param_name, param_value)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_score", f1)

    # Log model
    mlflow.sklearn.log_model(model, "random_forest_model")

print(f"Model logged with Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}")




Model logged with Accuracy: 0.8551, F1-score: 0.8549


In [11]:
# Train a smaller random forest (50 trees) on the prepared split, evaluate it
# on the test set, and record the run in the same MLflow experiment.
mlflow.set_experiment("MLflow Demo")

# Single source of truth for this run's hyperparameters: the same dict builds
# the estimator and is logged, so the logged values cannot drift from the
# model that was actually trained.
rf_params = {"n_estimators": 50, "random_state": 42}

with mlflow.start_run():
    # Train model
    model = RandomForestClassifier(**rf_params)
    model.fit(X_train, y_train)

    # Predictions on the held-out test split
    y_pred = model.predict(X_test)

    # Metrics (F1 is averaged per class, weighted by class support)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Log parameters and metrics
    for param_name, param_value in rf_params.items():
        mlflow.log_param(param_name, param_value)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_score", f1)

    # Log model
    mlflow.sklearn.log_model(model, "random_forest_model")

print(f"Model logged with Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}")




Model logged with Accuracy: 0.8623, F1-score: 0.8622


In [12]:
# Train a larger random forest (200 trees) on the prepared split, evaluate it
# on the test set, and record the run in the same MLflow experiment.
mlflow.set_experiment("MLflow Demo")

# Single source of truth for this run's hyperparameters: the same dict builds
# the estimator and is logged, so the logged values cannot drift from the
# model that was actually trained.
rf_params = {"n_estimators": 200, "random_state": 42}

with mlflow.start_run():
    # Train model
    model = RandomForestClassifier(**rf_params)
    model.fit(X_train, y_train)

    # Predictions on the held-out test split
    y_pred = model.predict(X_test)

    # Metrics (F1 is averaged per class, weighted by class support)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Log parameters and metrics
    for param_name, param_value in rf_params.items():
        mlflow.log_param(param_name, param_value)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_score", f1)

    # Log model
    mlflow.sklearn.log_model(model, "random_forest_model")

print(f"Model logged with Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}")




Model logged with Accuracy: 0.8768, F1-score: 0.8766
