<a href="https://colab.research.google.com/github/Sneha795/ML_LAB/blob/main/ML2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Fix the global NumPy seed so every run regenerates the same dataset.
np.random.seed(42)

n_samples = 5000

# Each category is paired with the probability of drawing it.
age_group_probs = {
    '18-24': 0.15,
    '25-34': 0.25,
    '35-44': 0.2,
    '45-54': 0.15,
    '55-64': 0.15,
    '65+': 0.1,
}
employment_status_probs = {
    'Employed': 0.5,
    'Self-Employed': 0.15,
    'Unemployed': 0.2,
    'Retired': 0.15,
}
credit_score_probs = {
    'Poor': 0.1,
    'Fair': 0.3,
    'Good': 0.4,
    'Excellent': 0.2,
}

# Possible categories (dicts preserve insertion order)
age_groups = list(age_group_probs)
employment_statuses = list(employment_status_probs)
credit_scores = list(credit_score_probs)

# Draw order (ages, employment, credit) determines the random stream, so it
# must stay exactly like this for the seeded output to be reproducible.
ages = np.random.choice(
    age_groups, size=n_samples, p=list(age_group_probs.values())
)
employment = np.random.choice(
    employment_statuses, size=n_samples, p=list(employment_status_probs.values())
)
credit = np.random.choice(
    credit_scores, size=n_samples, p=list(credit_score_probs.values())
)

# Annual income is drawn from a normal distribution whose (mean, std) depends
# on the employment status (approximate ranges).
income_distributions = {
    'Employed': (60000, 15000),
    'Self-Employed': (70000, 20000),
    'Unemployed': (20000, 5000),
}
# Any status not listed above (i.e. 'Retired') uses this distribution.
fallback_distribution = (30000, 10000)

# One draw per person, in row order, so the seeded random stream is consumed
# one value at a time.
income = [
    np.random.normal(*income_distributions.get(emp, fallback_distribution))
    for emp in employment
]

# Keep incomes inside [10000, 150000] and truncate to whole currency units.
income = np.clip(income, 10000, 150000).astype(int)

# Loan approval rule (a simple demonstration): start from a 0.3 base chance,
# add 0.3 when income exceeds 50000, add a bonus that grows with the credit
# rating, and cap the result at 0.95.
credit_bonus = {'Excellent': 0.3, 'Good': 0.2, 'Fair': 0.1}  # 'Poor' adds nothing

approval_prob = []
for inc, cs in zip(income, credit):
    chance = 0.3
    if inc > 50000:
        chance += 0.3
    chance += credit_bonus.get(cs, 0.0)
    approval_prob.append(min(chance, 0.95))

# One Bernoulli draw per applicant, in row order (1 = approved, 0 = rejected).
approval_draws = [np.random.binomial(1, p) for p in approval_prob]
loan_approved = np.array(approval_draws)

# Index 0 -> 'No', index 1 -> 'Yes'.
loan_approved_label = np.array(['No', 'Yes'])[loan_approved]

# Assemble the generated columns into a DataFrame; column order follows
# the order of the names listed here.
column_names = [
    'Age Group',
    'Employment Status',
    'Annual Income',
    'Credit Score',
    'Loan Approved',
]
column_values = [ages, employment, income, credit, loan_approved_label]
df = pd.DataFrame(dict(zip(column_names, column_values)))

# Quick look at the first rows.
print(df.head())

# Persist the dataset without the positional index column.
df.to_csv('loan_approval_dataset.csv', index=False)
print("Dataset saved as 'loan_approval_dataset.csv'")


  Age Group Employment Status  Annual Income Credit Score Loan Approved
0     25-34          Employed          85211         Fair           Yes
1       65+          Employed          59966         Fair           Yes
2     45-54           Retired          21819         Fair            No
3     35-44          Employed          61845         Good            No
4     25-34           Retired          24494         Good            No
Dataset saved as 'loan_approval_dataset.csv'


In [None]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np
import pandas as pd

# `df` must already exist: it is the loan-approval DataFrame generated in the previous cell.

# Column roles in the loan dataset.
categorical_cols = ['Age Group', 'Employment Status', 'Credit Score']
numerical_cols = ['Annual Income']
target_col = 'Loan Approved'

# Encode the target to integer class codes unless it is already numeric.
# Checking for "not numeric" (instead of `dtype == 'object'`) also covers
# pandas' dedicated string dtype and categoricals; otherwise the labels would
# stay as 'Yes'/'No' strings and the binary metrics computed later (which
# default to pos_label=1) would fail.
# LabelEncoder numbers classes in sorted order, so 'No' -> 0 and 'Yes' -> 1.
# On a re-run the column is already numeric, so it is left untouched.
if not pd.api.types.is_numeric_dtype(df[target_col]):
    le = LabelEncoder()
    df[target_col] = le.fit_transform(df[target_col])

X_cat = df[categorical_cols]
X_num = df[numerical_cols]
y = df[target_col].values

# One-hot encode the categorical features; drop='first' removes one level per
# feature so the dummy columns are not perfectly collinear.
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_cat_encoded = encoder.fit_transform(X_cat)

# Final design matrix: the numeric column(s) first, then the one-hot columns.
X = np.hstack([X_num.values, X_cat_encoded])

# max_iter is raised to 1000 to give the solver more iterations to converge.
model = LogisticRegression(max_iter=1000)

# --- 2-way split ---
# Hold out 30% of the rows for testing; train on the remaining 70%.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# fit() returns the fitted estimator, so prediction can be chained onto it.
y_pred = model.fit(X_train, y_train).predict(X_test)

print("=== 2-Way Split ===")
two_way_reports = (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
)
for heading, metric in two_way_reports:
    print(heading, metric(y_test, y_pred))

# --- 3-way split ---
# First carve off 20% as the test set, then take 25% of the remaining 80%
# as the validation set: 60% train / 20% validation / 20% test overall.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42
)

# Re-fit the same estimator on the (smaller) training portion only.
model.fit(X_train, y_train)
val_pred = model.predict(X_val)
test_pred = model.predict(X_test)

print("\n=== 3-Way Split ===")
three_way_reports = (
    ("Validation Accuracy:", accuracy_score, y_val, val_pred),
    ("Validation Classification Report:\n", classification_report, y_val, val_pred),
    ("Validation Confusion Matrix:\n", confusion_matrix, y_val, val_pred),
    ("Test Accuracy:", accuracy_score, y_test, test_pred),
    ("Test Classification Report:\n", classification_report, y_test, test_pred),
    ("Test Confusion Matrix:\n", confusion_matrix, y_test, test_pred),
)
for heading, metric, truth, predicted in three_way_reports:
    print(heading, metric(truth, predicted))

# --- 5-Fold Cross Validation ---
from sklearn.metrics import precision_recall_fscore_support

# Shuffled 5-fold splitter; the fixed seed keeps fold membership reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold metric histories, averaged after the loop.
cv_accuracies, cv_precisions, cv_recalls, cv_f1s = [], [], [], []

for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train_cv, y_train_cv = X[train_index], y[train_index]
    X_test_cv, y_test_cv = X[test_index], y[test_index]

    # The estimator is re-fitted on each fold's training rows.
    y_pred_cv = model.fit(X_train_cv, y_train_cv).predict(X_test_cv)

    acc = accuracy_score(y_test_cv, y_pred_cv)
    # average='binary' reports the scores of the positive class only; the
    # fourth return value (support) is not needed here.
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_test_cv, y_pred_cv, average='binary'
    )

    histories = (cv_accuracies, cv_precisions, cv_recalls, cv_f1s)
    for history, value in zip(histories, (acc, prec, rec, f1)):
        history.append(value)

    print(f"\nFold {fold} Metrics:")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test_cv, y_pred_cv))

# Unweighted mean of each metric across the five folds.
print("\n=== 5-Fold CV Average Metrics ===")
print(f"Average Accuracy: {np.mean(cv_accuracies):.4f}")
print(f"Average Precision: {np.mean(cv_precisions):.4f}")
print(f"Average Recall: {np.mean(cv_recalls):.4f}")
print(f"Average F1 Score: {np.mean(cv_f1s):.4f}")


=== 2-Way Split ===
Accuracy: 0.6573333333333333
Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.34      0.42       553
           1       0.69      0.84      0.76       947

    accuracy                           0.66      1500
   macro avg       0.62      0.59      0.59      1500
weighted avg       0.64      0.66      0.63      1500

Confusion Matrix:
 [[189 364]
 [150 797]]

=== 3-Way Split ===
Validation Accuracy: 0.647
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.34      0.43       392
           1       0.66      0.85      0.74       608

    accuracy                           0.65      1000
   macro avg       0.63      0.59      0.59      1000
weighted avg       0.63      0.65      0.62      1000

Validation Confusion Matrix:
 [[133 259]
 [ 94 514]]
Test Accuracy: 0.645
Test Classification Report:
               precision    recall  f1-score  