In [None]:
import pandas as pd
import random
import numpy as np
from datetime import datetime, timedelta

# Fix the seeds first so the generated table is identical on every run.
random.seed(42)
np.random.seed(42)

# Number of synthetic applications to produce.
rows = 400


def _draw_ints(low, high):
    """Return `rows` integers, each drawn uniformly from [low, high] inclusive."""
    return [random.randint(low, high) for _ in range(rows)]


def _draw_labels(options, weights=None):
    """Return `rows` labels sampled with replacement from `options`.

    `weights` are relative selection weights; None makes every option
    equally likely.
    """
    return random.choices(options, weights=weights, k=rows)


# Columns are filled one at a time, top to bottom. Every draw advances the
# same seeded `random` generator, so the order of the statements below
# determines which values each column receives -- do not reorder them.
_year_start = datetime(2023, 1, 1)

data = {}
data["Application_ID"] = [f"App_{idx:03d}" for idx in range(1, rows + 1)]
# Day offsets 0..364 from 1 January cover every day of 2023.
data["Date_of_Application"] = [
    (_year_start + timedelta(days=offset)).strftime("%Y-%m-%d")
    for offset in _draw_ints(0, 364)
]
data["Student_Age"] = _draw_ints(18, 35)
data["Gender"] = _draw_labels(["Male", "Female", "Other"], weights=[0.45, 0.45, 0.1])
data["School_Name"] = _draw_labels(
    ["University_A", "University_B", "Polytechnic_C", "College_D"]
)
data["Program_of_Study"] = _draw_labels(
    ["Engineering", "Medicine", "Business", "Humanities", "Social Sciences"]
)
data["Year_of_Study"] = _draw_ints(1, 5)
data["GPA"] = [round(random.uniform(2.0, 5.0), 2) for _ in range(rows)]
data["Household_Income"] = _draw_ints(50000, 1000000)
data["Socioeconomic_Status"] = _draw_labels(
    ["Low", "Middle", "High"], weights=[0.5, 0.3, 0.2]
)
data["Dependent_Family_Members"] = _draw_ints(0, 6)
data["Employment_Status"] = _draw_labels(
    ["Unemployed", "Part-time", "Full-time"], weights=[0.6, 0.3, 0.1]
)
data["Extra_Curricular_Involvement"] = _draw_labels(["High", "Medium", "Low"])
data["Previous_Loan_Status"] = _draw_labels(["Yes", "No"], weights=[0.3, 0.7])
data["Loan_Amount_Requested"] = _draw_ints(50000, 500000)
data["Academic_Performance_Rating"] = _draw_labels(
    ["Excellent", "Good", "Average", "Poor"], weights=[0.2, 0.5, 0.25, 0.05]
)
data["Special_Needs"] = _draw_labels(["Yes", "No"], weights=[0.1, 0.9])
data["Campus_Residence"] = _draw_labels(["Yes", "No"], weights=[0.4, 0.6])
data["Parental_Education_Level"] = _draw_labels(["Primary", "Secondary", "Tertiary"])
data["Volunteer_Work"] = _draw_labels(["High", "Medium", "Low"])

# Assemble the table and persist it (no index column in the file).
synthetic_data = pd.DataFrame(data)
synthetic_data.to_csv("student_loan_application_data.csv", index=False)

print("Synthetic dataset generated and saved as 'student_loan_application_data.csv'")

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

# --- Configuration -----------------------------------------------------------
DATA_PATH = "student_loan_application_data.csv"
MODEL_PATH = "student_loan_pipeline_with_probs.pkl"
TARGET = "Loan_Approval"
# Placeholder labelling rule: requests at or below this amount count as approved.
APPROVAL_THRESHOLD = 300000

# Load synthetic dataset
data = pd.read_csv(DATA_PATH)

# Add a target column: Loan Approval (1 = Approved, 0 = Denied).
# Vectorised comparison instead of a per-row lambda; int64 keeps the same
# dtype the row-wise version produced.
data[TARGET] = (data["Loan_Amount_Requested"] <= APPROVAL_THRESHOLD).astype("int64")

# Feature groups, declared before X so the column selection can be validated.
numeric_features = ["Student_Age", "Year_of_Study", "GPA", "Household_Income", "Dependent_Family_Members"]
categorical_features = [
    "Gender",
    "School_Name",
    "Program_of_Study",
    "Socioeconomic_Status",
    "Employment_Status",
    "Extra_Curricular_Involvement",
    "Previous_Loan_Status",
    "Academic_Performance_Rating",
    "Special_Needs",
    "Campus_Residence",
    "Parental_Education_Level",
    "Volunteer_Work",
]

# Define features and target.
# "Loan_Amount_Requested" fully determines the label, so letting it reach the
# model would be target leakage. It is excluded here explicitly rather than
# relying on the ColumnTransformer silently discarding unlisted columns.
# NOTE(review): because the label depends only on that excluded column, the
# remaining features carry signal only if they correlate with it. In the
# generator cell of this notebook they are drawn independently, so expect
# near-chance metrics below -- confirm this is the intended demo behaviour.
X = data.drop(
    columns=["Application_ID", "Date_of_Application", "Loan_Amount_Requested", TARGET]
)
y = data[TARGET]

missing_features = set(numeric_features + categorical_features) - set(X.columns)
assert not missing_features, f"missing expected feature columns: {sorted(missing_features)}"

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline: scale numeric columns, one-hot encode categoricals.
numeric_transformer = StandardScaler()
# handle_unknown="ignore" encodes categories unseen during fit as all-zeros
# instead of raising at prediction time.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    # Spelled out (this is also the default): any column not listed above is
    # dropped and never reaches the classifier.
    remainder="drop",
)

# Full pipeline with classifier
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

# Train the pipeline
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_test)
# Column 1 of predict_proba is the probability of class 1 (approved) when both
# classes are present in the training labels. Not used in this cell; kept as a
# variable for inspection or later cells.
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred))

# Save the pipeline
joblib.dump(pipeline, MODEL_PATH)
print(f"Model saved as '{MODEL_PATH}'")