In [1]:
# Cell 1: Import Necessary Libraries
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression

In [2]:
# Cell 2: Load and Preprocess the Data

df = pd.read_csv('kiva_loans.csv')

# Convert time columns to datetime
time_columns = ['posted_time', 'funded_time']
for col in time_columns:
    df[col] = pd.to_datetime(df[col])

# Create the label (funding duration in hours)
df['funding_duration'] = (df['funded_time'] - df['posted_time']).dt.total_seconds() / 3600

# Extract year and month from posted_time
df['posted_year'] = df['posted_time'].dt.year
df['posted_month'] = df['posted_time'].dt.month

# Process borrower_genders into two binary features
def process_borrower_genders(gender_string):
    if pd.isna(gender_string):
        return 0, 0  # Default to no majority and single person
    genders = gender_string.split(',')
    num_men = genders.count('male')
    num_women = genders.count('female')
    multiple_people = 1 if len(genders) > 1 else 0
    majority_men_or_women = 1 if num_men > num_women else 0  # Choose majority, default to women if equal
    return multiple_people, majority_men_or_women

df['multiple_people'], df['majority_men_or_women'] = zip(*df['borrower_genders'].apply(process_borrower_genders))

# Prepare features
numeric_features = ['funded_amount', 'term_in_months']
categorical_features = ['activity', 'sector', 'country', 'partner_id', 
                        'repayment_interval', 'posted_year', 'posted_month', 
                        'multiple_people', 'majority_men_or_women']

# Combine all features
features = numeric_features + categorical_features

# drop rows with missing values in features or label
columns_to_check = features + ['funding_duration']
df = df.dropna(subset=columns_to_check)

# Create X (features) and y (target)
X = df[features]
y = df['funding_duration']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# Cell 3: Define the Preprocessing Pipeline

# Create preprocessing steps
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [7]:
# Cell 4: Train and Evaluate the Random Forest Regressor

# Define the Random Forest Regressor Pipeline
rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=10,       
        random_state=42,
    ))
])

# Train the Random Forest Regressor
print("Training Random Forest Regressor...")
rf_model.fit(X_train, y_train)
print("\nRandom Forest Regressor training completed.")

# Make Predictions with Random Forest on Test Data
y_pred_rf = rf_model.predict(X_test)

# Evaluate the Random Forest Regressor on Test Data
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

# Make Predictions with Random Forest on Training Data
y_train_pred_rf = rf_model.predict(X_train)

# Evaluate the Random Forest Regressor on Training Data
mse_rf_train = mean_squared_error(y_train, y_train_pred_rf)
rmse_rf_train = np.sqrt(mse_rf_train)
r2_rf_train = r2_score(y_train, y_train_pred_rf)
mae_rf_train = mean_absolute_error(y_train, y_train_pred_rf)

# Output Test Performance Results
print("\nRandom Forest Regressor Test Performance:")
print(f"Root Mean Squared Error: {rmse_rf:.2f}")
print(f"Mean Absolute Error: {mae_rf:.2f}")
print(f"R-squared Score: {r2_rf:.2f}")

# Output Training Performance Results
print("\nRandom Forest Regressor Training Performance:")
print(f"Root Mean Squared Error: {rmse_rf_train:.2f}")
print(f"Mean Absolute Error: {mae_rf_train:.2f}")
print(f"R-squared Score: {r2_rf_train:.2f}")


Training Random Forest Regressor...

Random Forest Regressor training completed.

Random Forest Regressor Test Performance:
Root Mean Squared Error: 180.87
Mean Absolute Error: 121.50
R-squared Score: 0.58

Random Forest Regressor Training Performance:
Root Mean Squared Error: 102.62
Mean Absolute Error: 64.22
R-squared Score: 0.86


In [6]:
# Cell 5: Train and Evaluate the MLP Regressor

# Define the MLP Regressor Pipeline
mlp_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', MLPRegressor(
        hidden_layer_sizes=(256, 128, 64, 32),
        activation='relu',             
        alpha=0.001,
        solver='adam', 
        random_state=13,
        max_iter=1000,
        early_stopping=True,    
        validation_fraction=0.1
    ))
])

# Train the MLP Regressor
print("Training MLP Regressor...")
mlp_model.fit(X_train, y_train)
print("\nMLP Regressor training completed.")

# Make Predictions with MLP Regressor on Test Data
y_pred_mlp = mlp_model.predict(X_test)

# Evaluate the MLP Regressor on Test Data
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
rmse_mlp = np.sqrt(mse_mlp)
r2_mlp = r2_score(y_test, y_pred_mlp)
mae_mlp = mean_absolute_error(y_test, y_pred_mlp)

# Make Predictions with MLP Regressor on Training Data
y_train_pred_mlp = mlp_model.predict(X_train)

# Evaluate the MLP Regressor on Training Data
mse_mlp_train = mean_squared_error(y_train, y_train_pred_mlp)
rmse_mlp_train = np.sqrt(mse_mlp_train)
r2_mlp_train = r2_score(y_train, y_train_pred_mlp)
mae_mlp_train = mean_absolute_error(y_train, y_train_pred_mlp)

# Output Test Performance Results
print("\nMLP Regressor Test Performance:")
print(f"Root Mean Squared Error: {rmse_mlp:.2f}")
print(f"Mean Absolute Error: {mae_mlp:.2f}")
print(f"R-squared Score: {r2_mlp:.2f}")

# Output Training Performance Results
print("\nMLP Regressor Training Performance:")
print(f"Root Mean Squared Error: {rmse_mlp_train:.2f}")
print(f"Mean Absolute Error: {mae_mlp_train:.2f}")
print(f"R-squared Score: {r2_mlp_train:.2f}")


Training MLP Regressor...

MLP Regressor training completed.

MLP Regressor Test Performance:
Root Mean Squared Error: 166.00
Mean Absolute Error: 115.25
R-squared Score: 0.65

MLP Regressor Training Performance:
Root Mean Squared Error: 156.63
Mean Absolute Error: 108.77
R-squared Score: 0.68


In [9]:
# Assuming you have already executed Cell 2 and have X_train, X_test, y_train, y_test defined

# 1. Compute the mean of the training target values
mean_funding_duration = y_train.mean()

# 2. Print the mean funding duration
print(f"Mean Funding Duration (Training Set): {mean_funding_duration:.2f} hours")

# 3. Create predictions by assigning the mean to all test instances
y_pred_mean = [mean_funding_duration] * len(y_test)

# 4. Calculate the Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred_mean)

# 5. Print the MAE
print(f"Mean Absolute Error (Baseline - Mean Prediction): {mae:.2f} hours")

Mean Funding Duration (Training Set): 336.07 hours
Mean Absolute Error (Baseline - Mean Prediction): 233.27 hours
