In [125]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoCV, LinearRegression
import statsmodels.api as sm
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_squared_error
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler


In [126]:
df_train = pd.read_csv('credit_installment2_id5.csv')
df_eval = pd.read_csv('credit_installment2_evaluation_data.csv')
#yy = df_eval['PRSM']
#df_eval = df_eval.drop(['PRSM'], axis=1)

In [127]:
def preprocess_data(df):
    """
    Preprocesses a dataset by performing feature engineering, 
    transformation, encoding, and scaling.
    """
    # Calculate Delinquent Credit Ratio
    df["Delinquent_Credit_Ratio"] = df["Num_Delinquent"] / df["Num_CreditLines"]
    
    # Cap 'Months' feature at 30
    df['Months'] = df['Months'].apply(lambda x: x if x < 30 else 30)
    
    # Create FICO bin labels
    bins = [300, 580, 670, 740, 800, float('inf')]
    labels = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
    df['FICO_bin'] = pd.cut(df['FICO'], bins=bins, labels=labels, right=False)
    
    # Extract first two digits from NAICS code
    df['NAICS_2digit'] = df['NAICS'].astype(str).str.zfill(6).str[:2]
    
    # Apply log transformation to skewed numerical features
    df['Stress'] = np.log1p(df['Stress'])
    df['Volume'] = np.log1p(df['Volume'])
    df['TotalAmtOwed'] = np.log1p(df['TotalAmtOwed'])
    
    # One-hot encode categorical features
    for col in ['FICO_bin', 'CorpStructure', 'NAICS_2digit']:
        df = pd.get_dummies(df, columns=[col], drop_first=True)
    
    # Drop original categorical columns
    df = df.drop(['FICO', 'NAICS'], axis=1)
    
    # Standardize numerical features
    cols_to_scale = ["Months", "Volume", "TotalAmtOwed", "Stress"]
    scaler = StandardScaler()
    df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
    df[cols_to_scale] = df[cols_to_scale] * 0.25 + 0.5
    
    return df


In [128]:
baseline_values = {
    'FICO': 700,
    'TotalAmtOwed': 200000,
    'Volume': 140000,
    'Stress': 0.2,
    'Num_Delinquent': 4,
    'Num_CreditLines': 10,
    'WomanOwned': 1,
    'CorpStructure': 'LLC',
    'NAICS': 445291,
    'Months':18,
    'PRSM': 0
}

In [129]:
df_extended = pd.concat([df_train, pd.DataFrame([baseline_values])])

In [130]:
df_train = preprocess_data(df_train)
df_extended = preprocess_data(df_extended)

In [131]:
baseline_df = df_extended.tail(1).drop(columns=['PRSM'])

In [132]:
y = df_train['PRSM']
X = df_train.drop(columns=['PRSM'])

X = X.drop(['Num_Delinquent', 'Num_CreditLines', 'Volume'], axis=1)

In [133]:
X_train_sm = sm.add_constant(X) 
ols_model = sm.OLS(y, X_train_sm).fit()

In [134]:
baseline_df['const'] = 1
baseline_df = baseline_df.drop(['Num_Delinquent', 'Num_CreditLines', 'Volume'], axis=1)

In [135]:
predictions = ols_model.get_prediction(baseline_df)
summary_frame = predictions.summary_frame(alpha=0.05)  # 95% 置信区间

# 提取预测值和置信区间
predicted_prsm = summary_frame["mean"][0]
lower_bound = summary_frame["obs_ci_lower"][0]  # 观测置信区间下界
upper_bound = summary_frame["obs_ci_upper"][0]  # 观测置信区间上界

# 输出结果
print(f"Baseline Borrower Predicted PRSM: {predicted_prsm:.4f}")
print(f"95% Prediction Interval: ({lower_bound:.4f}, {upper_bound:.4f})")

Baseline Borrower Predicted PRSM: 0.9192
95% Prediction Interval: (0.5997, 1.2386)


In [136]:
df_eval = preprocess_data(df_eval)
df_eval = sm.add_constant(df_eval)
df_eval = df_eval.drop(['Num_Delinquent', 'Num_CreditLines', 'Volume'], axis=1)
predictions = ols_model.get_prediction(df_eval)
pred_summary = predictions.summary_frame(alpha=0.05)  

output_df = pred_summary[['mean', 'obs_ci_lower', 'obs_ci_upper']]
output_df.columns = ['Point_Prediction', 'Lower_Bound', 'Upper_Bound']

output_df.to_csv("predictions.csv", index=False)

yy = yy.squeeze()  # 转换为 Series 以匹配 output_df 结构

rmse = np.sqrt(mean_squared_error(yy, output_df["Point_Prediction"]))
print(f"RMSE: {rmse:.4f}")

covered = ((yy >= output_df["Lower_Bound"]) & (yy <= output_df["Upper_Bound"])).mean()
print(f"95% Prediction Interval Coverage: {covered:.2%}")