# Linear Regression - Credit Rating Prediction

Multiple linear regression (plus degree-2 and degree-3 polynomial variants) to predict sovereign credit ratings from macroeconomic indicators.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, cross_validate, train_test_split
from sklearn.preprocessing import PolynomialFeatures

## 1. Load Data

In [None]:
# Read the merged dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/processed/merged_dataset.csv')
print(f'Dataset shape: {df.shape}')
# Trailing bare expression: shown as the cell output when run in a notebook.
df.head()

## 2. Prepare X and y

In [None]:
# Target is the credit rating column; predictors are every other column
# except the 'Country' and 'Year' columns.
y = df['Credit_Rating']
X = df.drop(columns=['Country', 'Year', 'Credit_Rating'])

print(f'Features: {list(X.columns)}')
print(f'X shape: {X.shape}, y shape: {y.shape}')

## 3. Split Train/Test (80/20)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the data has 'Country' and 'Year' columns, so rows look like
# country-year observations. A random row split can put the same country in
# both train and test -- confirm whether a grouped split is more appropriate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f'Train set: {len(X_train)} samples')
print(f'Test set: {len(X_test)} samples')

## 4. Train Linear Regression Model

In [None]:
# Fit ordinary least squares on the training split; fit() returns the estimator itself.
model = LinearRegression().fit(X_train, y_train)
print('✓ Model trained')

## 5. Make Predictions

In [None]:
# Predict on the training split and on the held-out split with the fitted model.
y_train_pred, y_test_pred = (model.predict(part) for part in (X_train, X_test))
print('✓ Predictions made')

## 6. Evaluate Model

In [None]:
# Metrics on the data the model was fitted on ...
train_r2 = r2_score(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

# ... and the same metrics on the held-out split.
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

# One print per split; sep='\n' puts each metric on its own line.
print(
    'Training Set:',
    f'  RMSE: {train_rmse:.4f}',
    f'  MAE:  {train_mae:.4f}',
    f'  R²:   {train_r2:.4f}\n',
    sep='\n',
)
print(
    'Test Set:',
    f'  RMSE: {test_rmse:.4f}',
    f'  MAE:  {test_mae:.4f}',
    f'  R²:   {test_r2:.4f}',
    sep='\n',
)

## 7. Feature Importance (Coefficients)

In [None]:
# Pair each feature name with its fitted coefficient, then rank the rows by
# absolute coefficient size, largest first.
# NOTE(review): no feature scaling is visible before the fit, so coefficient
# magnitudes depend on each feature's units -- confirm before reading them as
# relative importance.
coefficients_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
coefficients_df = coefficients_df.sort_values(by='Coefficient', key=abs, ascending=False)

print('Feature Importance:')
print(coefficients_df)

## 8. Visualization

## 9. K-Fold Cross-Validation (K=5)

In [None]:
# K-fold cross-validation with K=5, shuffled with a fixed seed so the fold
# assignment is reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Score all three metrics in one pass: cross_validate fits the model once per
# fold, whereas three separate cross_val_score calls refit it three times on
# the very same splits.
cv_results = cross_validate(
    model,
    X,
    y,
    cv=kfold,
    scoring={
        'rmse': 'neg_root_mean_squared_error',
        'mae': 'neg_mean_absolute_error',
        'r2': 'r2',
    },
)

# scikit-learn reports error metrics negated (higher is better); flip the sign back.
rmse_scores = -cv_results['test_rmse']
mae_scores = -cv_results['test_mae']
r2_scores = cv_results['test_r2']

# Print results for each fold (fold count comes from the results, not a literal)
print('Results per fold:')
for fold_no, (fold_rmse, fold_mae, fold_r2) in enumerate(
    zip(rmse_scores, mae_scores, r2_scores), start=1
):
    print(f'  Fold {fold_no}: RMSE={fold_rmse:.4f}, MAE={fold_mae:.4f}, R²={fold_r2:.4f}')

# Mean and standard deviation across folds
print(f'\nAverage across {len(r2_scores)} folds:')
print(f'  RMSE: {rmse_scores.mean():.4f} ± {rmse_scores.std():.4f}')
print(f'  MAE:  {mae_scores.mean():.4f} ± {mae_scores.std():.4f}')
print(f'  R²:   {r2_scores.mean():.4f} ± {r2_scores.std():.4f}')

## 10. Comparison: Simple Split vs K-Fold CV

In [None]:
# Side-by-side table: single 80/20 split versus the K-fold averages.
comparison_df = pd.DataFrame({
    'Method': ['Simple Split (80/20)', 'K-Fold CV (K=5)'],
    'RMSE': [test_rmse, rmse_scores.mean()],
    'MAE': [test_mae, mae_scores.mean()],
    'R²': [test_r2, r2_scores.mean()],
    # A single split has no spread to report; the K-fold row shows the std of R².
    'Std': ['-', f'±{r2_scores.std():.4f}']
})

print('Comparison of Methods:')
print(comparison_df.to_string(index=False))

# The conclusion is built from the computed values so the text cannot drift
# from the numbers when the data, the seed or the model changes.
cv_r2_mean = r2_scores.mean()
cv_r2_std = r2_scores.std()
split_position = 'above' if test_r2 > cv_r2_mean else 'at or below'

print('\nConclusion:')
print(f'- Simple split gave R² = {test_r2:.4f} ({split_position} the K-fold mean)')
print(f'- K-fold CV gives R² = {cv_r2_mean:.4f} ± {cv_r2_std:.3f} (average over {len(r2_scores)} folds)')
print(f'- True performance is likely around R² ≈ {cv_r2_mean:.2f}')

## 11. K-Fold CV Visualization

In [None]:
# Visualize K-fold CV results
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: R² per fold, with the K-fold mean and the single-split R² as reference lines
folds = [f'Fold {i+1}' for i in range(len(r2_scores))]
ax1.bar(folds, r2_scores, alpha=0.7, color='steelblue', edgecolor='black')
ax1.axhline(y=r2_scores.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {r2_scores.mean():.4f}')
ax1.axhline(y=test_r2, color='green', linestyle='--', linewidth=2, label=f'Simple Split: {test_r2:.4f}')
ax1.set_ylabel('R² Score', fontsize=12)
ax1.set_title('R² Score per Fold (K-Fold CV)', fontsize=14, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3, axis='y')

# Plot 2: Box plot of all metrics
metrics_data = [rmse_scores, mae_scores, r2_scores]
# Label the boxes via set_xticklabels instead of boxplot(labels=...): the
# `labels` keyword is deprecated since Matplotlib 3.9 in favour of
# `tick_labels`, while set_xticklabels behaves the same on old and new versions.
ax2.boxplot(metrics_data)
ax2.set_xticklabels(['RMSE', 'MAE', 'R²'])
ax2.set_ylabel('Score', fontsize=12)
ax2.set_title('Distribution of Metrics (K-Fold CV)', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# The printed range is taken from the actual fold scores rather than hard-coded.
print('\nVisualization shows:')
print(f'- Left: R² varies between folds ({r2_scores.min():.2f} to {r2_scores.max():.2f})')
print('- Right: Distribution of all metrics across folds')
print('- Red line: K-fold average')
print('- Green line: Simple split result')

In [None]:
# Two panels: predicted-vs-actual scatter on the left, signed coefficients on the right.
fig, (scatter_ax, coef_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: test-set predictions against the true values, plus the y = x
# diagonal spanning the combined range of both series.
lower = min(y_test.min(), y_test_pred.min())
upper = max(y_test.max(), y_test_pred.max())
scatter_ax.scatter(y_test, y_test_pred, alpha=0.6, edgecolors='k')
scatter_ax.plot([lower, upper], [lower, upper], 'r--', lw=2)
scatter_ax.set_title('Actual vs Predicted', fontsize=14, fontweight='bold')
scatter_ax.set_xlabel('Actual Credit Rating', fontsize=12)
scatter_ax.set_ylabel('Predicted Credit Rating', fontsize=12)
scatter_ax.grid(True, alpha=0.3)

# Right panel: one horizontal bar per feature; negative coefficients are
# drawn green, all others red.
coef_values = coefficients_df['Coefficient']
bar_colors = coef_values.map(lambda c: 'green' if c < 0 else 'red').tolist()
coef_ax.barh(coefficients_df['Feature'], coef_values, color=bar_colors, alpha=0.7)
coef_ax.set_xlabel('Coefficient Value', fontsize=12)
coef_ax.set_title('Feature Importance (Coefficients)', fontsize=14, fontweight='bold')
coef_ax.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
coef_ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## 12. Polynomial Regression (Degree 2)

In [None]:
# Create polynomial features (degree 2); include_bias=False leaves out the
# constant column because LinearRegression fits its own intercept.
poly2 = PolynomialFeatures(degree=2, include_bias=False)
X_poly2 = poly2.fit_transform(X)

print(f'Original features: {X.shape[1]}')
print(f'Polynomial features (degree=2): {X_poly2.shape[1]}')

# K-fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
model_poly2 = LinearRegression()

# One cross_validate call scores all three metrics with a single fit per fold,
# instead of refitting on identical splits once per metric.
cv_poly2 = cross_validate(
    model_poly2,
    X_poly2,
    y,
    cv=kfold,
    scoring={
        'rmse': 'neg_root_mean_squared_error',
        'mae': 'neg_mean_absolute_error',
        'r2': 'r2',
    },
)

# Error scorers are negated by scikit-learn; restore the usual sign.
rmse_poly2 = -cv_poly2['test_rmse']
mae_poly2 = -cv_poly2['test_mae']
r2_poly2 = cv_poly2['test_r2']

print('\nResults per fold:')
for fold_no, (fold_rmse, fold_mae, fold_r2) in enumerate(
    zip(rmse_poly2, mae_poly2, r2_poly2), start=1
):
    print(f'  Fold {fold_no}: RMSE={fold_rmse:.4f}, MAE={fold_mae:.4f}, R²={fold_r2:.4f}')

print(f'\nAverage across {len(r2_poly2)} folds:')
print(f'  RMSE: {rmse_poly2.mean():.4f} ± {rmse_poly2.std():.4f}')
print(f'  MAE:  {mae_poly2.mean():.4f} ± {mae_poly2.std():.4f}')
print(f'  R²:   {r2_poly2.mean():.4f} ± {r2_poly2.std():.4f}')

## 13. Polynomial Regression (Degree 3)

In [None]:
# Create polynomial features (degree 3); include_bias=False leaves out the
# constant column because LinearRegression fits its own intercept.
poly3 = PolynomialFeatures(degree=3, include_bias=False)
X_poly3 = poly3.fit_transform(X)

print(f'Original features: {X.shape[1]}')
print(f'Polynomial features (degree=3): {X_poly3.shape[1]}')

# K-fold cross-validation (uses the `kfold` splitter created in an earlier cell)
model_poly3 = LinearRegression()

# One cross_validate call scores all three metrics with a single fit per fold;
# the saving matters most here, on the widest feature matrix.
cv_poly3 = cross_validate(
    model_poly3,
    X_poly3,
    y,
    cv=kfold,
    scoring={
        'rmse': 'neg_root_mean_squared_error',
        'mae': 'neg_mean_absolute_error',
        'r2': 'r2',
    },
)

# Error scorers are negated by scikit-learn; restore the usual sign.
rmse_poly3 = -cv_poly3['test_rmse']
mae_poly3 = -cv_poly3['test_mae']
r2_poly3 = cv_poly3['test_r2']

print('\nResults per fold:')
for fold_no, (fold_rmse, fold_mae, fold_r2) in enumerate(
    zip(rmse_poly3, mae_poly3, r2_poly3), start=1
):
    print(f'  Fold {fold_no}: RMSE={fold_rmse:.4f}, MAE={fold_mae:.4f}, R²={fold_r2:.4f}')

print(f'\nAverage across {len(r2_poly3)} folds:')
print(f'  RMSE: {rmse_poly3.mean():.4f} ± {rmse_poly3.std():.4f}')
print(f'  MAE:  {mae_poly3.mean():.4f} ± {mae_poly3.std():.4f}')
print(f'  R²:   {r2_poly3.mean():.4f} ± {r2_poly3.std():.4f}')

## 14. Comparison: Linear vs Polynomial Regression

In [None]:
# Comparison table: cross-validated metrics and feature counts for the three models.
comparison_models = pd.DataFrame({
    'Model': ['Linear (K-Fold)', 'Polynomial deg=2', 'Polynomial deg=3'],
    'Mean_R²': [r2_scores.mean(), r2_poly2.mean(), r2_poly3.mean()],
    'Std_R²': [r2_scores.std(), r2_poly2.std(), r2_poly3.std()],
    'Mean_RMSE': [rmse_scores.mean(), rmse_poly2.mean(), rmse_poly3.mean()],
    'Mean_MAE': [mae_scores.mean(), mae_poly2.mean(), mae_poly3.mean()],
    'Features': [X.shape[1], X_poly2.shape[1], X_poly3.shape[1]]
})

print('Model Comparison:')
print(comparison_models.to_string(index=False))

# Figures quoted in the write-up are derived from the CV results, so the text
# stays correct when the data, the folds or the models change.
r2_gain_pp = (r2_poly2.mean() - r2_scores.mean()) * 100  # difference of two R² values, in percentage points
std_ratio = r2_poly2.std() / r2_scores.std()
best_model = comparison_models.loc[comparison_models['Mean_R²'].idxmax(), 'Model']

print('\n' + '='*70)
print('ANALYSIS:')
print('='*70)
print('1. Linear Regression:')
print(f'   - R² = {r2_scores.mean():.4f} ± {r2_scores.std():.4f}')
print('   - Baseline model')
print('\n2. Polynomial deg=2:')
print(f'   - R² = {r2_poly2.mean():.4f} ± {r2_poly2.std():.4f}')
# The ':+' format prints the real sign instead of assuming an improvement.
print(f'   - Mean R² change vs Linear: {r2_gain_pp:+.1f} percentage points')
print(f'   - Fold-to-fold std of R² is {std_ratio:.1f}x that of Linear')
print('\n3. Polynomial deg=3:')
print(f'   - R² = {r2_poly3.mean():.4f} ± {r2_poly3.std():.4f}')
# Only claim a negative R² when the cross-validated mean really is negative.
if r2_poly3.mean() < 0:
    print('   - NEGATIVE R² → Worse than predicting the mean!')
print(f'   - {X_poly3.shape[1]} features for {X_poly3.shape[0]} observations')
print('\n' + '='*70)
print(f'WINNER: {best_model} (highest mean R²; check Std_R² for stability)')
print('='*70)

## 15. Polynomial Regression Visualization

In [None]:
# Visualize Polynomial Regression results: one R²-per-fold bar chart per model.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 5))

# Plot 1: Linear Regression R² per fold
folds = [f'Fold {i+1}' for i in range(len(r2_scores))]
ax1.bar(folds, r2_scores, alpha=0.7, color='steelblue', edgecolor='black')
ax1.axhline(y=r2_scores.mean(), color='red', linestyle='--', linewidth=2,
            label=f'Mean: {r2_scores.mean():.4f}')
ax1.set_ylabel('R² Score', fontsize=12)
ax1.set_title('Linear Regression\nR² Score per Fold', fontsize=14, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3, axis='y')
# Same fixed y-range as the deg=2 panel so the two are directly comparable.
ax1.set_ylim([-0.1, 0.8])

# Plot 2: Polynomial deg=2 R² per fold
ax2.bar(folds, r2_poly2, alpha=0.7, color='coral', edgecolor='black')
ax2.axhline(y=r2_poly2.mean(), color='red', linestyle='--', linewidth=2,
            label=f'Mean: {r2_poly2.mean():.4f}')
ax2.set_ylabel('R² Score', fontsize=12)
ax2.set_title('Polynomial deg=2\nR² Score per Fold', fontsize=14, fontweight='bold')
ax2.legend()
ax2.grid(True, alpha=0.3, axis='y')
ax2.set_ylim([-0.1, 0.8])

# Plot 3: Polynomial deg=3 R² per fold. No fixed y-limits are set here, so
# matplotlib autoscales; the black line marks R² = 0.
ax3.bar(folds, r2_poly3, alpha=0.7, color='darkred', edgecolor='black')
ax3.axhline(y=r2_poly3.mean(), color='red', linestyle='--', linewidth=2,
            label=f'Mean: {r2_poly3.mean():.4f}')
ax3.set_ylabel('R² Score', fontsize=12)
ax3.set_title('Polynomial deg=3\nR² Score per Fold (OVERFITTING)', fontsize=14, fontweight='bold')
ax3.legend()
ax3.grid(True, alpha=0.3, axis='y')
ax3.axhline(y=0, color='black', linestyle='-', linewidth=1)

plt.tight_layout()
plt.show()

# The summary is computed from the fold scores instead of quoting numbers
# from a past run.
negative_folds = int((r2_poly3 < 0).sum())

print('\nVisualization Analysis:')
print(f'- Linear: R² per fold ranges from {r2_scores.min():.2f} to {r2_scores.max():.2f}')
print(f'- Poly deg=2: R² per fold ranges from {r2_poly2.min():.2f} to {r2_poly2.max():.2f}')
print(f'- Poly deg=3: negative R² on {negative_folds} of {len(r2_poly3)} folds')