# Linear Regression - Credit Rating Prediction

This notebook fits a basic linear regression model that predicts sovereign credit ratings from macroeconomic indicators.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## 1. Load Data

In [None]:
# Location of the merged dataset, relative to the notebook's working directory.
DATA_PATH = '../data/processed/merged_dataset.csv'

df = pd.read_csv(DATA_PATH)
print(f'Dataset shape: {df.shape}')

# Last expression of the cell, so the notebook renders the first rows as a preview.
df.head()

## 2. Prepare X and y

In [None]:
# Target: the credit rating column.
y = df['Credit_Rating']

# Features: every remaining column once the target and the Country/Year
# columns are removed (presumably identifiers rather than predictors).
X = df.drop(columns=['Country', 'Year', 'Credit_Rating'])

print(f'Features: {list(X.columns)}')
print(f'X shape: {X.shape}, y shape: {y.shape}')

## 3. Split Train/Test (80/20)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up on each side of the split.
print(f'Train set: {X_train.shape[0]} samples')
print(f'Test set: {X_test.shape[0]} samples')

## 4. Train Linear Regression Model

In [None]:
# Ordinary least squares with default settings; fit() returns the fitted
# estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)
print('✓ Model trained')

## 5. Make Predictions

In [None]:
# Predict on both splits (train first, then test) so the evaluation step can
# compare in-sample and out-of-sample error.
y_train_pred, y_test_pred = [
    model.predict(features) for features in (X_train, X_test)
]
print('✓ Predictions made')

## 6. Evaluate Model

In [None]:
# Training metrics: RMSE (square root of the MSE), MAE and R² on the data the
# model was fitted on.
train_rmse, train_mae, train_r2 = (
    np.sqrt(mean_squared_error(y_train, y_train_pred)),
    mean_absolute_error(y_train, y_train_pred),
    r2_score(y_train, y_train_pred),
)

# The trailing '\n' in the last line leaves a blank line before the test report.
print(
    'Training Set:',
    f'  RMSE: {train_rmse:.4f}',
    f'  MAE:  {train_mae:.4f}',
    f'  R²:   {train_r2:.4f}\n',
    sep='\n',
)

# Test metrics: the same three scores on the held-out split.
test_rmse, test_mae, test_r2 = (
    np.sqrt(mean_squared_error(y_test, y_test_pred)),
    mean_absolute_error(y_test, y_test_pred),
    r2_score(y_test, y_test_pred),
)

print(
    'Test Set:',
    f'  RMSE: {test_rmse:.4f}',
    f'  MAE:  {test_mae:.4f}',
    f'  R²:   {test_r2:.4f}',
    sep='\n',
)

## 7. Feature Importance (Coefficients)

In [None]:
# Pair each feature name with its fitted coefficient.
coefficients_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})

# Rank by absolute magnitude, largest first, so sign does not affect the ordering.
# NOTE(review): no feature scaling appears in the cells shown, so magnitudes
# depend on each feature's units — confirm before reading them as importance.
coefficients_df = coefficients_df.sort_values(
    by='Coefficient',
    key=abs,
    ascending=False,
)

print('Feature Importance:')
print(coefficients_df)

## 8. Visualization

In [None]:
# One row, two panels: prediction quality on the left, coefficients on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Plot 1: Actual vs Predicted on the test split.
ax1.scatter(y_test, y_test_pred, alpha=0.6, edgecolors='k')

# Reference diagonal (prediction == actual) spanning the combined value range.
min_val = min(y_test.min(), y_test_pred.min())
max_val = max(y_test.max(), y_test_pred.max())
diagonal = [min_val, max_val]
ax1.plot(diagonal, diagonal, 'r--', lw=2)

ax1.set_xlabel('Actual Credit Rating', fontsize=12)
ax1.set_ylabel('Predicted Credit Rating', fontsize=12)
ax1.set_title('Actual vs Predicted', fontsize=14, fontweight='bold')
ax1.grid(True, alpha=0.3)

# Plot 2: Feature Importance — one horizontal bar per coefficient.
# Negative coefficients are drawn green, all others (including zero) red.
# NOTE(review): whether "negative" is the favourable direction depends on how
# Credit_Rating is encoded — confirm the colour choice.
colors = coefficients_df['Coefficient'].map(
    lambda coef: 'green' if coef < 0 else 'red'
).tolist()
ax2.barh(
    coefficients_df['Feature'],
    coefficients_df['Coefficient'],
    color=colors,
    alpha=0.7,
)
ax2.set_xlabel('Coefficient Value', fontsize=12)
ax2.set_title('Feature Importance (Coefficients)', fontsize=14, fontweight='bold')

# Zero line separates negative from positive coefficients.
ax2.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
ax2.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()