<a href="https://colab.research.google.com/github/ShreyG12345/DATA70202/blob/main/Linear_%26_Random_Forest_Combined_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This code is sensitive: it must not be shared publicly and may only be viewed by instructors.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor

# Load the complete dataset from disk and preview its first rows
data = pd.read_csv('processed_data.csv')

print(data.head())

# Columns used as model inputs
predictors = [
    'gender_encoded',
    'active_service_on_full_time_basis_2015_scheme',
    'pensionable_service_final_salary_scheme',
    'age_at_join',
    'years_until_state_pension',
    'rank',
    'part_time_proportion'
]

# Column the models are trained to predict
target_column = 'actual_pay_over_12_months_final_salary_scheme'

# Keep only the modelling columns and discard any row with a missing value
regression_data = data[[*predictors, target_column]].dropna()

# Separate the inputs (X) from the response (y)
X = regression_data[predictors]
y = regression_data[target_column]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# -----------------------------------------------------------------
# LINEAR REGRESSION

# Create an ordinary least squares model with scikit-learn's default settings
lr_model = LinearRegression()

# Fit the model on the training split only
lr_model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred = lr_model.predict(X_test)

# Evaluate model performance on the test set
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse:.4f}')
print(f'R-squared: {r2:.4f}')

# Dataframe pairing each predictor with its fitted coefficient
coeff_df = pd.DataFrame({
    'Predictor': X.columns,
    'Coefficient': lr_model.coef_
})

print(coeff_df)

# Add the absolute value of each coefficient and sort largest-first.
# NOTE(review): no feature scaling is applied in this script, so unless
# processed_data.csv is already standardised these magnitudes depend on each
# predictor's units and are not directly comparable as "importance".
coeff_df["Abs_Coefficient"] = coeff_df["Coefficient"].abs()
coeff_df_sorted = coeff_df.sort_values(by="Abs_Coefficient", ascending=False)

# Plot the absolute coefficients.
# seaborn >= 0.13 deprecates passing `palette` without `hue`, so the
# categorical variable is also assigned to `hue`. `dodge=False` keeps the bars
# full-width, and any legend that older seaborn versions add for the
# (redundant) hue is removed so the figure looks the same as before.
plt.figure(figsize=(12, 8))
coef_ax = sns.barplot(
    x="Abs_Coefficient",
    y="Predictor",
    hue="Predictor",
    dodge=False,
    data=coeff_df_sorted,
    palette="Blues_d",
)
if coef_ax.get_legend() is not None:
    coef_ax.get_legend().remove()
plt.title("Linear Regression Coefficients by Absolute Value", fontsize=16)
plt.xlabel("Absolute Coefficient Value", fontsize=14)
plt.ylabel("Feature", fontsize=14)
plt.grid(False)
plt.tight_layout()
plt.show()



# --------------------------------------------------------------------
# RANDOM FOREST

# Create and fit the Random Forest model (100 trees, fixed seed for repeatability)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate model performance on the test set
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Feature importances reported by the fitted forest, largest first
feature_importances = pd.Series(rf_model.feature_importances_, index=predictors).sort_values(ascending=False)

# Collect the results in one place for later inspection
rf_results = {
    "RMSE": rmse_rf,
    "R-squared": r2_rf,
    "Feature Importances": feature_importances
}

# Print the results explicitly: a bare expression in the middle of a notebook
# cell (or anywhere in a script) is evaluated but never displayed.
print(f'Random Forest RMSE: {rmse_rf:.4f}')
print(f'Random Forest R-squared: {r2_rf:.4f}')
print(feature_importances)

# Plot 1: Feature Importances
plt.figure(figsize=(12, 6))
sns.barplot(x=feature_importances.values, y=feature_importances.index, color='red')
plt.title("Feature Importances", fontsize=14)
plt.xlabel("Importance Score", fontsize=12)
plt.ylabel("Feature", fontsize=12)
plt.yticks(fontsize=10)
plt.tight_layout()
plt.grid(False)
plt.show()