In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import os

# --- Input data --------------------------------------------------------------
# Read the cleaned enrollment records; change the path if the CSV lives
# somewhere else.
data = pd.read_csv("/content/cleaned_origin_data.csv")

# Coerce 'year' to a numeric dtype. Values that cannot be parsed become NaN
# (errors="coerce") instead of raising.
data = data.assign(year=pd.to_numeric(data["year"], errors="coerce"))

# --- Plot output location ----------------------------------------------------
# Created up front; exist_ok=True makes re-running the cell harmless.
output_dir = "origin_forecast_plots"
os.makedirs(output_dir, exist_ok=True)

# --- Forecast configuration --------------------------------------------------
forecast_years = [*range(2023, 2033)]  # 2023 through 2032 inclusive
forecast_list = []  # accumulates one dict per forecast row

# Fit one linear trend per (origin, academic_type) pair, forecast every year in
# `forecast_years`, append the forecast rows to `forecast_list`, and save a
# historical-vs-forecast plot for each pair into `output_dir`.

history_start_year = 2000  # first year shown on the x-axis and in the title
first_forecast_year = forecast_years[0]
last_forecast_year = forecast_years[-1]

# Characters that are path separators or reserved in file names on common
# platforms. Space, '/', ',' and '"' keep the substitutions the script always
# applied to `origin`; the rest are added so a name can never escape
# `output_dir` or produce an invalid file name.
_FILENAME_TRANSLATION = str.maketrans({
    ' ': '_', '/': '_', ',': '_', '"': '',
    '\\': '_', ':': '_', '*': '_', '?': '_', '<': '_', '>': '_', '|': '_',
})


def _safe_filename_part(value):
    """Return *value* as text that is safe to embed in a file name."""
    return str(value).translate(_FILENAME_TRANSLATION)


for (origin, academic_type), group in data.groupby(['origin', 'academic_type']):
    # Aggregate students by year for this origin and academic_type.
    yearly_students = group.groupby('year')['students'].sum().reset_index()
    # The model is fitted and queried with the same feature name.
    yearly_students['year_numeric'] = yearly_students['year']

    # A line needs at least two distinct years.
    if len(yearly_students) < 2:
        print(f"Skipping {origin}, {academic_type}: insufficient data points ({len(yearly_students)})")
        continue

    # Train a linear regression model: students ~ year.
    X = yearly_students[['year_numeric']]
    y = yearly_students['students']
    model = LinearRegression()
    model.fit(X, y)

    # NOTE: these are in-sample metrics (scored on the data the model was
    # fitted to). With exactly two points the line passes through both, so
    # R² is trivially 1 and RMSE is 0; treat them as goodness-of-fit, not as
    # forecast accuracy.
    y_pred = model.predict(X)
    r2 = r2_score(y, y_pred)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    print(f"{origin}, {academic_type} - R²: {r2:.4f}, RMSE: {rmse:.2f}")

    # Predict future years.
    future_years_df = pd.DataFrame({'year_numeric': forecast_years})
    forecast = model.predict(future_years_df)

    # Clean the forecast once and reuse it for both the table and the plot:
    # clip negatives to zero, then round to the nearest integer (a plain
    # int() cast would truncate, e.g. 12.9 -> 12).
    forecast_students = np.rint(np.clip(forecast, 0, None)).astype(int)

    # Region is taken from the first row of the group (assumed to be
    # consistent within an origin).
    origin_region = group['origin_region'].iloc[0]

    forecast_list.extend(
        {
            'year': year,
            'origin_region': origin_region,
            'origin': origin,
            'academic_type': academic_type,
            'students': int(students),
        }
        for year, students in zip(forecast_years, forecast_students)
    )

    # Plot historical and forecasted data. The figure handle is kept so the
    # figure is always released, even if drawing or saving fails.
    fig = plt.figure(figsize=(12, 6))
    try:
        plt.plot(yearly_students['year_numeric'], yearly_students['students'], 'o-', label='Historical Data')
        plt.plot(
            forecast_years,
            forecast_students,
            'o--',
            label=f'Forecast ({first_forecast_year}-{last_forecast_year})',
            color='green',
        )
        plt.xlabel('Year')
        plt.ylabel('Number of Students')
        plt.title(
            f'Student Enrollment Forecast for {origin} - {academic_type} '
            f'({history_start_year}-{last_forecast_year})'
        )
        plt.grid(True)
        plt.legend()

        # Tick every second year across the historical + forecast span.
        x_ticks = list(range(history_start_year, last_forecast_year + 1, 2))
        plt.xticks(x_ticks, rotation=45)
        plt.tight_layout()

        # Save the plot.
        safe_origin = _safe_filename_part(origin)
        safe_academic_type = _safe_filename_part(academic_type)
        plot_filename = os.path.join(
            output_dir, f"{safe_origin}_{safe_academic_type}_forecast.png"
        )
        plt.savefig(plot_filename, dpi=300)
    finally:
        plt.close(fig)

# Build the forecast table. The columns are given explicitly so the frame has
# the expected schema even when `forecast_list` is empty (every group skipped);
# otherwise the column selection further down would raise KeyError.
forecast_columns = ['year', 'origin_region', 'origin', 'academic_type', 'students']
forecast_df = pd.DataFrame(forecast_list, columns=forecast_columns)

# Save the forecast to CSV
output_file = 'forecast_2023_2032_by_origin.csv'
forecast_df.to_csv(output_file, index=False)
print(f"Forecast saved to '{output_file}'")
print(f"Plots saved in '{output_dir}' directory")

# Combine historical and forecasted data into one CSV.
# Historical rows are taken from `data` as-is (not re-aggregated by year) and
# carry their counts in 'actual_students'.
historical_data = data[['year', 'origin_region', 'origin', 'academic_type', 'students']].rename(
    columns={'students': 'actual_students'}
)

# Forecast rows carry their counts in 'predicted_students'.
forecast_data = forecast_df.rename(columns={'students': 'predicted_students'})

# Stack the two tables. Each row has exactly one of 'actual_students' /
# 'predicted_students' filled in; the other is NaN.
combined_data = pd.concat(
    [
        historical_data,
        forecast_data[['year', 'origin_region', 'origin', 'academic_type', 'predicted_students']],
    ],
    ignore_index=True,
)

# Save combined data
combined_output_file = 'origin_data_2000_2032.csv'
combined_data.to_csv(combined_output_file, index=False)
print(f"Combined historical and forecasted data saved to '{combined_output_file}'")

Afghanistan, Graduate - R²: 0.8281, RMSE: 37.54
Afghanistan, Non-Degree - R²: 0.0019, RMSE: 13.06
Afghanistan, OPT - R²: 0.2833, RMSE: 14.86
Afghanistan, Other - R²: 0.7344, RMSE: 9.74
Afghanistan, Undergraduate - R²: 0.3824, RMSE: 42.87
Africa, Subsaharan, Unspecified, Graduate - R²: 0.2414, RMSE: 0.42
Africa, Subsaharan, Unspecified, Non-Degree - R²: 1.0000, RMSE: 0.00
Africa, Subsaharan, Unspecified, OPT - R²: 0.0938, RMSE: 0.45
Africa, Subsaharan, Unspecified, Other - R²: 1.0000, RMSE: 0.00
Africa, Subsaharan, Unspecified, Undergraduate - R²: 0.0391, RMSE: 5.27
Albania, Graduate - R²: 0.2354, RMSE: 40.78
Albania, Non-Degree - R²: 0.0809, RMSE: 7.08
Albania, OPT - R²: 0.7731, RMSE: 13.20
Albania, Other - R²: 0.2300, RMSE: 12.86
Albania, Undergraduate - R²: 0.0168, RMSE: 153.01
Algeria, Graduate - R²: 0.1066, RMSE: 19.45
Algeria, Non-Degree - R²: 0.0894, RMSE: 5.13
Algeria, OPT - R²: 0.8435, RMSE: 3.53
Algeria, Other - R²: 0.3477, RMSE: 2.54
Algeria, Undergraduate - R²: 0.1942, RMSE: