In [3]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('/content/sample_data/field_of_study.csv')

# Clean the data
df['year'] = df['year'].str[:4].astype(int)  # Extract year and convert to integer
df['students'] = pd.to_numeric(df['students'], errors='coerce')  # Convert students to numeric, replacing non-numeric values with NaN
df = df.dropna()  # Remove rows with NaN values

# Group by year and field_of_study, summing the students
df_cleaned = df.groupby(['year', 'field_of_study'])['students'].sum().reset_index()

# Display the first few rows of the cleaned data
print(df_cleaned.head())

# Save the cleaned data to a new CSV file
df_cleaned.to_csv('cleaned_field_of_study_data.csv', index=False)
print("Cleaned data has been saved to 'cleaned_field_of_study_data.csv'")


   year                 field_of_study  students
0  2007                    Agriculture    8945.0
1  2007        Business and Management  110041.0
2  2007  Communications and Journalism   12135.0
3  2007                      Education   17775.0
4  2007                    Engineering   96189.0
Cleaned data has been saved to 'cleaned_field_of_study_data.csv'


In [4]:
null_values=df.isnull().sum()

In [34]:
print(null_values)

year              0
field_of_study    0
major             0
students          0
dtype: int64


In [35]:
from google.colab import files
files.download('cleaned_field_of_study_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [36]:
df

Unnamed: 0,year,field_of_study,students
0,2007-01-01,Agriculture,8945.0
1,2007-01-01,Business and Management,110041.0
2,2007-01-01,Communications and Journalism,12135.0
3,2007-01-01,Education,17775.0
4,2007-01-01,Engineering,96189.0
...,...,...,...
235,2022-01-01,Math and Computer Science,240230.0
236,2022-01-01,Other Fields of Study,87962.0
237,2022-01-01,Physical and Life Sciences,84830.0
238,2022-01-01,Social Sciences,85998.0


In [32]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
import os

# Load the dataset
file_path = '/content/sample_data/cleaned_field_of_study_data.csv'
df = pd.read_csv(file_path)

# Ensure the 'year' column is properly formatted
df['year'] = df['year'].astype(str).str.split('/').str[0]  # Extract the first part of the year
df['year'] = pd.to_datetime(df['year'], format='%Y', errors='coerce')

# Handle missing values
df = df.ffill()  # Forward fill missing values

# Create an empty list to store forecast results
forecast_list = []

# Create a directory to store the plots
output_dir = "field_forecast_plots"
os.makedirs(output_dir, exist_ok=True)

# Forecasting parameters
forecast_steps = 10  # Forecast for 10 years (2023-2032)

# Iterate over each field_of_study
for field, group in df.groupby('field_of_study'):
    # Group by year and sum the students
    yearly_students = group.groupby('year')['students'].sum().reset_index()
    yearly_students.set_index('year', inplace=True)

    # Fit the ARIMA model
    model = ARIMA(yearly_students['students'], order=(5, 1, 0))
    model_fit = model.fit()

    # Forecast for 2023-2032
    forecast_index = pd.date_range(start=yearly_students.index[-1] + pd.DateOffset(years=1),
                                   periods=forecast_steps, freq='Y')
    forecast = model_fit.forecast(steps=forecast_steps)

    # Store the forecast results
    for i in range(len(forecast)):
        forecast_list.append({
            'field_of_study': field,
            'year': forecast_index[i].year,
            'forecasted_students': forecast[i]
        })

    # Plotting the forecast for each field_of_study
    plt.figure(figsize=(12, 6))
    plt.plot(yearly_students['students'], label='Historical Data')
    plt.plot(forecast_index, forecast, label='Forecast (2023-2032)', color='green', linestyle='--')
    plt.xlabel('Year')
    plt.ylabel('Number of Students')
    plt.title(f'ARIMA Forecast for {field} (2023-2032)')
    plt.legend()
    plt.grid(True)

    # Save the plot as a PNG image
    plot_filename = f"{output_dir}/{field.replace(' ', '_')}_forecast.png"
    plt.savefig(plot_filename)
    plt.close()

# Convert the results to a DataFrame
forecast_df = pd.DataFrame(forecast_list)

# Save the forecasted values to a CSV file
output_file = 'forecast_2023_2032_by_field.csv'
forecast_df.to_csv(output_file, index=False)

print(f"Forecast saved to {output_file}")
print(f"Plots saved in '{output_dir}' directory.")


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  forecast_index = pd.date_range(start=yearly_students.index[-1] + pd.DateOffset(years=1),
  'forecasted_students': forecast[i]
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  forecast_index = pd.date_range(start=yearly_students.index[-1] + pd.DateOffset(years=1),
  'forecasted_students': forecast[i]
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  forecast_index = pd.date_range(start=yearly_students.index[-1] + pd.DateOffset(years=1),
  'forecasted_students': forecast[i]
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  forecast_index = pd.date_range(start=yearly_students.index[-1] + pd.DateOffset(years=1),
  'forecasted_students': forecast[i]
  self._init_dates(dates, freq)
  self._init_

Forecast saved to forecast_2023_2032_by_field.csv
Plots saved in 'field_forecast_plots' directory.
