In [44]:
from google.colab import drive

# Mount Google Drive at /content/drive; the working directory used by the
# following cells lives underneath this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
# Dependency: pmdarima (imported below). If it is missing from this runtime, run: %pip install pmdarima

In [46]:
# Switch to the project folder on Drive so the relative CSV paths used by the
# read/write calls in this notebook resolve inside it.
cd "/content/drive/My Drive/Colab Notebooks/Predicta/Time_Series"

/content/drive/My Drive/Colab Notebooks/Predicta/Time_Series


In [47]:
import pandas as pd
import numpy as np
from math import sqrt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from pmdarima import auto_arima
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides any convergence or index warnings raised while
# the models are fitted -- consider narrowing the filter while debugging.
warnings.filterwarnings("ignore")

# Load data
dataf = pd.read_csv("historical_weather.csv")

# Handle missing values (if any).
# The fill is done per city: a frame-wide forward fill would copy the last
# temperature of one city into a gap at the start of the next city's rows.
# Forward fill first; a gap at the very start of a city's series has nothing
# before it, so it is back-filled from that city's first observed value.
# `fillna(method='ffill')` is deprecated in recent pandas, hence ffill()/bfill().
# Assumes rows are in chronological order within each city (as in the preview
# of the file) -- TODO confirm for the full data set.
dataf['avg_temp_c'] = dataf.groupby('city_id')['avg_temp_c'].ffill()
dataf['avg_temp_c'] = dataf.groupby('city_id')['avg_temp_c'].bfill()

In [48]:
# Preview the first rows of the loaded frame (columns and sample values).
dataf.head()

Unnamed: 0,city_id,date,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,snow_depth_mm,avg_wind_dir_deg,avg_wind_speed_kmh
0,C001,2014-01-01,6.6,-1.4,11.6,,,168.0,6.2
1,C001,2014-01-02,9.3,6.3,13.3,,,155.0,10.0
2,C001,2014-01-03,7.6,1.9,14.0,,,,5.8
3,C001,2014-01-04,7.6,3.9,13.3,,,291.0,11.3
4,C001,2014-01-05,8.6,0.5,16.9,,,,5.0


In [49]:
# City IDs in order of first appearance in the main DataFrame
unique_cities = dataf['city_id'].unique()

# One independent DataFrame per city, keyed by city ID. Each frame holds only
# that city's rows and uses the 'date' column as its index.
city_dfs = {
    cid: dataf.loc[dataf['city_id'] == cid].set_index('date')
    for cid in unique_cities
}

In [50]:
# Function to perform ADF test
def adf_test(dataset):
    """Run the Augmented Dickey-Fuller test on a series and return its result.

    Args:
        dataset: The series of observations to test (passed straight to
            ``adfuller``).

    Returns:
        The value returned by ``adfuller(dataset, autolag='AIC')``. Per the
        statsmodels documentation this is a tuple beginning with the test
        statistic and the p-value.
    """
    dftest = adfuller(dataset, autolag='AIC')
    # Previously the result was computed and then dropped, so the test had no
    # observable effect. Returning it lets callers inspect the outcome;
    # callers that ignore the return value are unaffected.
    return dftest

In [51]:
# Accumulators filled in by the per-city loop
rmse_values = []       # one hold-out RMSE per processed city
avg_rmse = 0           # running mean of rmse_values
all_predictions = []   # one forecast Series per city
submission_data = []   # one dict per forecast row (city, date, temperature)

In [52]:
# Fixed model configuration shared by every city
arima_order = (1, 0, 5)   # (p, d, q) used for both the hold-out and final model
horizon = 7               # days held out for scoring and days forecast ahead

# Iterate through each city
for city_id in unique_cities:
    print(f"Processing city: {city_id}")  # Progress indicator
    df = city_dfs[city_id]
    temps = df['avg_temp_c']

    # ADF test (the result is not used to choose the model order here)
    adf_test(temps)

    # NOTE: the per-city auto_arima search that used to run here was removed.
    # Its result was never read (the order above is fixed), so it only added
    # fitting time for every city.

    # Hold out the last `horizon` observations and score a model fitted on the rest
    train = temps.iloc[:-horizon]
    test = temps.iloc[-horizon:]
    holdout_model = ARIMA(train, order=arima_order).fit()
    start = len(train)
    end = len(train) + len(test) - 1
    # `typ='levels'` was dropped: it is not a parameter of this results
    # object's predict(), which already returns values on the original scale.
    holdout_pred = holdout_model.predict(start=start, end=end).rename('ARIMA predictions')

    # Calculate RMSE with test data (arguments in y_true, y_pred order)
    rmse = sqrt(mean_squared_error(test, holdout_pred))
    rmse_values.append(rmse)
    avg_rmse = sum(rmse_values) / len(rmse_values)
    print(f"{city_id} : {rmse} : {avg_rmse}")

    # Model two for real prediction: refit on the full series and forecast ahead
    model2 = ARIMA(temps, order=arima_order).fit()
    index_future_dates = pd.date_range(start='2019-01-01', end='2019-01-07')
    pred = model2.predict(start=len(df), end=len(df) + horizon - 1).rename('ARIMA Predictions')
    # Label the forecast with calendar dates.
    # NOTE(review): assumes every city's series ends on 2018-12-31 -- confirm.
    pred.index = index_future_dates

    # Store predictions and create submission format
    all_predictions.append(pred)
    for date, temp in pred.items():
        submission_data.append({
            # IDs are sequential across all cities, starting at 1
            "submission_ID": len(submission_data) + 1,
            "city_id": city_id,
            "date": date,
            "avg_temp_c": temp
        })



Processing city: C001
C001 : 0.8718346511051867 : 0.8718346511051867
Processing city: C002
C002 : 1.7500709045455083 : 1.3109527778253476
Processing city: C003
C003 : 0.736779284756369 : 1.1195616134690214
Processing city: C004
C004 : 4.189291458185542 : 1.8869940746481515
Processing city: C005
C005 : 2.2655329840392646 : 1.9627018565263739
Processing city: C007
C007 : 5.027891638454883 : 2.473566820181125
Processing city: C008
C008 : 2.442659845904999 : 2.4691515381416784
Processing city: C009
C009 : 3.6570513622242746 : 2.617639016152003
Processing city: C010
C010 : 2.5079418977555847 : 2.60545044744129
Processing city: C011
C011 : 3.540124473098344 : 2.6989178500069952
Processing city: C012
C012 : 3.5333703490093673 : 2.77477716809812
Processing city: C013
C013 : 1.5954044123564692 : 2.6764961051196487
Processing city: C014
C014 : 2.6428300134232146 : 2.673906405758385
Processing city: C015
C015 : 0.6324285439496914 : 2.5280865584863355
Processing city: C016
C016 : 0.835270550106338

In [53]:
# Long-format table: one row per (city, date) forecast
submission_df = pd.DataFrame(submission_data)

# Wide-format table: one column per city, forecasts aligned side by side
all_predictions_df = pd.concat(all_predictions, axis=1)
all_predictions_df.columns = unique_cities  # Label each column with its city ID

# Write the outputs to CSV:
#   sample_submission.csv    -> only the ID and the predicted temperature
#   submission_key.csv       -> every column of the long-format table
#   all_city_predictions.csv -> the wide table, index included
submission_columns = ['submission_ID', 'avg_temp_c']
submission_df[submission_columns].to_csv("sample_submission.csv", index=False)
submission_df.to_csv("submission_key.csv", index=False)
all_predictions_df.to_csv("all_city_predictions.csv")