In [None]:
# Standard library imports
import sys
import datetime as datetime
# Third-party library imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
import matplotlib.pylab
from matplotlib import pyplot
import seaborn as sns
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.seasonal import seasonal_decompose
# Pandas-specific import
from pandas import Series

In [None]:
# Import necessary libraries
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.api import VAR
import pandas as pd
import numpy as np
from google.colab import files

# Open the Colab upload widget; the result is iterated like a dict keyed by
# file name, and len() of each value is reported as the size in bytes
uploaded = files.upload()

# Report every upload and parse it as CSV. `filename` and `data` stay bound to
# the last file iterated once the loop finishes.
for filename in uploaded:
    print(f'User uploaded file "{filename}" with length {len(uploaded[filename])} bytes')
    data = pd.read_csv(filename)

# Load and preprocess the dataset
def preprocess_dataset(file_path):
    """Load the CSV at *file_path* and derive date/time-of-day features.

    Args:
        file_path: Anything accepted by ``pd.read_csv``. The file must provide
            the columns ``LocalDate``, ``LocalTime``, ``lat`` and ``long``.

    Returns:
        A new DataFrame with the columns ``datetime``, ``lat``, ``long``,
        ``day``, ``month``, ``year`` and ``time_numeric`` (seconds elapsed
        since midnight of each row's timestamp).
    """
    # Load the dataset
    df = pd.read_csv(file_path)

    # Combine LocalDate and LocalTime into a single datetime column
    df['datetime'] = pd.to_datetime(df['LocalDate'] + ' ' + df['LocalTime'])

    # Keep only the relevant columns. Take an explicit copy so the column
    # assignments below write to an independent frame instead of a slice of
    # the loaded one (which triggers pandas' SettingWithCopyWarning).
    df = df[['datetime', 'lat', 'long']].copy()

    stamp = df['datetime'].dt
    df['day'] = stamp.day
    df['month'] = stamp.month
    df['year'] = stamp.year
    df['time_numeric'] = stamp.hour * 3600 + stamp.minute * 60 + stamp.second

    return df

# Linear Regression Implementation
def run_linear_regression(dropped_df):
    """Cross-validate a multi-output linear regression on the date features.

    Args:
        dropped_df: DataFrame providing the columns ``day``, ``month``,
            ``year``, ``lat``, ``long`` and ``time_numeric``.

    Returns:
        Array of four RMSE values averaged over the 5 ``TimeSeriesSplit``
        folds, ordered as (lat, long, day, time_numeric).
    """
    # Features are the calendar fields; targets are position, day and time.
    # NOTE(review): 'day' is both a feature and a target - confirm intended.
    X = dropped_df[['day', 'month', 'year']].to_numpy()
    Y = dropped_df[['lat', 'long', 'day', 'time_numeric']].to_numpy()

    fold_rmse = []
    for train_idx, test_idx in TimeSeriesSplit(n_splits=5).split(X):
        # One regressor fitted on the training rows covers all four targets
        model = LinearRegression().fit(X[train_idx], Y[train_idx])

        # Per-target MSE on the held-out rows, converted to RMSE
        fold_mse = mean_squared_error(
            Y[test_idx], model.predict(X[test_idx]), multioutput='raw_values'
        )
        fold_rmse.append(np.sqrt(fold_mse))

    # Average each target's RMSE across the folds
    return np.mean(fold_rmse, axis=0)

# Polynomial Regression Implementation
def run_polynomial_regression(dropped_df):
    """Cross-validate a degree-4 polynomial regression on the date features.

    Args:
        dropped_df: DataFrame providing the columns ``day``, ``month``,
            ``year``, ``lat``, ``long`` and ``time_numeric``.

    Returns:
        Array of four RMSE values averaged over the 5 ``TimeSeriesSplit``
        folds, ordered as (lat, long, day, time_numeric).
    """
    # NOTE(review): 'day' is both a feature and a target - confirm intended.
    raw_features = dropped_df[['day', 'month', 'year']].to_numpy()
    Y = dropped_df[['lat', 'long', 'day', 'time_numeric']].to_numpy()

    # Expand the calendar fields into polynomial terms up to degree 4
    # (no constant column; LinearRegression fits its own intercept)
    X = PolynomialFeatures(degree=4, include_bias=False).fit_transform(raw_features)

    fold_rmse = []
    for train_idx, test_idx in TimeSeriesSplit(n_splits=5).split(X):
        # One regressor fitted on the training rows covers all four targets
        model = LinearRegression().fit(X[train_idx], Y[train_idx])

        # Per-target MSE on the held-out rows, converted to RMSE
        fold_mse = mean_squared_error(
            Y[test_idx], model.predict(X[test_idx]), multioutput='raw_values'
        )
        fold_rmse.append(np.sqrt(fold_mse))

    # Average each target's RMSE across the folds
    return np.mean(fold_rmse, axis=0)

# VAR Implementation
def run_var_model(dropped_df, obs=1435, lags=2):
    """Fit a VAR model on all but the last *obs* rows and score the forecast.

    Args:
        dropped_df: DataFrame providing the columns ``lat``, ``long``, ``day``
            and ``time_numeric``, in the row order to be modelled.
        obs: Number of trailing rows held out as the test set and forecast.
            Defaults to 1435, the value previously hard-coded here.
        lags: Lag order passed to ``VAR.fit``. Defaults to 2, as before.

    Returns:
        List of four RMSE values between forecast and held-out rows, ordered
        as (lat, long, day, time_numeric).

    Raises:
        ValueError: If *obs* or *lags* is smaller than 1, or if holding out
            *obs* rows leaves no more than *lags* rows for training.
    """
    series = dropped_df[['lat', 'long', 'day', 'time_numeric']]
    n_rows = len(series)

    # Fail early with a readable message; otherwise an empty or too-short
    # training slice only surfaces as an opaque error inside the model code.
    if obs < 1 or lags < 1:
        raise ValueError(f"obs and lags must be >= 1 (got obs={obs}, lags={lags})")
    if n_rows - obs <= lags:
        raise ValueError(
            f"cannot hold out {obs} rows from {n_rows}: "
            f"need more than {lags} rows left for training"
        )

    # Positional split: everything before the last `obs` rows trains the model
    split = n_rows - obs
    train, test = series.iloc[:split], series.iloc[split:]

    # Fit the VAR model with the requested lag order
    model_fitted = VAR(train).fit(lags)

    # Seed the forecast with the last k_ar training rows and predict `obs` steps
    forecast_input = train.values[-model_fitted.k_ar:]
    fc = model_fitted.forecast(y=forecast_input, steps=obs)

    # Per-column RMSE of the forecast against the held-out rows
    mse = mean_squared_error(test.values, fc, multioutput='raw_values')
    return list(np.sqrt(mse))

# Main function to run the models
def main(file_path):
    """Preprocess the CSV at *file_path*, run all three models, print RMSEs.

    Args:
        file_path: Path of the CSV file handed to ``preprocess_dataset``.
    """
    dropped_df = preprocess_dataset(file_path)

    # Evaluate every model before printing anything, so a failure in a later
    # model leaves no partial report behind
    report = [
        ("Linear Regression Results (Lat RMSE, Lon RMSE, Day RMSE, Time RMSE):",
         run_linear_regression(dropped_df)),
        ("Polynomial Regression Results (Lat RMSE, Lon RMSE, Day RMSE, Time RMSE):",
         run_polynomial_regression(dropped_df)),
        ("VAR Model Results (Lat RMSE, Lon RMSE, Day RMSE, Time RMSE):",
         run_var_model(dropped_df)),
    ]

    # Display results
    for heading, rmse_values in report:
        print(heading, rmse_values)

# Run the full pipeline on the uploaded CSV. `filename` is the loop variable
# left over from the upload loop above, i.e. the last file that was uploaded.
main(filename)


Saving Seri.csv to Seri (1).csv
User uploaded file "Seri (1).csv" with length 884978 bytes
Linear Regression Results (Lat RMSE, Lon RMSE, Day RMSE, Time RMSE): [1.23034893e-01 1.64084080e-01 6.98353016e-15 2.51866275e+04]
Polynomial Regression Results (Lat RMSE, Lon RMSE, Day RMSE, Time RMSE): [2.39565328e+00 1.05025864e+00 7.19627869e-06 4.83988128e+05]
VAR Model Results (Lat RMSE, Lon RMSE, Day RMSE, Time RMSE): [0.22158249008289918, 0.08944050867078301, 8.689443127906408, 25092.627768192582]
