# Desafio DS - Lighthouse - P02

## 1. Understanding how to work

### 1.1 Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pmdarima.arima import auto_arima
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.api import SimpleExpSmoothing

### 1.2 Describing the file

In [None]:
# Limit how many columns and rows pandas prints when a frame is displayed.
pd.set_option("display.max_columns", 10)
pd.set_option("display.max_rows", 10)

# Load the dataset with the 'Year' column as the index, parsed as dates.
df_original = pd.read_csv("df_final_with_no_na_values.csv",index_col='Year', parse_dates=True)
# Clear the index name so no label is shown above the index.
df_original.index.name = None
df_original

In [None]:
# Summary statistics over every row except the last five, rounded to whole numbers.
df_original.iloc[:-5,:].describe().round()

### 1.3 Creating a random list
I decided to create a random list of columns to better see what is happening with the series.

In [None]:
import random

# The positions below were drawn once with the snippet that follows and then
# frozen, so every run inspects the same sample of series:
#     random_columns_list = [random.randint(0,227) for _ in range(10)]
#     print(random_columns_list)
random_list = [1, 8, 23, 33, 55, 86, 92, 100, 150, 214]

# Translate the sampled positions into column labels.
random_list_columns = df_original.columns[random_list]
random_list_columns

#### 1.3.1 Plot with seasonal decompose

In [None]:
def test_seasonal_decompose(df, list_columns, period=10):
    """Plot the seasonal decomposition of each selected series.

    Args:
        df: DataFrame holding one time series per column.
        list_columns: Iterable of column labels of ``df`` to decompose.
        period: Period passed to ``seasonal_decompose``. Defaults to 10,
            the value that was previously hard-coded.
    """
    for column in list_columns:
        resultado = seasonal_decompose(df[column], period=period)
        # ``DecomposeResult.plot`` builds its own figure, so resize that one
        # instead of opening a separate (and otherwise blank) 8x6 figure.
        fig = resultado.plot()
        fig.set_size_inches(8, 6)

test_seasonal_decompose(df_original,random_list_columns)

## 2.  Creating functions

### 2.2 Metrics

In [None]:
def metrics(y_true, y_pred, list_name, model_name):
    """Compute per-series error metrics between observed and predicted values.

    Args:
        y_true: DataFrame of observed values, one column per series.
        y_pred: DataFrame of predicted values using the same column labels.
        list_name: Iterable of column labels to evaluate.
        model_name: Label stored in the ``Model`` column of every row.

    Returns:
        DataFrame with one row per label and the columns ``Country``,
        ``Model``, ``mae``, ``rmse`` and ``mape``; each metric is rounded to
        two decimals.
    """
    rows = []
    for name in list_name:
        observed = y_true[[name]].values
        predicted = y_pred[[name]].values

        # Mean absolute error in the units of the data. The previous formula
        # divided by the observed values and scaled by 100, which is a
        # percentage error (already reported as ``mape``), not the MAE.
        mae = round(np.mean(np.abs(observed - predicted)), 2)
        rmse = round(np.sqrt(mean_squared_error(observed, predicted)), 2)
        mape = round(mean_absolute_percentage_error(observed, predicted), 2)

        rows.append({'Country': name, 'Model': model_name, 'mae': mae, 'rmse': rmse, 'mape': mape})

    return pd.DataFrame(rows)

### 2.3 Adfuller

This function runs the Augmented Dickey-Fuller (ADF) test, prints the result for a sample of countries, and creates two lists: one for countries with stationary time series and another for countries with non-stationary time series.

In [None]:
def test_adfuller(df,sample_list):
    """Classify every series of ``df`` as stationary or not with the ADF test.

    The test is run on the columns of ``df`` exactly as passed in. Earlier
    versions ignored ``df`` and read the notebook-level ``df_original`` with
    its last five rows removed; pass the already-trimmed training frame to
    get the same result.

    Args:
        df: DataFrame holding one time series per column.
        sample_list: Iterable of column labels of ``df`` whose ADF statistic
            and p-value are printed.

    Returns:
        Tuple ``(list_stationary, list_non_stationary)`` of column labels; a
        series is counted as stationary when its p-value is <= 0.05.
    """
    list_stationary = []
    list_non_stationary = []
    cache = {}

    def _adf(column):
        # Run the test once per column and reuse it for report and classification.
        if column not in cache:
            cache[column] = adfuller(df[column])
        return cache[column]

    print("Result of p-value to sample data \n")

    for column in sample_list:
        result = _adf(column)
        print(column)
        print(f'ADF: {result[0]}')
        print(f'p-value: {result[1]}')
        print('-'*50)

    for column in df.columns:
        if _adf(column)[1] <= 0.05:
            list_stationary.append(column)
        else:
            list_non_stationary.append(column)

    total_columns = len(df.columns)
    stationary_total = len(list_stationary)
    non_stationary_total = len(list_non_stationary)

    # Guard the percentages against a frame without columns.
    stationary_p = (stationary_total/total_columns*100) if total_columns else 0.0
    non_stationary_p = (non_stationary_total/total_columns*100) if total_columns else 0.0

    print(f"\nTotal of stationary is {stationary_total} and it represents {stationary_p} %")
    print(f"Total of Non stationary is {non_stationary_total} and it represents {non_stationary_p} %")

    return (list_stationary,list_non_stationary)

### 2.4 Auto ARIMA Model

In [None]:
def auto_arima_model(df_train, list_columns, trend= None, d=None, stationary=True, seasonal=True, n_periods=None):
    """Fit one Auto ARIMA model per column and collect the forecasts.

    Args:
        df_train: DataFrame with the training observations, one series per column.
        list_columns: Iterable of column labels of ``df_train`` to model.
        trend: Forwarded to ``auto_arima`` as ``trend``.
        d: Forwarded to ``auto_arima`` as ``d``.
        stationary: Forwarded to ``auto_arima`` as ``stationary``.
        seasonal: Forwarded to ``auto_arima`` as ``seasonal``.
        n_periods: Number of steps to forecast. When omitted, the number of
            rows of the notebook-level ``df_test`` is used, as before.

    Returns:
        DataFrame with one column of forecasts per entry of ``list_columns``,
        rounded to whole numbers; an empty DataFrame when ``list_columns`` is
        empty. Earlier versions returned only the forecast of the last column.
    """
    if n_periods is None:
        # Backward-compatible default: forecast as many steps as the
        # notebook-level test frame has rows.
        n_periods = df_test.shape[0]

    predictions = []

    for column in list_columns:
        model_sarimax = auto_arima(
            df_train[column],
            start_p=0, max_p=2,
            start_q=0, max_q=2,
            # Seasonal order bounds are left at the library defaults:
            # start_P=0, max_P=2,
            # start_Q=0, max_Q=2,
            m=10,
            seasonal=seasonal,
            stationary=stationary,
            d=d,
            trend=trend,
            test='adf',
            error_action='ignore',  # was misspelled ``error_acCtion``
            stepwise=True,
        )

        pred_sarimax = model_sarimax.predict(n_periods)

        # Name the forecast after its column. ``pd.DataFrame(series, columns=[...])``
        # selects by label and gives NaN when the forecast Series has another name.
        predictions.append(pd.Series(pred_sarimax, name=column))

    if not predictions:
        return pd.DataFrame()

    return pd.concat(predictions, axis=1).round()

### 2.5 Simple Exponential Smoothing Model

#### 2.5.1 Smoothing level

This function finds the best smoothing level for each country.

In [None]:
def find_smoothing_level(df_train, df_test, columns):
    """Pick, per column, the smoothing level with the lowest forecast MSE.

    Ten candidate levels evenly spaced between 0.01 and 1 are tried. For each
    one a ``SimpleExpSmoothing`` model is fitted on ``df_train`` and its
    forecast is scored against ``df_test`` with the mean squared error.

    Args:
        df_train: DataFrame with the training observations.
        df_test: DataFrame with the observations the forecasts are scored against.
        columns: Iterable of column labels present in both frames.

    Returns:
        DataFrame with the columns ``Country`` and ``best_smoothing_level``,
        one row per entry of ``columns``.
    """
    candidate_levels = np.linspace(0.01, 1, 10)
    records = []

    for country in columns:
        lowest_mse = float('inf')
        chosen_level = None

        for level in candidate_levels:
            fitted = SimpleExpSmoothing(df_train[country]).fit(smoothing_level=level)
            predicted = fitted.forecast(steps=len(df_test[country]))
            current_mse = mean_squared_error(df_test[country], predicted)

            # Strict comparison: on ties the first (smallest) level is kept.
            if current_mse < lowest_mse:
                lowest_mse, chosen_level = current_mse, level

        records.append({'Country': country, 'best_smoothing_level': chosen_level})

    return pd.DataFrame(records)

#### 2.5.2 Model 

In [None]:
def simple_smoothing(df_train, df_test, smoothing_level):
    """Forecast every listed series with Simple Exponential Smoothing.

    Args:
        df_train: DataFrame with the training observations.
        df_test: DataFrame whose number of rows sets the forecast horizon.
        smoothing_level: DataFrame with the columns ``Country`` (column label
            in ``df_train``) and ``best_smoothing_level`` (level for that series).

    Returns:
        DataFrame with one column of forecasts per row of ``smoothing_level``,
        rounded to whole numbers; an empty DataFrame when ``smoothing_level``
        has no rows.
    """
    predictions = []

    # Pair the two columns row by row. The previous loop walked the index
    # labels but looked values up by position, which only agrees for a
    # default 0..n-1 index.
    pairs = zip(smoothing_level['Country'], smoothing_level['best_smoothing_level'])

    for column, level in pairs:
        model_ses = SimpleExpSmoothing(
            df_train[column],
            initialization_method='estimated',  # 'heuristic' is the alternative
        ).fit(smoothing_level=level, optimized=True)

        pred_ses = model_ses.forecast(df_test.shape[0])

        # Name the forecast after its column so it is never dropped by
        # label-based column selection.
        predictions.append(pd.Series(pred_ses, name=column))

    if not predictions:
        return pd.DataFrame()

    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(predictions, axis=1).round()

### 2.6 Plot Test and Prediction

In [None]:
def plot_test_pred(df_train,df_test,df_pred,random_columns_list, model_type):
    """Plot train, predicted and test values for a selection of series.

    Args:
        df_train: DataFrame with the training observations.
        df_test: DataFrame with the held-out observations.
        df_pred: DataFrame with the predictions; the selection is made on its columns.
        random_columns_list: Integer positions of the columns of ``df_pred`` to plot.
        model_type: Model name shown in each plot title.
    """
    selected = df_pred.columns[random_columns_list]

    for name in selected:
        _, ax = plt.subplots(figsize=(8, 3))

        # Drawing order must match the legend entries below.
        for frame in (df_train, df_pred, df_test):
            ax.plot(frame[name].index, frame[name])

        ax.legend(['Train','Pred','Test'])
        ax.set_title(f'Model {model_type} - {name}')
        plt.show()

## 3. Applying models

### 3.1 Train series and Test series

In [None]:
# Hold out the last TEST_SIZE observations for evaluation; everything before
# them is training data. Both slices use the same constant so they can neither
# overlap nor leave a gap, whatever the number of rows.
TEST_SIZE = 5
df_train = df_original.iloc[:-TEST_SIZE].copy()
df_test = df_original.iloc[-TEST_SIZE:].copy()

### 3.2 List of stationary and non-stationary series

Find the lists of countries whose series are stationary and non-stationary, and print the ADF test results for a sample of countries.

In [None]:
# Split the columns into stationary / non-stationary lists; the sampled columns are only used for the printed report.
list_stationary,list_non_stationary = test_adfuller(df_train,random_list_columns)

In [None]:
list_stationary

In [None]:
list_non_stationary

#### 3.2.1 Creating a random list with stationary series and non stationary list

In [None]:
# Integer positions sampled once with the commented snippet below and then
# frozen; they are later passed to plot_test_pred to pick columns by position.
# random_stationary_list = [random.randint(0,193) for _ in range(10)]
# random_non_stationary_list = [random.randint(0,33) for _ in range(10)]

# print(random_stationary_list)
# print(random_non_stationary_list)

# Output of the snippet above (position 11 appears twice in the second list).
random_stationary_list = [4, 15, 22, 58, 113, 131, 150, 181, 188, 189]
random_non_stationary_list = [3, 4, 11, 11, 16, 24, 25, 29, 30, 33]

### 3.3 Auto Arima

#### 3.3.1 Model Auto ARIMA

##### A) Stationary

In [None]:
# Fit Auto ARIMA on the series classified as stationary, with the function's defaults (stationary=True, d=None).
df_pred_auto_arima_stationary = auto_arima_model(df_train, list_stationary)
df_pred_auto_arima_stationary

##### B) Non Stationary

In [None]:
# Fit Auto ARIMA on the series classified as non-stationary; stationary=False and d=1 are forwarded to auto_arima.
df_pred_auto_arima_non_stationary = auto_arima_model(df_train, list_non_stationary, stationary = False, d=1)
df_pred_auto_arima_non_stationary

#### 3.3.2 Metrics Auto ARIMA

##### A) Stationary

In [None]:
df_metric_auto_arima_stationary = metrics(df_test, df_pred_auto_arima_stationary,list_stationary,'Auto ARIMA')
df_metric_auto_arima_stationary

##### B) Non Stationary

In [None]:
df_metric_auto_arima_non_stationary = metrics(df_test, df_pred_auto_arima_non_stationary,list_non_stationary,'Auto ARIMA')
df_metric_auto_arima_non_stationary

##### C) Merge both metrics

In [None]:
# Stack the metric rows, non-stationary first; the original row labels are kept (no ignore_index).
df_metrics_auto_arima = pd.concat([df_metric_auto_arima_non_stationary,df_metric_auto_arima_stationary])
df_metrics_auto_arima

#### 3.3.3 Plots the prediction for some random series

plot_test_pred(df_train ,df_test,df_pred_auto_arima_stationary,random_stationary_list, 'Auto Arima')

In [None]:
plot_test_pred(df_train ,df_test,df_pred_auto_arima_non_stationary,random_non_stationary_list,'SARIMA')

### 3.4 Simple Exponential Smoothing

#### 3.4.1 Smoothing level
Find smoothing level for each country

##### A) Stationary

In [None]:
smoothing_level_stationary = find_smoothing_level(df_train, df_test, list_stationary)

In [None]:
smoothing_level_stationary

##### B) Non Stationary

In [None]:
smoothing_level_non_stationary = find_smoothing_level(df_train, df_test, list_non_stationary)

In [None]:
smoothing_level_non_stationary

#### 3.4.2 Model Simple Exponential Smoothing

##### A) Stationary

In [None]:
df_pred_simple_smoothing_stationary =simple_smoothing(df_train, df_test, smoothing_level_stationary)
df_pred_simple_smoothing_stationary

##### B) Non Stationary

In [None]:
df_pred_simple_smoothing_non_stationary =simple_smoothing(df_train, df_test, smoothing_level_non_stationary)
df_pred_simple_smoothing_non_stationary

#### 3.4.3 Metrics Simple Exponential Smoothing

##### A) Stationary

In [None]:
df_metric_simple_smoothing_stationary = metrics(df_test, df_pred_simple_smoothing_stationary,list_stationary,'Simple Smoothing')
df_metric_simple_smoothing_stationary

##### B) Non Stationary

In [None]:
df_metric_simple_smoothing_non_stationary = metrics(df_test, df_pred_simple_smoothing_non_stationary,list_non_stationary,'Simple Smoothing')
df_metric_simple_smoothing_non_stationary

##### C) Merge both metrics

In [None]:
# Stack the metric rows, non-stationary first; the original row labels are kept (no ignore_index).
df_metrics_simple_smoothing = pd.concat([df_metric_simple_smoothing_non_stationary,df_metric_simple_smoothing_stationary])
df_metrics_simple_smoothing

#### 3.4.4 Plots the prediction for some random series

In [None]:
plot_test_pred(df_train ,df_test,df_pred_simple_smoothing_stationary,random_stationary_list, 'Simple Smoothing')

In [None]:
plot_test_pred(df_train ,df_test,df_pred_simple_smoothing_non_stationary,random_non_stationary_list,'Simple Smoothing')

## 4. Choosing the model

### 4.1 Creating a function to compare both models

In [None]:
def compare_and_calculate_percentages(df_a, df_b,column):
    """Print how often one frame's column exceeds the other's, row by row.

    Rows are paired by position. A pair that is neither "A greater" nor
    "B greater" (equal values, or a comparison involving NaN) is counted
    as equal.

    Args:
        df_a: First DataFrame ("A" in the printed report).
        df_b: Second DataFrame ("B" in the printed report).
        column: Integer position of the column to compare in both frames.

    Raises:
        ValueError: If the frames differ in length or are empty.
    """
    if len(df_a) != len(df_b):
        raise ValueError("DataFrames must have the same number of rows.")

    total_rows = len(df_a)
    if total_rows == 0:
        # Percentages are undefined without rows; fail with a clear message
        # instead of a ZeroDivisionError further down.
        raise ValueError("DataFrames must not be empty.")

    # Compare plain arrays so rows are paired by position, not by index label.
    values_a = df_a.iloc[:, column].to_numpy()
    values_b = df_b.iloc[:, column].to_numpy()

    count_a_greater = int((values_a > values_b).sum())
    count_b_greater = int((values_a < values_b).sum())
    count_equal = total_rows - count_a_greater - count_b_greater

    percent_a_greater = (count_a_greater / total_rows) * 100
    percent_b_greater = (count_b_greater / total_rows) * 100
    percent_equal = (count_equal / total_rows) * 100

    print(f"Percentage of times A was greater than B: {percent_a_greater:.2f}%")
    print(f"Percentage of times B was greater than A: {percent_b_greater:.2f}%")
    print(f"Percentage of times A and B were equal: {percent_equal:.2f}%")

### 4.2 Comparing the models

In [None]:
# A = Auto ARIMA metrics, B = Simple Exponential Smoothing metrics.
# column=-1 is the last metrics column ('mape') and column=-2 is 'rmse'.
# Both are errors, so "greater" means the worse model for that row.
compare_and_calculate_percentages(df_metrics_auto_arima, df_metrics_simple_smoothing, column=-1)
print('-'*60)
compare_and_calculate_percentages(df_metrics_auto_arima, df_metrics_simple_smoothing, column=-2)

## 5. Final model

Since simple exponential smoothing yielded better error metrics than Auto ARIMA (SARIMA), I have decided to use it as the final model for predicting the series.

#### 5.1 Concat the stationary and non stationary

In [None]:
# Put the non-stationary and stationary forecasts side by side (column-wise concat).
final_pred_model = pd.concat([df_pred_simple_smoothing_non_stationary,df_pred_simple_smoothing_stationary], axis = 1)
final_pred_model

#### 5.2 Reorganizing the df

In [None]:
# Keep the first column in its current position.
first_column= final_pred_model.iloc[:, 0]

# Order every other column by its label.
remaining_columns= final_pred_model.iloc[:, 1:].reindex(sorted(final_pred_model.columns[1:]), axis=1)

# Reassemble: the untouched first column followed by the label-sorted rest.
df_result_pred = pd.concat([first_column, remaining_columns], axis=1)

df_result_pred 

#### 5.3 Merge train and test

In [None]:
# Append the predicted rows below the training rows (row-wise concat; columns are matched by label).
df_final = pd.concat([df_train,df_result_pred])

#### 5.4 Reindex and transpose

In [None]:
# Replace the datetime index with its year. The earlier bare
# ``df_final.reset_index()`` call discarded its result and has been removed.
df_final.index = df_final.index.year

# Move the year index into a regular column and give it the header used in the output file.
df_final = df_final.reset_index()
df_final.rename(columns ={'index':'Real GDP growth (Annual percent change)'}, inplace= True)

df_final

In [None]:
# Transpose so each former column becomes a row and each year a column; the former column labels go into
# a column named 'Real GDP growth (Annual percent change)' and values are rounded to two decimals.
df_final_transpose = df_final.set_index('Real GDP growth (Annual percent change)').transpose().reset_index(names=['Real GDP growth (Annual percent change)']).rename_axis('', axis=1).round(2)
df_final_transpose

## 6. Saving to csv

In [None]:
from pathlib import Path

# ``to_csv`` does not create missing directories, so make sure the target
# folder exists before writing.
output_path = Path('files/predicted.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_final_transpose.to_csv(output_path, index = False)