In [1]:
import yfinance as yf

stock = "NFLX"
ticker = yf.Ticker(stock)
data = ticker.history(start="2013-01-01", end="2023-12-31", interval="1mo")["Close"]

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import scipy as sp

In [3]:
data.tail()

Date
2023-08-01 00:00:00-04:00    433.679993
2023-09-01 00:00:00-04:00    377.600006
2023-10-01 00:00:00-04:00    411.690002
2023-11-01 00:00:00-04:00    473.970001
2023-12-01 00:00:00-05:00    486.880005
Name: Close, dtype: float64

In [4]:
training_mask = data.index > '2023-01-01'

In [5]:
test = data[training_mask]
train = data[~training_mask]
horizon = len(test)

In [6]:
test.head()

Date
2023-02-01 00:00:00-05:00    322.130005
2023-03-01 00:00:00-05:00    345.480011
2023-04-01 00:00:00-04:00    329.929993
2023-05-01 00:00:00-04:00    395.230011
2023-06-01 00:00:00-04:00    440.489990
Name: Close, dtype: float64

In [7]:
from statsforecast.models import SeasonalNaive

In [8]:
model_sn = SeasonalNaive(season_length=12)
model_sn = model_sn.fit(y=train.values)
y_hat_dict = model_sn.predict(h=horizon)

y_hat_dict['mean']

array([394.52, 374.59, 190.36, 197.44, 174.87, 224.9 , 223.56, 235.44,
       291.88, 305.53, 294.88], dtype=float32)

In [9]:
model_sn.predict_in_sample()

{'fitted': array([       nan,        nan,        nan,        nan,        nan,
               nan,        nan,        nan,        nan,        nan,
               nan,        nan,  23.605715,  26.86857 ,  27.04    ,
         30.867144,  32.32143 ,  30.155714,  34.925713,  40.55857 ,
         44.172855,  46.06857 ,  52.25714 ,  52.595715,  58.475716,
         63.66143 ,  50.29    ,  46.005714,  59.69    ,  62.942856,
         60.388573,  68.23428 ,  64.454285,  56.11    ,  49.512856,
         48.80143 ,  63.114285,  67.844284,  59.52714 ,  79.5     ,
         89.15143 ,  93.84857 , 114.31    , 115.03    , 103.26    ,
        108.38    , 123.33    , 114.38    ,  91.84    ,  93.41    ,
        102.23    ,  90.03    , 102.57    ,  91.48    ,  91.25    ,
         97.45    ,  98.55    , 124.87    , 117.      , 123.8     ,
        140.71    , 142.13    , 147.81    , 152.2     , 163.07    ,
        149.41    , 181.66    , 174.71    , 181.35    , 196.43    ,
        187.58    , 191.96    , 270.3 

In [10]:
residuals_train = train.values[12:] - model_sn.predict_in_sample()['fitted'][12:]

In [11]:
residuals_test = test.values - y_hat_dict['mean']

In [12]:
rmse_trian = np.sqrt(np.mean(residuals_train**2))
rmse_test = np.sqrt(np.mean(residuals_test**2))

In [13]:
tabla_modelos = pd.DataFrame(columns=['Modelo', 'RMSE Train', 'RMSE Test', 'Parametros'], data = [['Seasonal Naive', rmse_trian, rmse_test, 0]])
     

In [14]:
tabla_modelos

Unnamed: 0,Modelo,RMSE Train,RMSE Test,Parametros
0,Seasonal Naive,134.981988,171.89285,0


In [15]:
from statsmodels.tsa.seasonal import seasonal_decompose
result = seasonal_decompose(train, model='additive')
print(result.trend)
print(result.seasonal)
print(result.resid)
print(result.observed)

Date
2013-01-01 00:00:00-05:00   NaN
2013-02-01 00:00:00-05:00   NaN
2013-03-01 00:00:00-05:00   NaN
2013-04-01 00:00:00-04:00   NaN
2013-05-01 00:00:00-04:00   NaN
                             ..
2022-09-01 00:00:00-04:00   NaN
2022-10-01 00:00:00-04:00   NaN
2022-11-01 00:00:00-04:00   NaN
2022-12-01 00:00:00-05:00   NaN
2023-01-01 00:00:00-05:00   NaN
Name: trend, Length: 121, dtype: float64
Date
2013-01-01 00:00:00-05:00     1.049447
2013-02-01 00:00:00-05:00     4.918894
2013-03-01 00:00:00-05:00    -0.748118
2013-04-01 00:00:00-04:00   -14.894011
2013-05-01 00:00:00-04:00   -11.101724
                               ...    
2022-09-01 00:00:00-04:00     5.144389
2022-10-01 00:00:00-04:00     7.596010
2022-11-01 00:00:00-04:00     3.423345
2022-12-01 00:00:00-05:00     2.307501
2023-01-01 00:00:00-05:00     1.049447
Name: seasonal, Length: 121, dtype: float64
Date
2013-01-01 00:00:00-05:00   NaN
2013-02-01 00:00:00-05:00   NaN
2013-03-01 00:00:00-05:00   NaN
2013-04-01 00:00:00-04:

In [16]:
tendencia = result.trend

In [17]:
tendencia = tendencia.dropna()

In [18]:
tendencia.tail()

Date
2022-03-01 00:00:00-05:00    380.410002
2022-04-01 00:00:00-04:00    348.187918
2022-05-01 00:00:00-04:00    317.571251
2022-06-01 00:00:00-04:00    290.740833
2022-07-01 00:00:00-04:00    274.872499
Name: trend, dtype: float64

In [19]:
from statsforecast.models import RandomWalkWithDrift

In [20]:
model__stl_drift = RandomWalkWithDrift()
model__stl_drift = model__stl_drift.fit(y=tendencia.values)
y_hat_dict_trend = model__stl_drift.predict(h=horizon)

y_hat_dict_trend['mean']
len(model__stl_drift.predict_in_sample()["fitted"])

109

In [21]:
season = result.seasonal

In [22]:
model_sn = SeasonalNaive(season_length=12)
model_sn = model_sn.fit(y=season.values)
y_hat_dict_season = model_sn.predict(h=horizon)

y_hat_dict_season['mean']
season_train_predict = model_sn.predict_in_sample()["fitted"]

In [23]:
season_train_predict = season_train_predict[~np.isnan(season_train_predict)]

In [24]:
len(season_train_predict)

109

In [25]:
predict_stl = model__stl_drift.predict_in_sample()["fitted"] + season_train_predict

In [26]:
y_hat = y_hat_dict_trend['mean'] + y_hat_dict_season['mean']

In [27]:
predict_stl[0] = train.values[12]

In [28]:
residuals_train = train.values[12:] - predict_stl
residuals_test = test.values - y_hat
rmse_train = np.sqrt(np.mean(residuals_train**2))
rmse_test = np.sqrt(np.mean(residuals_test**2))

In [29]:
rmse_test, rmse_train

(126.16859449163962, 91.40422566270085)

In [30]:
tabla_modelos = pd.concat([tabla_modelos,
           pd.DataFrame(columns=['Modelo', 'RMSE Train', 'RMSE Test', 'Parametros'],
                        data = [['STL', rmse_train, rmse_test, 0]])])

In [31]:
stock = 'MXN=X'
ticker = yf.Ticker(stock)
usdmxn = ticker.history(start= '1993-01-01', end= '2023-12-31', interval='1mo')['Close']

usdmxn.tail()

Date
2023-08-01 00:00:00+01:00    16.757130
2023-09-01 00:00:00+01:00    17.403900
2023-10-01 00:00:00+01:00    18.048100
2023-11-01 00:00:00+00:00    17.282801
2023-12-01 00:00:00+00:00    16.956200
Name: Close, dtype: float64

In [32]:
data = data["2003":]

In [33]:
usdmxn.index = usdmxn.index.strftime('%Y-%m')
usdmxn.index = pd.to_datetime(usdmxn.index)
usdmxn.head()

Date
2003-12-01    11.191
2004-01-01    11.048
2004-02-01    11.055
2004-03-01    11.105
2004-04-01    11.352
Name: Close, dtype: float64

In [34]:
data = pd.DataFrame(data.values, columns=['Y'], index=data.index)
data.head()

Unnamed: 0_level_0,Y
Date,Unnamed: 1_level_1
2013-01-01 00:00:00-05:00,23.605715
2013-02-01 00:00:00-05:00,26.86857
2013-03-01 00:00:00-05:00,27.040001
2013-04-01 00:00:00-04:00,30.867144
2013-05-01 00:00:00-04:00,32.32143


In [35]:
data.head(24)

Unnamed: 0_level_0,Y
Date,Unnamed: 1_level_1
2013-01-01 00:00:00-05:00,23.605715
2013-02-01 00:00:00-05:00,26.86857
2013-03-01 00:00:00-05:00,27.040001
2013-04-01 00:00:00-04:00,30.867144
2013-05-01 00:00:00-04:00,32.32143
2013-06-01 00:00:00-04:00,30.155714
2013-07-01 00:00:00-04:00,34.925713
2013-08-01 00:00:00-04:00,40.558571
2013-09-01 00:00:00-04:00,44.172855
2013-10-01 00:00:00-04:00,46.068569


In [36]:
data['mes'] = data.index.month_name()
data.head()

Unnamed: 0_level_0,Y,mes
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-01 00:00:00-05:00,23.605715,January
2013-02-01 00:00:00-05:00,26.86857,February
2013-03-01 00:00:00-05:00,27.040001,March
2013-04-01 00:00:00-04:00,30.867144,April
2013-05-01 00:00:00-04:00,32.32143,May


In [37]:
data = pd.get_dummies(data, columns=['mes'],
                      prefix="", prefix_sep="",
                      drop_first=True, dtype=float)

In [38]:
fecha_2008 = pd.Series(data = [1],
                       index=pd.to_datetime(["2008-11-01"]),
                       name='2008_outlier')
fecha_2020 = pd.Series(data = [1],
                       index=pd.to_datetime(["2020-03-01"]),
                       name='2020_outlier')

In [39]:
data = data.fillna(0)

In [40]:
data['lag1'] = data['Y'].shift(1)
data['lag2'] = data['Y'].shift(2)
data['lag3'] = data['Y'].shift(3)
data['lag4'] = data['Y'].shift(4)
data['lag5'] = data['Y'].shift(5)
data['lag6'] = data['Y'].shift(6)
data['lag7'] = data['Y'].shift(7)
data['lag8'] = data['Y'].shift(8)
data['lag9'] = data['Y'].shift(9)
data['lag10'] = data['Y'].shift(10)
data['lag11'] = data['Y'].shift(11)
data['lag12'] = data['Y'].shift(12)    

In [41]:
data.columns

Index(['Y', 'August', 'December', 'February', 'January', 'July', 'June',
       'March', 'May', 'November', 'October', 'September', 'lag1', 'lag2',
       'lag3', 'lag4', 'lag5', 'lag6', 'lag7', 'lag8', 'lag9', 'lag10',
       'lag11', 'lag12'],
      dtype='object')

In [42]:
training_mask = data.index > '2023-01-01'

In [43]:
test = data[training_mask]
train = data[~training_mask]
horizon = len(test)

In [44]:
horizon

11

In [45]:
train = train["2004":]

In [46]:
from sklearn.linear_model import LinearRegression

In [48]:
train_cleaned = train.dropna()
y_train_rl = LinearRegression().fit(train_cleaned.drop(columns=['Y']),
                                    train_cleaned['Y']).predict(train_cleaned.drop(columns=['Y']))

In [49]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
X = train.drop(columns=['Y'])
y = train['Y']

X_imputed = imputer.fit_transform(X)

# Train model
y_train_rl = LinearRegression().fit(X_imputed, y).predict(X_imputed)

In [50]:
from sklearn.ensemble import HistGradientBoostingRegressor

X = train.drop(columns=['Y'])
y = train['Y']

y_train_rl = HistGradientBoostingRegressor().fit(X, y).predict(X)

In [53]:
test_cleaned = test.dropna()
y_test_rl = LinearRegression().fit(test_cleaned.drop(columns=['Y']),
                                    test_cleaned['Y']).predict(test_cleaned.drop(columns=['Y']))

In [54]:
imputer = SimpleImputer(strategy='mean')
X = test.drop(columns=['Y'])
y = test['Y']

X_imputed = imputer.fit_transform(X)

y_test_rl = LinearRegression().fit(X_imputed, y).predict(X_imputed)

In [56]:
X = test.drop(columns=['Y'])
y = test['Y']

y_test_rl = HistGradientBoostingRegressor().fit(X, y).predict(X)

In [57]:
residuals_test = test['Y'].values - y_test_rl
residuals_train = train['Y'].values - y_train_rl
rmse_train = np.sqrt(np.mean(residuals_train**2))
rmse_test = np.sqrt(np.mean(residuals_test**2))

In [58]:
rmse_test, rmse_train

(53.62638264627138, 21.149620590160087)

In [59]:
len(train.columns) + 1

25

In [60]:
tabla_modelos = pd.concat([tabla_modelos,
           pd.DataFrame(columns=['Modelo', 'RMSE Train', 'RMSE Test', 'Parametros'],
                        data = [['RL', rmse_train, rmse_test, 28]])])

In [61]:
from sklearn.linear_model import LassoCV

In [63]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV

# Separate features and target
X_train = train.drop(columns=['Y'])
y_train = train['Y']

# Impute missing values
imputer = SimpleImputer(strategy='mean')  # Replace NaNs with the mean of each column
X_train_imputed = imputer.fit_transform(X_train)

# Fit the LassoCV model
lasso_cv = LassoCV(cv=5, random_state=0)
lasso_cv.fit(X_train_imputed, y_train)

# Evaluate the model
lasso_score = lasso_cv.score(X_train_imputed, y_train)
print(f"LassoCV Score: {lasso_score}")

LassoCV Score: 0.953244043708771


In [64]:
# Drop rows with missing values
train_cleaned = train.dropna()

# Separate features and target
X_train_cleaned = train_cleaned.drop(columns=['Y'])
y_train_cleaned = train_cleaned['Y']

# Fit the LassoCV model
lasso_cv = LassoCV(cv=5, random_state=0)
lasso_cv.fit(X_train_cleaned, y_train_cleaned)

# Evaluate the model
lasso_score = lasso_cv.score(X_train_cleaned, y_train_cleaned)
print(f"LassoCV Score: {lasso_score}")

LassoCV Score: 0.9618581650631767


In [68]:
tabla_modelos

Unnamed: 0,Modelo,RMSE Train,RMSE Test,Parametros
0,Seasonal Naive,134.981988,171.89285,0
0,STL,91.404226,126.168594,0
0,RL,21.149621,53.626383,28
