<a href="https://colab.research.google.com/github/OctavioGMoran/IAyAA/blob/main/AnalisisBombish.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant

# Load the raw panel from the Excel workbook (path is relative to the
# notebook's working directory).
file_path = 'BD_Reg.xlsx'
data = pd.read_excel(file_path)

# Log-transform the numeric columns. The selection is positional: every column
# from the third one up to, but not including, the last one.
# NOTE(review): this is meant to skip 'Entidad', 'Año' and 'Sum_Correo_NSS' --
# confirm that the workbook's column order really matches that intent.
# log1p(x) == log(1 + x): zeros map to 0 instead of -inf, and the result stays
# accurate for values close to zero.
numeric_vars = data.columns[2:-1]
data[numeric_vars] = np.log1p(data[numeric_vars])

# Mark the two panel identifiers as categorical.
# NOTE(review): only 'Entidad' is expanded into dummies below; 'Año' is used
# purely as an index level, so no year effects enter the regression.
data['Año'] = pd.Categorical(data['Año'])
data['Entidad'] = pd.Categorical(data['Entidad'])

# Regressors of interest (includes Total_NRP).
variables_of_interest = ['L_2', 'Total_NRP', 'Sum_CorreoNRP', 'Sum_InfEmpresa', 'Sum_Otros', 'Sum_Reconocimiento', 'Sum_Correo_NSS']

# Build the (Entidad, Año)-indexed panel with the regressors, the dependent
# variable 'Total_Cred' and an intercept column added by statsmodels.
panel_data = data.set_index(['Entidad', 'Año'])
panel_data = panel_data[variables_of_interest + ['Total_Cred']]
panel_data = panel_data.astype(float)
panel_data = add_constant(panel_data)

# One dummy column per entity (entity fixed effects).
# dtype=float is requested explicitly: pandas >= 2.0 emits boolean dummies by
# default, which would turn the mixed design matrix into an object array that
# statsmodels refuses to fit.
entity_dummies = pd.get_dummies(
    panel_data.index.get_level_values('Entidad'),
    dtype=float,
)

# reset_index() gives the panel the same 0..n-1 row labels as the dummies, so
# the column-wise concat lines the rows up one-to-one.
panel_data_fe = pd.concat([panel_data.reset_index(), entity_dummies], axis=1)

# Drop the first entity dummy: together with the intercept, a full set of
# dummies would be perfectly collinear (dummy variable trap).
panel_data_fe = panel_data_fe.drop(entity_dummies.columns[0], axis=1)

# Fit the fixed-effects model by OLS: 'Total_Cred' on the intercept, the
# regressors of interest and the remaining entity dummies.
fe_model_ols = OLS(panel_data_fe['Total_Cred'], panel_data_fe.drop(['Total_Cred', 'Entidad', 'Año'], axis=1))
fe_results_ols = fe_model_ols.fit()

# Display the results
print(fe_results_ols.summary())


                            OLS Regression Results                            
Dep. Variable:             Total_Cred   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                     2329.
Date:                Sat, 03 Feb 2024   Prob (F-statistic):           5.79e-37
Time:                        00:46:12   Log-Likelihood:                 166.78
No. Observations:                  64   AIC:                            -255.6
Df Residuals:                      25   BIC:                            -171.4
Df Model:                          38                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  -5.1043    

In [None]:
# Keep only the regressors whose p-value is below 5%.
# The selection reads `fe_results_ols` (the fixed-effects OLS fit from the
# previous cell) instead of `wls_results`: the WLS fit is only created further
# down the notebook, so referencing it here raises NameError on a fresh
# top-to-bottom run.
significant_vars = fe_results_ols.pvalues[fe_results_ols.pvalues < 0.05].index.tolist()

# Fail with a clear message instead of handing OLS an empty design matrix.
if not significant_vars:
    raise ValueError("No regressor is significant at the 5% level; nothing to refit.")

# Design matrix restricted to the significant columns. The intercept is kept
# only if 'const' itself passed the filter.
X_simplified = panel_data_fe[significant_vars]

# Refit 'Total_Cred' on the reduced set of regressors.
simplified_model = OLS(panel_data_fe['Total_Cred'], X_simplified)
simplified_results = simplified_model.fit()

# Display the results of the simplified model
print(simplified_results.summary())


                            OLS Regression Results                            
Dep. Variable:             Total_Cred   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                     2638.
Date:                Sat, 03 Feb 2024   Prob (F-statistic):           2.38e-40
Time:                        00:59:38   Log-Likelihood:                 166.58
No. Observations:                  64   AIC:                            -259.2
Df Residuals:                      27   BIC:                            -179.3
Df Model:                          36                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  -4.8601    

In [None]:
from sklearn.linear_model import LassoCV

# Target and feature matrix for the penalised regressions: everything in the
# fixed-effects frame except the target and the two panel identifiers.
y = panel_data_fe['Total_Cred']
X = panel_data_fe.drop(columns=['Total_Cred', 'Entidad', 'Año'])

# Lasso whose penalty strength is picked by 5-fold cross-validation; the seed
# and iteration cap are fixed so the run is repeatable.
lasso_cv = LassoCV(cv=5, random_state=42, max_iter=10000)
lasso_cv.fit(X, y)

# Label each fitted coefficient with the name of its feature column.
lasso_coefficients = pd.Series(lasso_cv.coef_, index=X.columns)

# Full coefficient vector first, then only the features Lasso kept.
print("Lasso coefficients:\n", lasso_coefficients)
print(
    "\nSelected features by Lasso (non-zero coefficients):\n",
    lasso_coefficients.loc[lasso_coefficients != 0],
)


Lasso coefficients:
 const                  0.000000
L_2                    0.357634
Total_NRP              0.000000
Sum_CorreoNRP         -0.000000
Sum_InfEmpresa        -0.000000
Sum_Otros             -0.000000
Sum_Reconocimiento    -0.000000
Sum_Correo_NSS         0.000297
Baja California        0.000000
Baja California Sur   -0.000000
Campeche              -0.000000
Chiapas               -0.000000
Chihuahua              0.000000
Ciudad de México       0.000000
Coahuila               0.000000
Colima                -0.000000
Durango                0.000000
Estado de México       0.000000
Guanajuato             0.000000
Guerrero              -0.000000
Hidalgo               -0.000000
Jalisco                0.000000
Michoacán             -0.000000
Morelos               -0.000000
Nayarit               -0.000000
Nuevo León             0.000000
Oaxaca                -0.000000
Puebla                 0.000000
Querétaro              0.000000
Quintana Roo          -0.000000
San Luis Potosí    

In [None]:
from statsmodels.regression.linear_model import WLS

# Observation weights: the reciprocal of each observation's own squared
# residual from the fixed-effects OLS fit.
# NOTE(review): observations that OLS happened to fit almost exactly receive
# very large weights under this scheme -- confirm that this is the intended
# variance model before relying on the standard errors.
weights = 1 / fe_results_ols.resid.pow(2)

# Weighted least squares on the same target and regressors as the OLS model.
wls_model = WLS(
    endog=panel_data_fe['Total_Cred'],
    exog=panel_data_fe.drop(columns=['Total_Cred', 'Entidad', 'Año']),
    weights=weights,
)
wls_results = wls_model.fit()

# Print the regression table.
print(wls_results.summary())


                            WLS Regression Results                            
Dep. Variable:             Total_Cred   R-squared:                       1.000
Model:                            WLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 4.605e+05
Date:                Sat, 03 Feb 2024   Prob (F-statistic):           1.16e-65
Time:                        00:50:48   Log-Likelihood:                 229.91
No. Observations:                  64   AIC:                            -381.8
Df Residuals:                      25   BIC:                            -297.6
Df Model:                          38                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  -5.0248    

In [None]:
from sklearn.linear_model import RidgeCV

# Ridge regression on the X / y built for the Lasso fit; the penalty is chosen
# from 13 log-spaced candidates between 1e-6 and 1e6.
ridge_cv = RidgeCV(alphas=np.logspace(-6, 6, num=13))
ridge_cv.fit(X, y)

# Label each fitted coefficient with the name of its feature column.
ridge_coefficients = pd.Series(data=ridge_cv.coef_, index=X.columns)

# Show the full coefficient vector.
print("Ridge coefficients:\n", ridge_coefficients)


Ridge coefficients:
 const                  0.000000
L_2                    0.614092
Total_NRP              1.177690
Sum_CorreoNRP          0.017959
Sum_InfEmpresa        -0.004849
Sum_Otros             -0.003940
Sum_Reconocimiento    -0.009687
Sum_Correo_NSS        -0.000037
Baja California       -0.520746
Baja California Sur    0.473688
Campeche               1.036298
Chiapas                0.737486
Chihuahua             -0.462928
Ciudad de México      -1.625098
Coahuila              -0.512760
Colima                 0.251951
Durango                0.442624
Estado de México      -0.996486
Guanajuato            -0.738014
Guerrero               0.808982
Hidalgo                0.357289
Jalisco               -1.416353
Michoacán             -0.200204
Morelos                0.829241
Nayarit                0.783703
Nuevo León            -1.386252
Oaxaca                 0.935608
Puebla                -0.327071
Querétaro             -0.444719
Quintana Roo          -0.242673
San Luis Potosí    

In [None]:
from sklearn.linear_model import LassoCV

# NOTE(review): this cell repeats the Lasso fit that already appears earlier in
# the notebook, with identical inputs and settings.

# Rebuild the feature matrix (all columns but the target and the two panel
# identifiers) and the target vector.
X = panel_data_fe.drop(labels=['Total_Cred', 'Entidad', 'Año'], axis='columns')
y = panel_data_fe['Total_Cred']

# Cross-validated Lasso: 5 folds, fixed seed, generous iteration budget.
lasso_cv = LassoCV(
    cv=5,
    random_state=42,
    max_iter=10000,
).fit(X, y)

# Pair every coefficient with its column name.
lasso_coefficients = pd.Series(lasso_cv.coef_, index=X.columns)

# Report all coefficients, then only those that were not shrunk to zero.
print("Lasso coefficients:\n", lasso_coefficients)
nonzero_mask = lasso_coefficients != 0
print("\nSelected features by Lasso (non-zero coefficients):\n", lasso_coefficients[nonzero_mask])


Lasso coefficients:
 const                  0.000000
L_2                    0.357634
Total_NRP              0.000000
Sum_CorreoNRP         -0.000000
Sum_InfEmpresa        -0.000000
Sum_Otros             -0.000000
Sum_Reconocimiento    -0.000000
Sum_Correo_NSS         0.000297
Baja California        0.000000
Baja California Sur   -0.000000
Campeche              -0.000000
Chiapas               -0.000000
Chihuahua              0.000000
Ciudad de México       0.000000
Coahuila               0.000000
Colima                -0.000000
Durango                0.000000
Estado de México       0.000000
Guanajuato             0.000000
Guerrero              -0.000000
Hidalgo               -0.000000
Jalisco                0.000000
Michoacán             -0.000000
Morelos               -0.000000
Nayarit               -0.000000
Nuevo León             0.000000
Oaxaca                -0.000000
Puebla                 0.000000
Querétaro              0.000000
Quintana Roo          -0.000000
San Luis Potosí    

In [None]:
from sklearn.linear_model import ElasticNetCV

# ElasticNetCV with cross-validated selection of BOTH alpha and l1_ratio.
# A list of l1_ratio candidates has to be passed for the mixing parameter to be
# tuned; with the default (a single 0.5) only alpha is searched. The grid puts
# more candidates near 1 (Lasso-like) than near 0 (Ridge-like).
elastic_net_cv = ElasticNetCV(
    l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
    cv=5,
    random_state=42,
    max_iter=10000,
).fit(X, y)

# Label each fitted coefficient with the name of its feature column.
elastic_net_coefficients = pd.Series(elastic_net_cv.coef_, index=X.columns)

# Full coefficient vector first, then only the features that were kept.
print("Elastic Net coefficients:\n", elastic_net_coefficients)
print("\nSelected features by Elastic Net (non-zero coefficients):\n", elastic_net_coefficients[elastic_net_coefficients != 0])


Elastic Net coefficients:
 const                  0.000000
L_2                    0.236244
Total_NRP              0.000000
Sum_CorreoNRP         -0.000000
Sum_InfEmpresa        -0.000000
Sum_Otros             -0.000000
Sum_Reconocimiento    -0.000000
Sum_Correo_NSS         0.000357
Baja California        0.000000
Baja California Sur   -0.000000
Campeche              -0.000000
Chiapas               -0.000000
Chihuahua              0.000000
Ciudad de México       0.000000
Coahuila               0.000000
Colima                -0.000000
Durango                0.000000
Estado de México       0.000000
Guanajuato             0.000000
Guerrero              -0.000000
Hidalgo               -0.000000
Jalisco                0.000000
Michoacán             -0.000000
Morelos               -0.000000
Nayarit               -0.000000
Nuevo León             0.000000
Oaxaca                -0.000000
Puebla                 0.000000
Querétaro              0.000000
Quintana Roo          -0.000000
San Luis Poto