In [10]:
import pandas as pd
from reg_lin_sim_mod import RLS, plots, webAppRegSimple, linearization, webAppCorrSimple, webAppCorrMultiple, webAppRegMultiple, global_functions, RegMultiple
from plotly import graph_objects as go
from plotly import express as px
import math
import scipy.stats as stats
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Load the three CSV samples in one pass. `df` feeds the simple-regression cells and
# `df_ex_3` the multiple-regression cells; `df_mult` is not used in the cells shown here.
df, df_mult, df_ex_3 = (
    pd.read_csv(csv_name)
    for csv_name in ('ev_6_datos.csv', 'data_regresion_multiple.csv', 'datos_examen_3.csv')
)

In [11]:
# One select_model result per candidate model family, always in this order so that
# positions in `models` stay stable (index 2 is the 'Exponential' entry).
models = [
    webAppRegSimple.select_model(df, model_name)
    for model_name in ('Linear', 'Power', 'Exponential', 'Logaritmic', 'Reciprocal')
]

In [12]:
# Result of webAppRegSimple.best_model for the simple-regression sample;
# the following cells read its elements 0 to 3.
best_model = webAppRegSimple.best_model(df)
# Position in `models` of the model inspected in the cells below.
# Index 2 is the 'Exponential' entry of the list built above.
i = 2

In [13]:
# Text summary: name of the best model and its estimated equation.
print(best_model[0])

Best Model: Exponencial Model

Estimated Ecuation: y = 1.9525*e^(-0.0625*x)


In [14]:
# Display the second element of the best-model result (presumably a figure — TODO confirm).
best_model[1]

In [15]:
# Comparison table of all candidate models: estimated equation, R^2, performance level,
# residual variance, significance test and confidence intervals for b0 and b1.
best_model[2]

Unnamed: 0,Model,Estimated Ec.,R^2,Performance Level,Residuals Var,Significance Test,CI b0,CI b1
0,Linear,y = 1.8026 + -0.0717*x,94.69%,Very Good,0.0066,"160.6198 > 5.1174, Significant Regression",1.6678 < b0 < 1.9374,-0.0845 < b1 < -0.0589
1,Power,y = 2.7127*x^-0.439,93.60%,Very Good,0.0059,"131.5666 > 5.1174, Significant Regression",0.8076 < b0 < 1.1883,-0.5256 < b1 < -0.3524
2,Exponencial,y = 1.9525*e^(-0.0625*x),97.98%,Very Good,0.0019,"436.5287 > 5.1174, Significant Regression",0.5979 < b0 < 0.7404,-0.0693 < b1 < -0.0557
3,Logaritmic,y = 2.2207 + -0.5232*ln(x),97.49%,Very Good,0.0031,"349.6249 > 5.1174, Significant Regression",2.0815 < b0 < 2.3599,-0.5865 < b1 < -0.4599
4,Reciprocal,y = x/(1.2116*x--1.6278),63.58%,Moderate,0.0279,"15.7109 > 5.1174, Significant Regression",1.0314 < b0 < 1.3918,-2.5569 < b1 < -0.6988


In [16]:
# Display the fourth element of the best-model result (presumably a figure — TODO confirm).
best_model[3]

In [17]:
# Estimated equation of the selected model (models[i]).
print(f"Estimated Ecuation:\n{models[i]['estimated_ec']}")

Estimated Ecuation:
y = 1.9525*e^(-0.0625*x)


In [18]:
# Significance test of the regression for the selected model:
# the message compares two values and states the conclusion.
print(models[i]['significance_test'])

436.5287 > 5.1174, Significant Regression


In [19]:
# R^2 (as a percentage) and the qualitative performance level of the selected model.
# Both values are read from models[i] so the two printed lines always describe the same
# model; the performance level used to be read from the hard-coded models[2].
print(f"R^2 = {models[i]['r_square']*100:.2f}%\nPerformance Level: {models[i]['performance_level']}")

R^2 = 97.98%
Performance Level: Very Good


In [21]:
# Variance of the residuals of the selected model, shown with 4 decimals.
print(f"Residuals Variance = {models[i]['residuals_variance']:.4f}")

Residuals Variance = 0.0019


In [22]:
# Graph of the selected model.
models[i]['model_graph']

In [23]:
# Confidence intervals for the coefficients b0 and b1 of the selected model.
print(models[i]['coef_intervals'])

Coeficient Condidence Intervals:
0.5979 < b0 0.7404
-0.0693 < b1 -0.0557


In [24]:
# ANOVA table of the selected model (regression, residuals and total rows).
models[i]['anova_table']

Unnamed: 0,source_of_variation,degrees_of_freedom,sum_of_squares,median_square,F0
0,Regression,1,0.809926,0.809926,436.5287
1,Residuals,9,0.016698,0.001855,
2,Total,10,0.826624,,


In [25]:
## MODEL ASSUMPTIONS
# Constant variance: plot stored for the selected model.
models[i]['constant_variance_plot']

In [26]:
# Normality assumption: the residuals come from a normal distribution with mean 0.
# The message compares two values and states the conclusion.
print(models[i]['Normal_0_mean'])

0.1471 < 3.8415, Residuals come from a Normal distribution with mean 0


In [27]:
# Q-Q plot stored for the selected model (presumably of its residuals — TODO confirm).
models[i]['qqplot']

In [28]:
# UNCORRELATED DATA TEST
# The message compares two values and states the conclusion.
models[i]['incorrelation_test']

'2.416267254872919 > 1.32409, Incorrelated data.'

In [29]:
# ATYPICAL DATA
models[i]['atypical_data']

'There is not atypical data in the sample'

In [30]:
# Correlation plot for the simple-regression sample.
webAppCorrSimple.correlation_plot(df)

In [31]:
# Correlation coefficient of the sample together with a description of its strength and sign.
print(webAppCorrSimple.correlation(df))

Correlation: -0.9731085097890455
Very Strong Negative Correlation


In [32]:
# MULTIPLE REGRESSION
# Correlation matrix of all the variables in df_ex_3, shown as a table
webAppCorrMultiple.correlation_matrix_table(df_ex_3)

Unnamed: 0,Flujo total de calor,insolación,posición sur,posición norte,hora del día
Flujo total de calor,1.0,0.601731,0.146075,-0.865168,-0.263171
insolación,0.601731,1.0,-0.365896,-0.682359,-0.707204
posición sur,0.146075,-0.365896,1.0,0.238851,0.703914
posición norte,-0.865168,-0.682359,0.238851,1.0,0.564725
hora del día,-0.263171,-0.707204,0.703914,0.564725,1.0


In [33]:
# Correlation matrix of df_ex_3, shown as a heatmap
webAppCorrMultiple.correlation_matrix_heatmap(df_ex_3)

In [34]:
# Estimated equation of the multiple-regression model.
# The mult_reg_model call itself prints the numbered list of variables.
model = webAppRegMultiple.mult_reg_model(df_ex_3)
print(model['model_data']['est_ecuation'])

1. Flujo total de calor
2. insolación
3. posición sur
4. posición norte
5. hora del día
Ŷ = 411.24+ 0.07X₁+ 3.29X₂-21.76X₃+ 2.06X₄


In [35]:
# Variability of the model (R^2), shown as a percentage
print(f"Variability of the Model (R^2):\n{model['model_data']['r_square']*100:.2f}%")

Variability of the Model (R^2):
90.50%


In [36]:
# Performance of the model (adjusted R^2), shown as a percentage
print(f"Performance of the Model (R^2 Adjusted):\n{model['model_data']['r_square_adj']*100:.2f}%")

Performance of the Model (R^2 Adjusted):
87.96%


In [37]:
# Significance test for the regression
model['model_data']['significance_test']

'35.7086 > 3.0556, Significant Regression'

In [38]:
# MULTICOLLINEARITY AND COEFFICIENT SUMMARY
# Confidence intervals for the coefficients (b0, b1, ..., bk) and the VIF of each term.
# If a VIF is greater than 10, it is recommended to remove that variable from the model.
# No VIF is greater than 10 here, so all the variables of the sample are kept.
# There is no evidence of regression through the origin, so the model is not run again.
webAppRegMultiple.coef_summary(df_ex_3)

1. Flujo total de calor
2. insolación
3. posición sur
4. posición norte
5. hora del día


Unnamed: 0,Term,Coef,95% CI,VIF
0,const,411.243265,249.3931 < b0 < 573.0934,1.007998
1,insolación,0.072922,-0.0069 < b1 < 0.1527,2.681162
2,posición sur,3.291436,0.6732 < b2 < 5.9097,2.192262
3,posición norte,-21.758034,-27.7031 < b3 < -15.813,1.976813
4,hora del día,2.055357,-1.6232 < b4 < 5.734,3.896055


In [39]:
# ANOVA table for the multiple-regression model (regression, residuals and total rows)
model['model_data']['anova_table']

Unnamed: 0,source_of_variation,degrees_of_freedom,sum_of_squares,half_square,f0
0,Regression,4,7086.794458,1771.698615,35.708641
1,Residuals,15,744.231042,49.615403,
2,Total,19,7831.0255,,


In [40]:
# Top models: candidate subsets of predictors, grouped by number of variables,
# with the R sq and R sq (adj) of each subset
webAppRegMultiple.top_models(df_ex_3)

1. Flujo total de calor
2. insolación
3. posición sur
4. posición norte
5. hora del día


Unnamed: 0,Vars,insolación,posición sur,posición norte,hora del día,R sq,R sq (adj)
0,1,,,X,,74.8515,73.4544
1,1,X,,,,36.208,32.664
2,2,,X,X,,88.0455,86.639
3,2,,,X,X,82.3116,80.2306
4,3,X,X,X,,89.5978,87.6474
5,3,,X,X,X,88.0922,85.8594
6,4,X,X,X,X,90.4964,87.9621


In [41]:
# The model with all 4 variables is chosen as the best one for this sample,
# so that is the model run here.
# NOTE(review): this repeats the mult_reg_model(df_ex_3) call already stored in `model`
# and rebinds `best_model`, which earlier held the webAppRegSimple.best_model(df) result.
best_model = webAppRegMultiple.mult_reg_model(df_ex_3)

1. Flujo total de calor
2. insolación
3. posición sur
4. posición norte
5. hora del día


In [42]:
# NORMAL PROBABILITY PLOT
# Visual check that the residuals come from a normal distribution
best_model['qqplot']

In [43]:
# RESIDUALS COME FROM A NORMAL DISTRIBUTION WITH MEAN 0 AND THE VARIANCE OF THE RESIDUALS
# Numerical check that the model meets the normality and zero-mean assumption
best_model['normal_0_mean_assumption']

'0.1498 < 5.9915, Residuals come from a Normal distribution with mean 0'

In [44]:
# HISTOGRAM
# Another visual check that the residuals come from a normal distribution
best_model['normal_0_mean_hist']

In [45]:
# CONSTANT VARIANCE OF THE RESIDUALS
# VERSUS FITS PLOT
best_model['versus_fit']

In [46]:
# UNCORRELATED RESIDUALS
# VERSUS ORDER PLOT
best_model['versus_order']

In [47]:
# With the Durbin-Watson critical values the test is inconclusive.
# The versus order plot shows no pattern that gives evidence of correlation in the data,
# so the residuals are taken to be uncorrelated.
best_model['incorrel_res_assumption']

'0.99755 < 1.6646405414680454 < 1.67634, Inconclusive test.'

In [48]:
# ATYPICAL DATA
# Message stating whether the sample contains atypical observations
best_model['atypical_data']

'There is not atypical data in the sample'

In [326]:
import pandas as pd
import statsmodels.api as sm

# NOTE(review): this cell rebinds `df` and `model`, which earlier cells use for the CSV
# sample and for the webAppRegMultiple result.

# Hand-typed sample: one response column and four predictor columns, 20 observations each
data = {
    'Flujo total de calor': [
        271.8, 264, 238.8, 230.7, 251.6,
        257.9, 263.9, 266.5, 229.1, 239.3,
        258, 257.6, 267.3, 267, 259.6,
        240.4, 227.2, 196, 278.7, 272.3,
    ],
    'Insolación': [
        783.35, 784.45, 684.45, 827.8, 860.45,
        875.15, 909.45, 905.55, 756, 769.35,
        793.5, 801.65, 819.65, 808.55, 774.95,
        711.85, 694.85, 638.1, 774.55, 757.9,
    ],
    'Posición sur': [
        40.75, 36.19, 37.31, 32.52, 33.71,
        34.14, 34.85, 35.89, 33.53, 33.79,
        34.72, 35.22, 36.5, 37.6, 37.89,
        37.71, 37, 36.76, 34.62, 35.4,
    ],
    'Posición norte': [
        16.66, 16.46, 17.66, 17.5, 16.4,
        16.28, 16.06, 15.9, 16.6, 16.41,
        16.17, 15.92, 16.04, 16.19, 16.62,
        17.37, 18.12, 18.53, 15.54, 15.7,
    ],
    'Hora del día': [
        13.2, 14.11, 15.68, 10.53, 11,
        11.31, 11.96, 12.58, 10.66, 10.85,
        11.41, 11.91, 12.85, 13.58, 14.21,
        15.56, 15.83, 16.41, 13.1, 13.63,
    ],
}

# Tabulate the sample
df = pd.DataFrame(data)

# Design matrix: the four predictors plus a constant column for the intercept
predictors = ['Insolación', 'Posición sur', 'Posición norte', 'Hora del día']
X = sm.add_constant(df[predictors])

# Ordinary least squares fit of the response on the design matrix
model = sm.OLS(df['Flujo total de calor'], X).fit()

# Print the full OLS regression summary
print(model.summary())

# Standardized residuals: e_i / sqrt(MSE * (1 - h_ii)),
# where h_ii is the diagonal of the hat matrix
influence = model.get_influence()
leverage = influence.hat_matrix_diag
residuals = model.resid
residual_scale = (model.mse_resid * (1 - leverage))**0.5
std_residuals = residuals / residual_scale

# Keep the standardized residuals next to the data they belong to
df['Standardized Residuals'] = std_residuals

# Print only the standardized-residuals column
print(df['Standardized Residuals'])


                             OLS Regression Results                             
Dep. Variable:     Flujo total de calor   R-squared:                       0.905
Model:                              OLS   Adj. R-squared:                  0.880
Method:                   Least Squares   F-statistic:                     35.71
Date:                  Tue, 11 Jun 2024   Prob (F-statistic):           1.68e-07
Time:                          23:47:35   Log-Likelihood:                -64.545
No. Observations:                    20   AIC:                             139.1
Df Residuals:                        15   BIC:                             144.1
Df Model:                             4                                         
Covariance Type:              nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const            411.243