# Regresión Polinómica

# Setup

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Regresión polinómica

¿Y si los datos son más complejos que una línea recta? Sorprendentemente, puede utilizar un modelo lineal para ajustar datos no lineales. 

Una forma sencilla de hacerlo es añadir potencias de cada característica como nuevas características y, a continuación, entrenar un modelo lineal en este conjunto ampliado de características. Esta técnica se denomina regresión polinómica.

[Ejemplo](https://ostwalprasad.github.io/machine-learning/Polynomial-Regression-using-statsmodel.html)

In [4]:
# Load the salary dataset from the CSV file kept under the local data/ folder.
salary_csv = "data/Salary_Data.csv"
salaries = pd.read_csv(salary_csv)

In [5]:
# Preview the first five rows of the table as loaded from disk.
salaries.head(n=5)

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [6]:
# Normalise the column labels: lower-case them and strip out every space
# (e.g. "Years of Experience" becomes "yearsofexperience").
salaries.columns = salaries.columns.str.lower().str.replace(" ", "", regex=False)
salaries.head(n=5)

Unnamed: 0,age,gender,educationlevel,jobtitle,yearsofexperience,salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [8]:
# List the distinct values of the gender column (missing values included).
gender_values = salaries['gender'].unique()
gender_values

array(['Male', 'Female', nan, 'Other'], dtype=object)

In [10]:
# Discard, in place, every row that has a missing value in any column.
salaries.dropna(axis=0, how="any", inplace=True)

In [13]:
# List the distinct education labels.
# NOTE(review): the saved output for this cell was produced after the label
# normalisation cell further down had already been executed (see the execution
# counts). On a fresh top-to-bottom run this cell shows the raw spellings
# such as "Bachelor's" instead.
education_values = salaries['educationlevel'].unique()
education_values

array(['Bachelor', 'Master', 'PhD', 'High School'], dtype=object)

In [11]:
# Encode gender as a numeric indicator: Male -> 0, Female -> 1.
# Values missing from the mapping (the data also contains 'Other') become NaN,
# so those rows disappear when dropna() is called again further down.
# NOTE(review): running this cell twice maps the already-encoded 0/1 values
# to NaN as well, so it must be executed exactly once per kernel session.
gender_codes = {"Male": 0, "Female": 1}
salaries['gender'] = salaries['gender'].map(gender_codes)

In [12]:
# Collapse the different spellings of each degree into one canonical label.
# Only exact matches of the whole cell value are replaced.
education_aliases = {
    "Bachelor's": "Bachelor",
    "Master's": "Master",
    "Bachelor's Degree": "Bachelor",
    "Master's Degree": "Master",
    "phD": "PhD",
}
salaries["educationlevel"] = salaries["educationlevel"].replace(education_aliases)

In [14]:
# One-hot encode the education level: one 0/1 integer column per distinct label.
dum = pd.get_dummies(data=salaries['educationlevel'], dtype=int)
dum.head(n=5)

Unnamed: 0,Bachelor,High School,Master,PhD
0,1,0,0,0
1,0,0,1,0
2,0,0,0,1
3,1,0,0,0
4,0,0,1,0


In [15]:
# Remove the 'High School' indicator in place, leaving Bachelor / Master / PhD.
# Presumably this makes 'High School' the reference level, so the remaining
# indicators are not perfectly collinear with the model intercept.
del dum['High School']

In [16]:
# Attach the education indicator columns to the main table, matching rows
# by index label on both sides (inner join, which is pd.merge's default).
salaries = pd.merge(
    left=salaries,
    right=dum,
    how="inner",
    left_index=True,
    right_index=True,
)

In [17]:
# Preview the table with the encoded gender and the education indicators.
salaries.head(n=5)

Unnamed: 0,age,gender,educationlevel,jobtitle,yearsofexperience,salary,Bachelor,Master,PhD
0,32.0,0.0,Bachelor,Software Engineer,5.0,90000.0,1,0,0
1,28.0,1.0,Master,Data Analyst,3.0,65000.0,0,1,0
2,45.0,0.0,PhD,Senior Manager,15.0,150000.0,0,0,1
3,36.0,1.0,Bachelor,Sales Associate,7.0,60000.0,1,0,0
4,52.0,0.0,Master,Director,20.0,200000.0,0,1,0


In [19]:
# Second clean-up pass: the gender encoding left NaN for every value other
# than 'Male' / 'Female', so this removes those rows before modelling.
salaries.dropna(axis=0, how="any", inplace=True)


In [20]:
# Build the model from explicit matrices, the way scikit-learn works
# ==============================================================================
# By convention the feature matrix is called X. There is usually a train set
# and a test set (commonly an 80/20 split), so X_train holds the training
# features. This example is small and simple enough that there is no X_test,
# but the name follows the convention.
predictors = ['age', 'gender', 'yearsofexperience', 'Bachelor', 'Master', 'PhD']

# The predictor matrix needs an extra constant column of 1s: its coefficient is
# the intercept (B0), the point where the fitted line crosses the Y axis.
# The same idea shows up in many other models (neural networks call it bias).
X_train = sm.add_constant(salaries[predictors], prepend=True)

# OLS stands for Ordinary Least Squares. Create the model and fit it, i.e.
# compute the best possible line according to the OLS criterion; `modelo`
# ends up holding the fitted results.
modelo = sm.OLS(endog=salaries['salary'], exog=X_train).fit()
print(modelo.summary())

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.718
Model:                            OLS   Adj. R-squared:                  0.717
Method:                 Least Squares   F-statistic:                     2829.
Date:                Wed, 08 May 2024   Prob (F-statistic):               0.00
Time:                        20:39:04   Log-Likelihood:                -77942.
No. Observations:                6684   AIC:                         1.559e+05
Df Residuals:                    6677   BIC:                         1.559e+05
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const              7.932e+04   3587.08

In [43]:
# Sanity check: average residual of the fitted model. The saved output is
# about -1e-9, i.e. zero up to floating-point error.
residual_mean = modelo.resid.mean()
residual_mean

-1.0723307202634236e-09

# Interacción entre variables

¿Y si queremos analizar la interacción entre variables? 
Ejemplo: ¿es lo mismo un año adicional de experiencia con un Grado vs Master vs PhD?

In [23]:
# Frame handed to the formula interface in the next cells. Despite the name,
# it also contains the target ('salary'), because the formula reads both
# sides of `~` from one DataFrame. It overwrites the earlier X_train (the
# design matrix with the constant column).
formula_columns = ['salary', 'age', 'gender', 'yearsofexperience', 'Bachelor', 'Master', 'PhD']
X_train = salaries[formula_columns]

In [25]:
# Same linear model as before, written with the formula interface.
# The summary below lists an "Intercept" term although no constant column
# was added by hand here. `model` ends up holding the fitted results.
formula = "salary ~ age + gender + yearsofexperience + Bachelor + Master + PhD"
model = smf.ols(formula=formula, data=X_train).fit()

In [26]:
# Print the regression report for the formula-based linear model.
linear_report = model.summary()
print(linear_report)

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.718
Model:                            OLS   Adj. R-squared:                  0.717
Method:                 Least Squares   F-statistic:                     2829.
Date:                Wed, 08 May 2024   Prob (F-statistic):               0.00
Time:                        20:56:18   Log-Likelihood:                -77942.
No. Observations:                6684   AIC:                         1.559e+05
Df Residuals:                    6677   BIC:                         1.559e+05
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept          7.932e+04   3587.08

In [28]:
# Polynomial regression: add a squared age term to the linear formula.
# I(...) makes the formula parser evaluate age**2 as plain Python arithmetic
# instead of reading ** as a formula operator.
# `model` ends up holding the fitted results.
formula = "salary ~ age + I(age**2)  + gender + yearsofexperience + Bachelor + Master + PhD"
model = smf.ols(formula=formula, data=X_train).fit()

In [29]:
# Print the regression report for the model with the quadratic age term.
quadratic_report = model.summary()
print(quadratic_report)

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.726
Model:                            OLS   Adj. R-squared:                  0.725
Method:                 Least Squares   F-statistic:                     2523.
Date:                Wed, 08 May 2024   Prob (F-statistic):               0.00
Time:                        20:59:54   Log-Likelihood:                -77846.
No. Observations:                6684   AIC:                         1.557e+05
Df Residuals:                    6676   BIC:                         1.558e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept         -1.587e+04   7692.91