# Regresión Polinómica

# Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Regresión polinómica

¿Y si los datos son más complejos que una línea recta? Sorprendentemente, puede utilizar un modelo lineal para ajustar datos no lineales. 

Una forma sencilla de hacerlo es añadir potencias de cada característica como nuevas características y, a continuación, entrenar un modelo lineal en este conjunto ampliado de características. Esta técnica se denomina regresión polinómica.

[Ejemplo](https://ostwalprasad.github.io/machine-learning/Polynomial-Regression-using-statsmodel.html)

In [3]:
# Load the raw salary dataset from the project-relative data folder.
salaries = pd.read_csv(filepath_or_buffer="data/Salary_Data.csv")

In [4]:
# Preview the first rows of the freshly loaded table.
salaries.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [5]:
# Normalise column names: lower-case them and strip spaces so they can be
# used as plain identifiers in model formulas later on.
salaries.columns = (
    salaries.columns
    .str.lower()
    .str.replace(" ", "", regex=False)
)

In [6]:
# Check that the column names are now lower-case and without spaces.
salaries.head()

Unnamed: 0,age,gender,educationlevel,jobtitle,yearsofexperience,salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [13]:
# List the distinct gender labels present in the data before encoding them.
salaries["gender"].unique()

array(['Male', 'Female', 'Other'], dtype=object)

In [11]:
# Remove every row that has a missing value in any column; the frame is
# modified in place, so re-running earlier display cells will show fewer rows.
salaries.dropna(inplace=True)

In [34]:
# Summarise dtypes and non-null counts per column.
# NOTE(review): the saved output for this cell has execution count In [34] and
# already lists the Bachelor/Master/PhD dummy columns and a float `gender`,
# which are only produced by cells further down. It appears to be stale for
# this position; confirm with Restart Kernel -> Run All.
salaries.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6698 entries, 0 to 6703
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                6698 non-null   float64
 1   gender             6684 non-null   float64
 2   educationlevel     6698 non-null   object 
 3   jobtitle           6698 non-null   object 
 4   yearsofexperience  6698 non-null   float64
 5   salary             6698 non-null   float64
 6   Bachelor           6698 non-null   int32  
 7   Master             6698 non-null   int32  
 8   PhD                6698 non-null   int32  
dtypes: float64(4), int32(3), object(2)
memory usage: 444.8+ KB


In [14]:
# Binary-encode gender. Labels missing from the mapping (the data also
# contains 'Other') become NaN and are dropped in a later cell.
# This cell is not idempotent: running it twice maps the 0/1 codes to NaN.
GENDER_CODES = {"Male": 0, "Female": 1}
salaries["gender"] = salaries["gender"].map(GENDER_CODES)

In [16]:
# List the distinct education labels to spot spelling variants that need merging.
salaries["educationlevel"].unique()

array(["Bachelor's", "Master's", 'PhD', "Bachelor's Degree",
       "Master's Degree", 'High School', 'phD'], dtype=object)

In [20]:
# Collapse the spelling variants of each degree into one canonical label.
# "High School" and "PhD" are already canonical and are left untouched.
salaries["educationlevel"] = salaries["educationlevel"].replace(
    {
        "Bachelor's": "Bachelor",
        "Master's": "Master",
        "Bachelor's Degree": "Bachelor",
        "Master's Degree": "Master",
        "phD": "PhD",
    }
)


In [26]:
# One-hot encode the education level as integer indicator columns,
# one column per distinct label, then preview the result.
dum = pd.get_dummies(data=salaries.loc[:, "educationlevel"], dtype=int)
dum.head(n=5)

Unnamed: 0,Bachelor,High School,Master,PhD
0,1,0,0,0
1,0,0,1,0
2,0,0,0,1
3,1,0,0,0
4,0,0,1,0


In [28]:
# Drop one indicator so "High School" acts as the reference category;
# keeping all levels alongside an intercept would be perfectly collinear.
dum = dum.drop(columns=["High School"])

In [29]:
# Display the remaining indicator columns (Bachelor, Master, PhD).
dum

Unnamed: 0,Bachelor,Master,PhD
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0
4,0,1,0
...,...,...,...
6699,0,0,1
6700,0,0,0
6701,1,0,0
6702,0,1,0


In [30]:
# Display the cleaned frame before the indicator columns are attached.
salaries

Unnamed: 0,age,gender,educationlevel,jobtitle,yearsofexperience,salary
0,32.0,0.0,Bachelor,Software Engineer,5.0,90000.0
1,28.0,1.0,Master,Data Analyst,3.0,65000.0
2,45.0,0.0,PhD,Senior Manager,15.0,150000.0
3,36.0,1.0,Bachelor,Sales Associate,7.0,60000.0
4,52.0,0.0,Master,Director,20.0,200000.0
...,...,...,...,...,...,...
6699,49.0,1.0,PhD,Director of Marketing,20.0,200000.0
6700,32.0,0.0,High School,Sales Associate,3.0,50000.0
6701,30.0,1.0,Bachelor,Financial Manager,4.0,55000.0
6702,46.0,0.0,Master,Marketing Manager,14.0,140000.0


In [31]:
# Attach the education indicators to the main frame by joining on the row
# index (inner join, the default for merge).
# Running this cell a second time would merge the indicators again and yield
# suffixed duplicate columns, so execute it once per kernel session.
salaries = salaries.merge(dum, left_index=True, right_index=True)

In [32]:
# Check that the Bachelor/Master/PhD indicator columns were appended.
salaries.head()

Unnamed: 0,age,gender,educationlevel,jobtitle,yearsofexperience,salary,Bachelor,Master,PhD
0,32.0,0.0,Bachelor,Software Engineer,5.0,90000.0,1,0,0
1,28.0,1.0,Master,Data Analyst,3.0,65000.0,0,1,0
2,45.0,0.0,PhD,Senior Manager,15.0,150000.0,0,0,1
3,36.0,1.0,Bachelor,Sales Associate,7.0,60000.0,1,0,0
4,52.0,0.0,Master,Director,20.0,200000.0,0,1,0


In [35]:
# Gender 'Other' was not covered by the encoding and is therefore NaN.
# Remove those rows (dropna without arguments drops rows with a NaN in any column).
salaries.dropna(inplace=True)

In [36]:
# Build the model from explicit matrices, scikit-learn style
# ==============================================================================
# By convention the feature matrix is called X. Normally there is both a train
# and a test set (an 80/20 split is typical), so X_train holds the training
# features. This example is simple enough that no X_test is created here.
#
# The predictor matrix needs an extra column of 1s: it is the constant that
# lets the model estimate the intercept (B0), i.e. where the fitted line
# crosses the Y axis. The same idea shows up in many other models (in neural
# networks it is called the bias).
X_train = sm.add_constant(
    salaries[["age", "gender", "yearsofexperience", "Bachelor", "Master", "PhD"]],
    prepend=True,
)

# OLS stands for Ordinary Least Squares. Create the model and fit it in one
# step, i.e. compute the best linear fit under the least-squares criterion.
modelo = sm.OLS(endog=salaries["salary"], exog=X_train).fit()

print(modelo.summary())

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.718
Model:                            OLS   Adj. R-squared:                  0.717
Method:                 Least Squares   F-statistic:                     2829.
Date:                Wed, 08 May 2024   Prob (F-statistic):               0.00
Time:                        20:38:59   Log-Likelihood:                -77942.
No. Observations:                6684   AIC:                         1.559e+05
Df Residuals:                    6677   BIC:                         1.559e+05
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const              7.932e+04   3587.08

In [43]:
# Sanity check: with an intercept in the model the OLS residuals should
# average to zero, up to floating-point error.
modelo.resid.mean()

-1.0723307202634236e-09

# Interacción entre variables

¿Y si queremos analizar la interacción entre variables?
Ejemplo: ¿tiene el mismo efecto sobre el salario un año adicional de experiencia con un Grado, un Máster o un PhD?

In [47]:
# The formula API reads the response and the predictors from a single frame,
# so despite its name X_train now also contains the target column `salary`.
X_train = salaries.loc[
    :,
    ["salary", "age", "gender", "yearsofexperience", "Bachelor", "Master", "PhD"],
]

In [48]:
# Same specification as the matrix-based fit, expressed with the formula
# API; the intercept is added automatically by the formula.
# Build and fit in a single expression so `model` only ever holds the fitted result.
model = smf.ols(
    formula="salary ~ age + gender + yearsofexperience + Bachelor + Master + PhD",
    data=X_train,
).fit()

In [49]:
# Print the regression table for the formula-based baseline model.
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.718
Model:                            OLS   Adj. R-squared:                  0.717
Method:                 Least Squares   F-statistic:                     2829.
Date:                Wed, 08 May 2024   Prob (F-statistic):               0.00
Time:                        20:55:27   Log-Likelihood:                -77942.
No. Observations:                6684   AIC:                         1.559e+05
Df Residuals:                    6677   BIC:                         1.559e+05
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept          7.932e+04   3587.08

In [64]:
# Extend the baseline with a quadratic experience term and with interactions
# between experience and gender / each education indicator.
# I(...) makes the formula parser evaluate the enclosed arithmetic as plain
# Python instead of interpreting `*` and `**` as formula operators.
#
# Changes from the previous version of this cell:
#   * removed the stray doubled operator in "age + + gender" (it was only
#     accepted because the second "+" parsed as a unary plus);
#   * the formula is laid out one group of terms per line, in the same
#     order, so the coefficient table keeps the same rows and labels;
#   * `model` is no longer reused for the formula string and the unfitted
#     model before finally holding the fitted result.
model = smf.ols(
    formula=(
        "salary ~ age + gender + yearsofexperience"
        " + I(yearsofexperience ** 2)"
        " + I(yearsofexperience * gender)"
        " + I(yearsofexperience * Bachelor)"
        " + I(yearsofexperience * Master)"
        " + I(yearsofexperience * PhD)"
        " + Bachelor + Master + PhD"
    ),
    data=X_train,
).fit()

In [65]:
# Print the regression table for the model with polynomial and interaction terms.
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.773
Model:                            OLS   Adj. R-squared:                  0.772
Method:                 Least Squares   F-statistic:                     2064.
Date:                Wed, 08 May 2024   Prob (F-statistic):               0.00
Time:                        21:17:29   Log-Likelihood:                -77215.
No. Observations:                6684   AIC:                         1.545e+05
Df Residuals:                    6672   BIC:                         1.545e+05
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept 