In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [28]:
datos = pd.read_csv('cal_housing.csv')
datos.head()

Unnamed: 0,longitude,latitude,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [29]:
datos.shape
datos.info()
datos.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   longitude         20640 non-null  float64
 1   latitude          20640 non-null  float64
 2   housingMedianAge  20640 non-null  float64
 3   totalRooms        20640 non-null  float64
 4   totalBedrooms     20640 non-null  float64
 5   population        20640 non-null  float64
 6   households        20640 non-null  float64
 7   medianIncome      20640 non-null  float64
 8   medianHouseValue  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


Unnamed: 0,longitude,latitude,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.898014,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.247906,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,295.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [30]:
X = datos.drop('medianHouseValue',axis=1)
y= datos['medianHouseValue']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,shuffle=True, random_state=0)

## Linear Regression

In [31]:
#Linear Regression 
ols = LinearRegression()
ols.fit(X_train,y_train)
y_pred= ols.predict(X_test)

r2_score_lin=r2_score(y_test,y_pred)
mse_lin=mean_squared_error(y_test,y_pred)

lr=['Lineal', mse_lin, r2_score_lin]

print("R2_score: "+str(r2_score_lin))
print("Mse: "+str(mse_lin))

R2_score: 0.6277645980446469
Mse: 4853781771.947943


# Polynomial of 2nd grade

In [32]:
model = PolynomialFeatures(2)
entre = model.fit_transform(X_train)

ols_2 = LinearRegression()

ols_2.fit(entre,y_train)

y_pred= ols_2.predict(model.transform(X_test))

r2_score_lin2=r2_score(y_test,y_pred)
mse_lin2=mean_squared_error(y_test,y_pred)

p2= ['Polinomial de grado 2',mse_lin2,r2_score_lin2 ]

print("R2_score: "+str(r2_score_lin2))
print("Mse: "+str(mse_lin2))

R2_score: 0.6874007820650956
Mse: 4076152826.859839


# Polynomial of 2nd grade with standard scaler

In [33]:
std_scaler = StandardScaler()
poly = PolynomialFeatures(2)

#Primero se transforman los datos cuadráticos y luego se escalan XD
x_transform = poly.fit_transform(X_train)
x_transform = std_scaler.fit_transform(x_transform)

x_transform_t = poly.fit_transform(X_test)
x_transform_t = std_scaler.fit_transform(x_transform_t)

model = LinearRegression()
model.fit(x_transform,y_train)
y_pred = model.predict(x_transform_t)
mse = mean_squared_error(y_test,y_pred)
r2 = r2_score(y_test,y_pred)

p2_s=['Polinomial de grado 2 con escalamiento estándar', mse , r2]
print("R2_score: "+str(r2))
print("Mse: "+str(mse))

R2_score: 0.6846554790037427
Mse: 4111950340.711749


## Polynomial of 2nd grade with robust scaler

In [34]:
robust_scaler = RobustScaler()
poly = PolynomialFeatures(2)
x_transform_rbst = poly.fit_transform(X_train)
x_transform_rbst = robust_scaler.fit_transform(x_transform_rbst)

x_transform_t_rbst = poly.fit_transform(X_test)
x_transform_t_rbst = robust_scaler.fit_transform(x_transform_t_rbst)

model = LinearRegression()
model.fit(x_transform_rbst,y_train)
y_pred = model.predict(x_transform_t_rbst)
mse = mean_squared_error(y_test,y_pred)
r2 = r2_score(y_test,y_pred)

p2_r= ['Polinomial de grado 2 con escalamiento robusto', mse, r2]
print("R2_score: "+str(r2))
print("Mse: "+str(mse))


R2_score: 0.612907332494666
Mse: 5047513814.436843


# Polynomial of 3rd grade

In [40]:
poly_3=PolynomialFeatures(3)
x_transform = poly_3.fit_transform(X_train)
X_tran_t=poly_3.fit_transform(X_test)

model = LinearRegression()
model.fit(x_transform,y_train)

y_pred = model.predict(X_tran_t)

mse = mean_squared_error(y_test,y_pred)
r2= r2_score(y_test,y_pred)
p3=['Polinomnial de grado3 3', mse ,r2]
print("R2_score: "+str(r2))
print("Mse: "+str(mse))

R2_score: 0.6820135837271035
Mse: 4146399463.6855087


# Polynomial of 3rd grade with standard scaler

In [36]:
std_scaler_3 = StandardScaler()
poly_3 = PolynomialFeatures(3)

x_transform_3= poly_3.fit_transform(X_train)
x_transform_3 = std_scaler_3.fit_transform(x_transform_3)

x_transform_t3 = poly_3.fit_transform(X_test)
x_transform_t3 = std_scaler_3.fit_transform(x_transform_t3)

model = LinearRegression()
model.fit(x_transform_3,y_train)
y_pred = model.predict(x_transform_t3)
mse = mean_squared_error(y_test,y_pred)
r2 = r2_score(y_test,y_pred)

p3_s=['Polinomial de grado 3 con escalamiento estándar', mse ,r2]
print("R2_score: "+str(r2))
print("Mse: "+str(mse))

R2_score: 0.3203722151488695
Mse: 8862039818.059643


# Polynomial of 3rd grade with robust scaler

In [37]:
robust_scaler_3 = RobustScaler()
poly_3 = PolynomialFeatures(3)
x_transform_rbst_3 = poly_3.fit_transform(X_train)
x_transform_rbst_3 = robust_scaler_3.fit_transform(x_transform_rbst_3)

x_transform_t_rbst_3 = poly_3.fit_transform(X_test)
x_transform_t_rbst_3 = robust_scaler_3.fit_transform(x_transform_t_rbst_3)

model = LinearRegression()
model.fit(x_transform_rbst_3,y_train)
y_pred = model.predict(x_transform_t_rbst_3)
mse = mean_squared_error(y_test,y_pred)
r2 = r2_score(y_test,y_pred)

p3_r=['Polinomial de grado 3 con escalamiento robusto', mse,r2]
print("R2_score: "+str(r2))
print("Mse: "+str(mse))

R2_score: -664.9850968312896
Mse: 8684145318817.417


In [38]:
metricas=[lr,p2,p2_s,p2_r,p3,p3_s,p3_r]
metricas2=pd.DataFrame(metricas, columns=['Model','Mse','r2'])
metricas2

Unnamed: 0,Model,Mse,r2
0,Lineal,4853782000.0,0.627765
1,Polinomial de grado 2,4076153000.0,0.687401
2,Polinomial de grado 2 con escalamiento estándar,4111950000.0,0.684655
3,Polinomial de grado 2 con escalamiento robusto,5047514000.0,0.612907
4,Polinomnial de grado3 3,4146399000.0,0.682014
5,Polinomial de grado 3 con escalamiento estándar,8862040000.0,0.320372
6,Polinomial de grado 3 con escalamiento robusto,8684145000000.0,-664.985097
