# Regresión lineal y polinómica

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
#import glob

### Serie de tiempo a Dataset.
Esta función transforma una serie de tiempo en un dataset, como vimos en clase: toma N valores y el N+1 lo vuelve una salida esperada.

In [2]:
def transformarSerieADataset(serie, elementosPorMuestra):
    dataset = None
    salidasDataset = None
    for counter in range (len(serie)-elementosPorMuestra-1):        
        muestra = np.array([serie[counter:counter+elementosPorMuestra]])        
        salida = np.array([serie[counter+elementosPorMuestra]])
        if dataset is None:
            dataset = muestra
        else:
            dataset = np.append(dataset,muestra,axis = 0)
        if salidasDataset is None:
            salidasDataset = salida    
        else:        
            salidasDataset = np.append(salidasDataset,salida)
    return dataset, salidasDataset

Intento de reunir diferentes marcas, obtuvo malas predicciones

In [None]:
df_prices = glob.glob("*.csv")
df_list = []
for filename in sorted(df_prices):
    df_list.append(pd.read_csv(filename))
full_df = pd.concat(df_list)

full_df.to_csv('df_prices.csv')

In [14]:
df_timeseries = pd.read_csv('bmw.csv')
df_timeseries.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0
1,6 Series,2018,27000,Automatic,14827,Petrol,145,42.8,2.0
2,5 Series,2016,16000,Automatic,62794,Diesel,160,51.4,3.0
3,1 Series,2017,12750,Automatic,26676,Diesel,145,72.4,1.5
4,7 Series,2014,14500,Automatic,39554,Diesel,160,50.4,3.0


In [15]:
df_timeseries = df_timeseries[['year','price']].groupby('year').mean()
df_timeseries.head()

Unnamed: 0_level_0,price
year,Unnamed: 1_level_1
1996,5995.0
1997,3950.0
1998,3950.0
1999,3285.0
2000,1624.5


In [5]:
df_timeseries['price']
serie = df_timeseries['price'].to_numpy()

Modifica la cantidad de elementos por muestra para ver si mas datos te ayudan a predecir mejor.

In [6]:
X, Y = transformarSerieADataset(serie, elementosPorMuestra = 7)

Modifica la cantidad de datos de entrenamiento y prueba para ver si mejora tu predicción.

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.4, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(10, 7)
(7, 7)
(10,)
(7,)


### Entrenamiento y predicciones:
Prueba que tal sirve al regresor lineal

In [17]:
lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)

LinearRegression()

In [18]:
y_train_predict = lin_model.predict(X_train)
MSE = mean_squared_error(Y_train,y_train_predict)
print("Entrenamiento: MSE ="+str(MSE))

y_test_predict = lin_model.predict(X_test)
MSE = (mean_squared_error(Y_test, y_test_predict))
print("Pruebas: MSE ="+str(MSE))

df_predicciones = pd.DataFrame({'valor real':Y_test, 'prediccion':y_test_predict, 'diferencia':Y_test-y_test_predict})
df_predicciones = df_predicciones.reset_index(drop = True)
df_predicciones.head(10)

Entrenamiento: MSE =244455.93164111915
Pruebas: MSE =9535842.50905499


Unnamed: 0,valor real,prediccion,diferencia
0,6240.043478,8921.508231,-2681.464753
1,8213.166667,4869.288403,3343.878263
2,11118.355742,10971.139243,147.216499
3,5390.166667,10834.386071,-5444.219404
4,13323.598802,11454.765837,1868.832966
5,31025.864275,27127.4384,3898.425875
6,15199.753796,15366.078558,-166.324762


### Entrenamiento y predicciones:
Prueba que tal sirve al regresor polinomial

In [19]:
poly_model = LinearRegression()
poly = PolynomialFeatures(degree=3)

Xpolytrain = poly.fit_transform(X_train)
Xpolytest = poly.fit_transform(X_test)

poly_model.fit(Xpolytrain, Y_train)
y_train_predict = poly_model.predict(Xpolytrain)

MSE = mean_squared_error(Y_train,y_train_predict)
print("Entrenamiento: MSE ="+str(MSE))

y_test_predict = poly_model.predict(Xpolytest)
MSE = (mean_squared_error(Y_test, y_test_predict))
print("Pruebas: MSE ="+str(MSE))

df_predicciones = pd.DataFrame({'valor_real':Y_test, 'prediccion':y_test_predict, 'diferencia':Y_test-y_test_predict})
df_predicciones = df_predicciones.reset_index(drop = True)
df_predicciones.head(10)

Entrenamiento: MSE =5.65791538986271e-23
Pruebas: MSE =6018475.652487328


Unnamed: 0,valor_real,prediccion,diferencia
0,6240.043478,5077.165204,1162.878274
1,8213.166667,9829.956056,-1616.789389
2,11118.355742,9942.59762,1175.758122
3,5390.166667,9723.34958,-4333.182913
4,13323.598802,10385.57227,2938.026532
5,31025.864275,33117.19204,-2091.327765
6,15199.753796,12964.020648,2235.733148
