In [None]:
import numpy as np
import pandas as pd

from joblib import dump, load

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler



import scipy.stats as stats

In [None]:
# Notebook-wide pandas display settings: show every column, cap rendered rows at 50.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

In [None]:
# Load the raw vehicle dataset from the project's relative data folder.
df_vehiculos = pd.read_csv('./data/MotorAlpes_data.csv', sep=',', encoding = 'utf-8')

In [None]:
# (rows, columns) of the raw dataset.
df_vehiculos.shape

In [None]:
# Peek at five random rows (no seed is set, so the rows differ between runs).
df_vehiculos.sample(5)

## 2.  Entendimiento y limpieza de los datos

Una variable que siempre tiene que estar es `selling_price`, ya que la intención de este modelo es identificar las variables que impactan el precio de un vehículo usado, así como predecir el precio de un vehículo a partir de esas variables.

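Para esto se han seleccionado variables numéricas y variables categóricas (`transmission`, `fuel`, `seller_type`, `owner`), que se codifican como números más adelante.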

In [None]:
# Candidate predictors for selling_price: numeric columns first, then the
# categorical ones that get label-encoded in a later cell.
variables_interes = [
    'year',
    'km_driven',
    'seats',
    'mileage',
    'engine',
    'max_power',
    'transmission',
    'fuel',
    'seller_type',
    'owner',
]

# Work on a copy so the raw frame loaded from disk stays untouched.
df_vehiculos_t = df_vehiculos.copy()

In [None]:
# Fraction of missing values per column (target + predictors).
checked_columns = ['selling_price'] + variables_interes
n_rows = df_vehiculos_t.shape[0]
df_vehiculos_t[checked_columns].isnull().sum() / n_rows

In [None]:
# Replace each categorical column with integer codes. The same encoder object
# is re-fitted per column, and the classes learned for that column are printed
# right after each fit so the code -> label mapping can be read off the output.
encoder = LabelEncoder()
for categorical_col in ("owner", "seller_type", "fuel", "transmission"):
    df_vehiculos_t[categorical_col] = encoder.fit_transform(df_vehiculos_t[categorical_col])
    print(encoder.classes_)

df_vehiculos_t.sample(5)



## 2.1 Relaciones entre variables
A continuación vamos a visualizar la correlación entre selling price y las variables de interes.

In [None]:
# One scatter panel per predictor, each against selling_price on the y axis.
sns.pairplot(df_vehiculos_t, height = 4, y_vars = 'selling_price', x_vars = variables_interes, kind = 'scatter')
plt.show()

In [None]:
# Pairwise correlation between the predictors, on a fixed [-1, 1] colour scale.
# NOTE(review): selling_price is not part of this matrix, so the heatmap shows
# predictor-to-predictor correlation only.
plt.figure(figsize = (12, 10))
sns.heatmap(df_vehiculos_t[variables_interes].corr(), cmap = 'vlag', vmin = -1, vmax = 1)
plt.show()

Cambio de los valores nulos por la media de los datos por cada columna

In [None]:
# Separate copy for null imputation, so the encoded frame above is preserved.
df_vehiculos_copy = df_vehiculos_t.copy()

In [None]:
def fill_na_all(df, var_to_transform, target='selling_price'):
    """Return a copy of ``df`` with nulls replaced by each column's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the imputation is done on a copy.
    var_to_transform : iterable of str
        Columns to impute. They must be numeric, since ``mean()`` is used.
    target : str, default 'selling_price'
        Extra column that is always imputed as well (the regression target).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with nulls in ``var_to_transform`` and ``target``
        replaced by that column's mean. Other columns are left as they are.
    """
    filled = df.copy()
    # dict.fromkeys de-duplicates while keeping order, so the target is not
    # processed twice when it is also listed in var_to_transform.
    for column in dict.fromkeys([*var_to_transform, target]):
        filled[column] = filled[column].fillna(filled[column].mean())
    return filled
        

In [None]:
# Impute nulls in the predictors (and in selling_price) with the column means.
df_vehiculos_copy= fill_na_all(df_vehiculos_copy,variables_interes)

In [None]:
# A notebook cell only renders its last expression, so the sample has to be
# shown explicitly with display() (provided by the IPython kernel).
display(df_vehiculos_copy.sample(5))
df_vehiculos_copy[['selling_price'] + variables_interes].isnull().sum() / df_vehiculos_copy.shape[0]
# The ratios above show that the nulls have been replaced by the column means.

In [None]:
# Same predictor-vs-price scatter panels as before, now on the imputed frame.
sns.pairplot(df_vehiculos_copy, height = 4, y_vars = 'selling_price', x_vars = variables_interes, kind = 'scatter')
plt.show()

## 3.Particionamiento del conjunto de datos.
Separación datos x y y para entrenamiento y x y y para test.

In [None]:
# 70/30 split of the imputed data; random_state pins the partition so the
# metrics further down are reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_vehiculos_copy[variables_interes],
    df_vehiculos_copy['selling_price'],
    test_size=0.3,
    random_state=1,
)
# Only the last expression of a cell is rendered, so both partitions' shapes
# are reported together instead of on two separate lines.
{'train': (x_train.shape, y_train.shape), 'test': (x_test.shape, y_test.shape)}

In [None]:
# Load the separate test CSV and keep the same predictor columns.
# Note that X_test (capital X) is a different object from x_test produced by
# train_test_split above.
# NOTE(review): these columns come straight from the CSV; this cell applies
# neither the label encoding nor the null imputation used on the training data.
df_vehiculos_test_final = pd.read_csv('./data/MotorAlpes_test.csv', sep=',', encoding = 'utf-8')
X_test = df_vehiculos_test_final[variables_interes]
X_test.shape

## 4. Entrenamiento de un primer modelo
Para este primer modelo solo se van a reemplazar los valores nulos por la media de cada columna.

In [None]:
# First model: plain linear regression on the imputed training partition.
regression = LinearRegression()
regression.fit(x_train, y_train)

### 4.1 Interpretación de los coeficientes y del intercepto

Con base en los coeficientes del modelo, solo variables como `engine`, `max_power` y `km_driven` tienen un coeficiente de puridad aceptable.

In [None]:
# One panel per predictor: training points plus the line obtained from that
# predictor's coefficient and the model's (shared) intercept.
f, axs = plt.subplots(1, len(variables_interes), sharey = True, figsize = (20, 4))
intercept = regression.intercept_

for panel, feature, slope in zip(axs, variables_interes, regression.coef_):
    feature_values = x_train[feature]
    panel.plot(feature_values, y_train, 'o', alpha = 0.1)
    panel.plot(feature_values, feature_values * slope + intercept)
    panel.set_title(feature)
plt.show()

In [None]:
# Coefficient fitted for each predictor, in the order of variables_interes.
pd.DataFrame({'columns': variables_interes, 'coef': regression.coef_})

In [None]:
# Intercept term of the first model.
regression.intercept_

De lo anterior se puede interpretar que hay muchos valores atípicos que no permiten detectar una relación lineal entre el precio y alguna de las características.

## 4.2 Validación del modelo
### Mean Absolute Error


In [None]:
# Mean absolute error of the first model on both partitions.
for label, features, target in (('Train:', x_train, y_train), ('Test:', x_test, y_test)):
    print(label, mean_absolute_error(target, regression.predict(features)))

### Root Mean Squared Error (RMSE)

In [None]:
 print('Train:', np.sqrt(mean_squared_error(y_train, regression.predict(x_train))))
print('Test:', np.sqrt(mean_squared_error(y_test, regression.predict(x_test))))

In [None]:
# Horizontal box plot of the held-out prices; showmeans adds the mean marker.
plt.figure(figsize = (15, 3))
sns.boxplot(x = y_test, showmeans = True, orient = 'h')
# Raw string on purpose: in a normal literal '\t' is a TAB character and '\_'
# is an invalid escape sequence. \mathtt renders the column name in monospace.
plt.title(r'Valor real de $\mathtt{selling\_price}$')
plt.grid()
plt.show()

## 5. Entrenamiento 2do Modelo


In [None]:
def fill_outliers(df, var_to_transform):
    """Return a copy of ``df`` with ``selling_price`` outliers replaced.

    Outliers are detected with the 1.5 * IQR rule on ``selling_price``:
    prices above ``Q3 + 1.5 * IQR`` are replaced by Q3 and prices below
    ``Q1 - 1.5 * IQR`` are replaced by Q1. Only the ``selling_price`` column
    is changed; every other column keeps its original values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``selling_price`` column. Not modified.
    var_to_transform : iterable of str
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        New frame with the treated ``selling_price`` column.
    """
    price = df['selling_price']
    q_low = price.quantile(0.25)
    q_hi = price.quantile(0.75)
    iqr = q_hi - q_low
    # The upper fence lies ABOVE the third quartile; subtracting here would
    # flag most of the upper half of the data as outliers.
    upper_fence = q_hi + 1.5 * iqr
    lower_fence = q_low - 1.5 * iqr

    # Mask the price Series rather than the whole DataFrame: DataFrame.mask
    # would overwrite every column of the matching rows with the quartile.
    treated_price = price.mask(price > upper_fence, q_hi).mask(price < lower_fence, q_low)
    return df.assign(selling_price=treated_price)
        
 

In [None]:
# Outlier-treated version of the imputed frame (fill_outliers returns a new frame).
df_vehiculos_transformed=fill_outliers(df_vehiculos_copy,variables_interes)

In [None]:
# Predictor-vs-price scatter panels after the outlier treatment.
sns.pairplot(df_vehiculos_transformed, height = 4, y_vars = 'selling_price', x_vars = variables_interes, kind = 'scatter')
plt.show()

In [None]:
# Rebuild the training partition from the outlier-treated frame. Without this
# the second model would be fitted on exactly the same x_train / y_train as the
# first one and df_vehiculos_transformed would never be used.
# The same test_size and random_state as the first split select the same rows.
# x_test / y_test are intentionally left as they were, so the model is still
# evaluated against the untreated hold-out prices.
x_train, _x_holdout_unused, y_train, _y_holdout_unused = train_test_split(
    df_vehiculos_transformed[variables_interes],
    df_vehiculos_transformed['selling_price'],
    test_size=0.3,
    random_state=1,
)

regression_2 = LinearRegression()
regression_2.fit(x_train, y_train)

In [None]:
# One panel per predictor for the second model: training points plus the line
# given by that predictor's coefficient and the model's intercept.
f, axs = plt.subplots(1, len(variables_interes), sharey = True, figsize = (20, 4))
intercept = regression_2.intercept_

for panel, feature, slope in zip(axs, variables_interes, regression_2.coef_):
    feature_values = x_train[feature]
    panel.plot(feature_values, y_train, 'o', alpha = 0.1)
    panel.plot(feature_values, feature_values * slope + intercept)
    panel.set_title(feature)
plt.show()

In [None]:
# Intercept term of the second model.
regression_2.intercept_

In [None]:
# Coefficient fitted by the second model for each predictor.
pd.DataFrame({'columns': variables_interes, 'coef': regression_2.coef_})

### 5.2 Validación 2do Modelo


#### Mean Absolute Error


In [None]:
# Mean absolute error of the second model on both partitions.
for label, features, target in (('Train:', x_train, y_train), ('Test:', x_test, y_test)):
    print(label, mean_absolute_error(target, regression_2.predict(features)))

In [None]:
 print('Train:', np.sqrt(mean_squared_error(y_train, regression_2.predict(x_train))))
print('Test:', np.sqrt(mean_squared_error(y_test, regression_2.predict(x_test))))

In [None]:
# Horizontal box plot of the held-out prices; showmeans adds the mean marker.
plt.figure(figsize = (15, 3))
sns.boxplot(x = y_test, showmeans = True, orient = 'h')
# Raw string on purpose: in a normal literal '\t' is a TAB character and '\_'
# is an invalid escape sequence. \mathtt renders the column name in monospace.
plt.title(r'Valor real de $\mathtt{selling\_price}$')
plt.grid()
plt.show()

## 6. Entrenamiento 3er Modelo
Para este tercer modelo se plantea usar métodos de normalización con el fin de quitar el efecto de los valores atípicos del modelo.

In [None]:
data_tercer_modelo = df_vehiculos_copy.copy()
def normalize_cols(df, var_to_transform):
    """Return a copy of ``df`` with the requested columns min-max scaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Not modified.
    var_to_transform : iterable of str
        Numeric columns to rescale with ``MinMaxScaler``.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``df``. Only ``var_to_transform`` is scaled;
        all other columns (the target, identifiers, text columns) pass through
        unchanged instead of being scaled or making the scaler fail.
    """
    columns = list(var_to_transform)
    df_scaled = df.copy()
    # MinMaxScaler works column by column, so scaling just this subset gives
    # the same values for these columns as scaling the whole frame would.
    df_scaled[columns] = MinMaxScaler().fit_transform(df[columns])
    return df_scaled

        

In [None]:
# Min-max scaled version of the imputed frame, followed by a five-row preview.
norm_data=normalize_cols(data_tercer_modelo,variables_interes)
norm_data.sample(5)

In [None]:
# Features: every row of the scaled frame. Target: selling_price from the
# outlier-treated frame.
# This overwrites x_train / y_train from the earlier split, and no test
# partition is produced in this cell.
x_train, y_train  = norm_data[variables_interes], df_vehiculos_transformed['selling_price']
x_train.shape, y_train.shape

In [None]:
# Third model: linear regression on the scaled features; the cell shows its intercept.
regression_3 = LinearRegression()
regression_3.fit(x_train, y_train)
regression_3.intercept_

In [None]:
# One panel per predictor for the third model: training points plus the line
# given by that predictor's coefficient and the model's intercept.
f, axs = plt.subplots(1, len(variables_interes), sharey = True, figsize = (20, 4))
intercept = regression_3.intercept_

for panel, feature, slope in zip(axs, variables_interes, regression_3.coef_):
    feature_values = x_train[feature]
    panel.plot(feature_values, y_train, 'o', alpha = 0.1)
    panel.plot(feature_values, feature_values * slope + intercept)
    panel.set_title(feature)
plt.show()

In [None]:
# Coefficient fitted by the third model for each predictor.
pd.DataFrame({'columns': variables_interes, 'coef': regression_3.coef_})

## 7. Entrenamiento 4to Modelo

Para este cuarto modelo se va a aplicar estandarización (`StandardScaler`) dentro de un `Pipeline` antes de la regresión lineal.

In [None]:
# Fourth model: standardise the features, then fit a linear regression, as a
# single estimator. The step names are used later to reach the fitted model.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
])

In [None]:
# Fit on the current training data. The notebook only defines lowercase
# x_train; the previous `X_train` was never assigned and raised NameError.
pipeline.fit(x_train, y_train)

In [None]:
# Coefficients of the regression step inside the pipeline, one per predictor.
pd.DataFrame({'columns': variables_interes, 'coef': pipeline['model'].coef_})

In [None]:
# Intercept of the regression step inside the pipeline.
pipeline['model'].intercept_