In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [2]:
csv_path = os.path.join('..','datasets','raw','data.csv')
df = pd.read_csv(csv_path)
df.sample(10)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
4006,GMC,Envoy,2007,regular unleaded,291.0,6.0,AUTOMATIC,four wheel drive,4.0,,Midsize,4dr SUV,20,14,549,32960
10875,Acura,TSX,2012,premium unleaded (recommended),201.0,4.0,AUTOMATIC,front wheel drive,4.0,Luxury,Midsize,Sedan,31,22,204,30010
7783,Infiniti,Q50,2016,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Luxury,Performance",Midsize,Sedan,27,19,190,41900
863,Saab,9-5,2009,premium unleaded (recommended),260.0,4.0,AUTOMATIC,front wheel drive,4.0,"Luxury,Performance",Midsize,Wagon,27,17,376,40765
3922,Buick,Enclave,2015,regular unleaded,288.0,6.0,AUTOMATIC,front wheel drive,4.0,Crossover,Midsize,4dr SUV,24,17,155,43450
2399,Volkswagen,CC,2015,premium unleaded (recommended),200.0,4.0,AUTOMATED_MANUAL,front wheel drive,4.0,Performance,Midsize,Sedan,31,22,873,35375
1950,Volkswagen,Beetle,2017,regular unleaded,170.0,4.0,AUTOMATIC,front wheel drive,2.0,Hatchback,Compact,2dr Hatchback,33,24,873,19995
7373,Mitsubishi,Outlander Sport,2017,regular unleaded,148.0,4.0,AUTOMATIC,four wheel drive,4.0,Crossover,Compact,4dr SUV,29,23,436,22495
2306,GMC,Canyon,2016,regular unleaded,305.0,6.0,AUTOMATIC,rear wheel drive,4.0,Diesel,Compact,Crew Cab Pickup,26,18,549,30765
1568,Oldsmobile,Aurora,2002,regular unleaded,215.0,6.0,AUTOMATIC,front wheel drive,4.0,,Large,Sedan,25,16,26,31615


In [3]:
len(df)

11914

In [4]:
df.dtypes

Make                  object
Model                 object
Year                   int64
Engine Fuel Type      object
Engine HP            float64
Engine Cylinders     float64
Transmission Type     object
Driven_Wheels         object
Number of Doors      float64
Market Category       object
Vehicle Size          object
Vehicle Style         object
highway MPG            int64
city mpg               int64
Popularity             int64
MSRP                   int64
dtype: object

In [5]:
#my_variables = ['Make','Model','Year','Engine HP','Engine Cylinders','Transmission Type','Vehicle Style','highway MPG','city mpg','MSRP']
cars = df.copy()

# Lowercase the column names and replace spaces with underscores
cars.columns = cars.columns.str.lower().str.replace(' ','_')
# Lista de la variables categóricas
categorical = list(cars.dtypes[cars.dtypes == 'object'].index)
# Lista de las variables numéricas
numerical = list(cars.dtypes[cars.dtypes != 'object'].index)


for feature in categorical:
    cars[feature] = cars[feature].str.lower().str.replace(' ', '_')
    
cars.sample(5)

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
4836,subaru,forester,2015,regular_unleaded,170.0,4.0,manual,all_wheel_drive,4.0,crossover,midsize,4dr_suv,29,22,640,24595
1900,volkswagen,beetle,2015,premium_unleaded_(recommended),210.0,4.0,automated_manual,front_wheel_drive,2.0,"hatchback,performance",compact,2dr_hatchback,30,24,873,26550
3958,buick,encore,2017,regular_unleaded,138.0,4.0,automatic,all_wheel_drive,4.0,crossover,compact,4dr_suv,30,24,155,28365
7561,rolls-royce,phantom,2015,premium_unleaded_(required),453.0,12.0,automatic,rear_wheel_drive,4.0,"exotic,luxury,performance",large,sedan,19,11,86,407400
8246,land_rover,range_rover_evoque,2015,premium_unleaded_(recommended),240.0,4.0,automatic,all_wheel_drive,4.0,"crossover,luxury",compact,4dr_suv,30,21,258,41100


In [6]:
# Valores pérdidos
cars.isnull().sum()

make                    0
model                   0
year                    0
engine_fuel_type        3
engine_hp              69
engine_cylinders       30
transmission_type       0
driven_wheels           0
number_of_doors         6
market_category      3742
vehicle_size            0
vehicle_style           0
highway_mpg             0
city_mpg                0
popularity              0
msrp                    0
dtype: int64

In [7]:
# Relleno valores pérdidos con 0
cars[['engine_hp','engine_cylinders']] = cars[['engine_hp','engine_cylinders']].fillna(value=0)

categorical.remove('market_category')

In [8]:
categorical

['make',
 'model',
 'engine_fuel_type',
 'transmission_type',
 'driven_wheels',
 'vehicle_size',
 'vehicle_style']

In [9]:
# Creación de una nueva variables objetivo

# Valor de la media de msrp
msrp_mean = cars['msrp'].mean()

# Precio por debajo de la media
cars['above_average'] = (cars['msrp'] <= msrp_mean).astype(int)

# Distribución
cars['above_average'].value_counts()

above_average
1    8645
0    3269
Name: count, dtype: int64

In [10]:
# Cantidad de valores únicos por variable(categórica)
cars[categorical].nunique()

make                  48
model                914
engine_fuel_type      10
transmission_type      5
driven_wheels          4
vehicle_size           3
vehicle_style         16
dtype: int64

In [11]:
df_train_full, df_test = train_test_split(cars, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=1)

y_train = df_train.msrp.values
y_val = df_val.msrp.values

del df_train['msrp']
del df_val['msrp']

In [12]:
# Lista los valores únicos de las variables categóricas
for feature in categorical:
    if df_train_full[feature].nunique() <= 16:
        print(feature)
        print(f"{df_train_full[feature].unique()} \n")
        

engine_fuel_type
['premium_unleaded_(required)' 'flex-fuel_(unleaded/e85)'
 'premium_unleaded_(recommended)' 'regular_unleaded' 'diesel'
 'flex-fuel_(premium_unleaded_required/e85)' 'electric'
 'flex-fuel_(premium_unleaded_recommended/e85)'
 'flex-fuel_(unleaded/natural_gas)' nan 'natural_gas'] 

transmission_type
['automatic' 'manual' 'automated_manual' 'direct_drive' 'unknown'] 

driven_wheels
['rear_wheel_drive' 'all_wheel_drive' 'front_wheel_drive'
 'four_wheel_drive'] 

vehicle_size
['compact' 'large' 'midsize'] 

vehicle_style
['coupe' 'extended_cab_pickup' 'convertible' '4dr_hatchback' '4dr_suv'
 'sedan' 'passenger_minivan' '2dr_suv' 'wagon' 'passenger_van'
 'crew_cab_pickup' 'regular_cab_pickup' 'convertible_suv' '2dr_hatchback'
 'cargo_minivan' 'cargo_van'] 



In [13]:
# Entradas con valor 'unknown' en variable 'trasnmission_type'
df_train_full[df_train_full.transmission_type == "unknown"]

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp,above_average
6158,gmc,jimmy,1999,regular_unleaded,190.0,6.0,unknown,rear_wheel_drive,2.0,,compact,2dr_suv,19,14,549,2182,1
8053,dodge,ram_150,1991,regular_unleaded,125.0,6.0,unknown,rear_wheel_drive,2.0,,large,regular_cab_pickup,17,12,1851,2000,1
6174,gmc,jimmy,2000,regular_unleaded,190.0,6.0,unknown,four_wheel_drive,2.0,,compact,2dr_suv,18,14,549,2578,1
4691,pontiac,firebird,2000,regular_unleaded,305.0,8.0,unknown,rear_wheel_drive,2.0,"hatchback,performance",midsize,2dr_hatchback,23,15,210,6175,1
8048,dodge,ram_150,1991,regular_unleaded,125.0,6.0,unknown,rear_wheel_drive,2.0,,large,regular_cab_pickup,17,12,1851,2000,1
8047,dodge,ram_150,1991,regular_unleaded,125.0,6.0,unknown,rear_wheel_drive,2.0,,large,regular_cab_pickup,17,12,1851,2000,1
6160,gmc,jimmy,1999,regular_unleaded,190.0,6.0,unknown,four_wheel_drive,2.0,,compact,2dr_suv,19,14,549,2317,1
8046,dodge,ram_150,1991,regular_unleaded,125.0,6.0,unknown,rear_wheel_drive,2.0,,large,regular_cab_pickup,17,12,1851,2000,1
8051,dodge,ram_150,1991,regular_unleaded,125.0,6.0,unknown,rear_wheel_drive,2.0,,large,regular_cab_pickup,17,12,1851,2000,1
6368,chrysler,le_baron,1993,regular_unleaded,100.0,4.0,unknown,front_wheel_drive,2.0,,compact,convertible,24,18,1013,2000,1


## Feature Importance

### Mutual Information

In [14]:
from sklearn.metrics import mutual_info_score

def calculate_mi(series):
    """
    Calcula el puntaje de información mutua de las variables categóricas 
    con respecto a 'above_average'
    """
    return mutual_info_score(series, df_train_full.above_average)


df_train_full.dropna(inplace=True)
mi_scores = df_train_full[categorical].apply(calculate_mi)
mi_scores.sort_values(ascending=False)

model                0.513520
make                 0.253498
engine_fuel_type     0.132644
driven_wheels        0.108291
vehicle_style        0.107374
vehicle_size         0.039284
transmission_type    0.010569
dtype: float64

### Risk Ratio

In [15]:
global_mean = df_train_full.above_average.mean()
global_mean

0.628804180756225

In [16]:
manual_transmission = df_train_full[df_train_full.transmission_type == 'manual'].above_average.mean()
automatic_transmission = df_train_full[df_train_full.transmission_type == 'automatic'].above_average.mean()
manual_transmission, automatic_transmission

(0.7520058351568198, 0.6046157195732637)

In [17]:
compact_size = df_train_full[df_train_full.vehicle_size == 'compact'].above_average.mean()
mid_size = df_train_full[df_train_full.vehicle_size == 'midsize'].above_average.mean()
large_size = df_train_full[df_train_full.vehicle_size == 'large'].above_average.mean()
compact_size

0.7739059967585089

### Correlation Coefficient

In [18]:
df_train_full[numerical].corrwith(df_train_full.above_average).sort_values(ascending=True)

engine_hp          -0.633918
engine_cylinders   -0.468945
msrp               -0.447016
year               -0.252045
popularity          0.003677
number_of_doors     0.089007
highway_mpg         0.176721
city_mpg            0.193675
dtype: float64

In [19]:
df_train_full[numerical].corrwith(df_train_full.msrp).sort_values(ascending=True)

highway_mpg        -0.208097
city_mpg           -0.195316
number_of_doors    -0.182817
popularity         -0.052330
year                0.130847
engine_cylinders    0.570702
engine_hp           0.641806
msrp                1.000000
dtype: float64

## Fearure Engineering

In [28]:
# Millas por galón promedio
df_train_full['mpg_avg'] = df_train_full['highway_mpg'] + df_train_full['city_mpg'] / 2

# Reemplaza 'unknown' por 'manual' en variable 'transmission_type'
df_train_full['transmission_type'] = df_train_full.transmission_type.str.replace('unknown', 'manual')

categorical_features = ['make', 'model', 'transmission_type',]

numerical_features = ['year', 'engine_hp', 'engine_cylinders']


In [30]:
# Importa la clase Dict vectorizer para convertir datos categóricos en formato dicciionario a formato numérico
dv = DictVectorizer(sparse=False)

# Convierte el conjunto de entrenamiento en un formato de lista de diccionarios
train_dict = df_train[categorical_features + numerical_features].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# Crea un modelo de regresión logística
#model = LogisticRegression(solver='liblinear')

# Entrena el modelo de regresión logística
#model.fit(X_train, y_train)

#val_dict = df_val.to_dict(orient='records')
#X_val = dv.transform(val_dict)

#y_pred = model.predict_proba(X_val)[:, 1]
#y_pred

X_train

array([[6.000e+00, 3.480e+02, 0.000e+00, ..., 1.000e+00, 0.000e+00,
        2.013e+03],
       [6.000e+00, 2.750e+02, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        2.017e+03],
       [4.000e+00, 1.400e+02, 0.000e+00, ..., 1.000e+00, 0.000e+00,
        1.992e+03],
       ...,
       [4.000e+00, 1.300e+02, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        2.017e+03],
       [5.000e+00, 1.620e+02, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.991e+03],
       [6.000e+00, 2.400e+02, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        2.003e+03]])

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

prob = (y_pred >= .5).astype(int)
acc = accuracy_score(y_val, prob).round(1)
prec = precision_score(y_val, prob).round(1)
rec = recall_score(y_val, prob).round(1)

print(acc)
print(prec)
print(rec)


In [None]:
thresholds = np.linspace(0, 1, 101)

accuracies = []
for t in thresholds:
    prob = y_pred >= t
    acc = accuracy_score(y_val, prob >= t)
    accuracies.append(acc)
    

plt.plot(thresholds, accuracies)

### Precision and Recall

In [None]:
from sklearn.metrics import precision_score, recall_score, precision_recall_curve

In [None]:
precisions, recalls, thresholds = precision_recall_curve(y_val, y_pred)

def plot_precision_recall_vs_thresholds(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recalls")


plot_precision_recall_vs_thresholds(precisions, recalls, thresholds)
plt.show()

In [None]:

plt.plot(thresholds, precisions[:-1], "b--", label="Precision")

In [None]:
plt.plot(recalls, precisions, "g-")
plt.title("Precision vs Recall")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.show()