In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [2]:
csv_path = os.path.join('..','datasets','raw','data.csv')
df = pd.read_csv(csv_path)
df.sample(10)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
6676,Chevrolet,Malibu,2015,regular unleaded,196.0,4.0,AUTOMATIC,front wheel drive,4.0,,Midsize,Sedan,36,25,1385,23610
866,Saab,9-5,2011,premium unleaded (recommended),220.0,4.0,MANUAL,front wheel drive,4.0,Luxury,Large,Sedan,33,20,376,38525
10759,Chevrolet,Traverse,2017,regular unleaded,281.0,6.0,AUTOMATIC,front wheel drive,4.0,Crossover,Midsize,4dr SUV,22,15,1385,31300
9395,GMC,Sierra 1500,2017,flex-fuel (unleaded/E85),285.0,6.0,AUTOMATIC,rear wheel drive,4.0,Flex Fuel,Large,Crew Cab Pickup,24,18,549,40025
6310,Buick,LaCrosse,2016,flex-fuel (unleaded/E85),304.0,6.0,AUTOMATIC,front wheel drive,4.0,Flex Fuel,Large,Sedan,28,18,155,31065
4994,Nissan,Frontier,2016,regular unleaded,261.0,6.0,AUTOMATIC,rear wheel drive,4.0,,Compact,Crew Cab Pickup,22,16,2009,31710
2606,Honda,Civic,2017,regular unleaded,158.0,4.0,MANUAL,front wheel drive,2.0,,Compact,Coupe,39,28,2202,19150
7077,Ford,Mustang,2015,premium unleaded (recommended),310.0,4.0,MANUAL,rear wheel drive,2.0,High-Performance,Midsize,Coupe,31,22,5657,29300
2837,Plymouth,Colt,1994,regular unleaded,92.0,4.0,MANUAL,front wheel drive,2.0,,Compact,Coupe,35,28,535,2000
14,BMW,1 Series,2013,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,19,3916,31500


In [3]:
len(df)

11914

In [4]:
df.dtypes

Make                  object
Model                 object
Year                   int64
Engine Fuel Type      object
Engine HP            float64
Engine Cylinders     float64
Transmission Type     object
Driven_Wheels         object
Number of Doors      float64
Market Category       object
Vehicle Size          object
Vehicle Style         object
highway MPG            int64
city mpg               int64
Popularity             int64
MSRP                   int64
dtype: object

In [5]:
#my_variables = ['Make','Model','Year','Engine HP','Engine Cylinders','Transmission Type','Vehicle Style','highway MPG','city mpg','MSRP']
cars = df.copy()

# Lowercase the column names and replace spaces with underscores
cars.columns = cars.columns.str.lower().str.replace(' ','_')
# Lista de la variables categóricas
categorical = list(cars.dtypes[cars.dtypes == 'object'].index)
# Lista de las variables numéricas
numerical = list(cars.dtypes[cars.dtypes != 'object'].index)


for feature in categorical:
    cars[feature] = cars[feature].str.lower().str.replace(' ', '_')
    
cars.sample(5)

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
9243,toyota,sienna,2017,regular_unleaded,296.0,6.0,automatic,front_wheel_drive,4.0,,large,passenger_minivan,27,19,2031,32540
4640,ford,f-250,1999,regular_unleaded,220.0,8.0,automatic,rear_wheel_drive,4.0,,large,extended_cab_pickup,17,12,5657,3752
4953,nissan,frontier,2015,regular_unleaded,261.0,6.0,manual,four_wheel_drive,4.0,,compact,extended_cab_pickup,21,16,2009,27240
4504,ford,f-150,2015,flex-fuel_(unleaded/e85),282.0,6.0,automatic,rear_wheel_drive,4.0,flex_fuel,large,crew_cab_pickup,25,18,5657,32545
9412,gmc,sierra_classic_1500,1999,regular_unleaded,200.0,8.0,automatic,four_wheel_drive,3.0,,large,extended_cab_pickup,16,12,549,4827


In [6]:
# Valores pérdidos
cars.isnull().sum()

make                    0
model                   0
year                    0
engine_fuel_type        3
engine_hp              69
engine_cylinders       30
transmission_type       0
driven_wheels           0
number_of_doors         6
market_category      3742
vehicle_size            0
vehicle_style           0
highway_mpg             0
city_mpg                0
popularity              0
msrp                    0
dtype: int64

In [7]:
# Relleno valores pérdidos con 0
cars[['engine_hp','engine_cylinders']] = cars[['engine_hp','engine_cylinders']].fillna(value=0)

categorical.remove('market_category')

In [8]:
categorical

['make',
 'model',
 'engine_fuel_type',
 'transmission_type',
 'driven_wheels',
 'vehicle_size',
 'vehicle_style']

In [9]:
# Creación de una nueva variables objetivo

# Valor de la media de msrp
msrp_mean = cars['msrp'].mean()

# Precio por debajo de la media
cars['above_average'] = (cars['msrp'] <= msrp_mean).astype(int)

# Distribución
cars['above_average'].value_counts()

above_average
1    8645
0    3269
Name: count, dtype: int64

In [10]:
# Cantidad de valores únicos por variable(categórica)
cars[categorical].nunique()

make                  48
model                914
engine_fuel_type      10
transmission_type      5
driven_wheels          4
vehicle_size           3
vehicle_style         16
dtype: int64

In [11]:
df_train_full, df_test = train_test_split(cars, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=1)

In [12]:
# Lista los valores únicos de las variables categóricas
for feature in categorical:
    if df_train_full[feature].nunique() <= 16:
        print(feature)
        print(f"{df_train_full[feature].unique()} \n")
        

engine_fuel_type
['premium_unleaded_(required)' 'flex-fuel_(unleaded/e85)'
 'premium_unleaded_(recommended)' 'regular_unleaded' 'diesel'
 'flex-fuel_(premium_unleaded_required/e85)' 'electric'
 'flex-fuel_(premium_unleaded_recommended/e85)'
 'flex-fuel_(unleaded/natural_gas)' nan 'natural_gas'] 

transmission_type
['automatic' 'manual' 'automated_manual' 'direct_drive' 'unknown'] 

driven_wheels
['rear_wheel_drive' 'all_wheel_drive' 'front_wheel_drive'
 'four_wheel_drive'] 

vehicle_size
['compact' 'large' 'midsize'] 

vehicle_style
['coupe' 'extended_cab_pickup' 'convertible' '4dr_hatchback' '4dr_suv'
 'sedan' 'passenger_minivan' '2dr_suv' 'wagon' 'passenger_van'
 'crew_cab_pickup' 'regular_cab_pickup' 'convertible_suv' '2dr_hatchback'
 'cargo_minivan' 'cargo_van'] 



In [13]:
# Entradas con valor 'unknown' en variable 'trasnmission_type'
df_train_full[df_train_full.transmission_type == "unknown"]

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp,above_average
6158,gmc,jimmy,1999,regular_unleaded,190.0,6.0,unknown,rear_wheel_drive,2.0,,compact,2dr_suv,19,14,549,2182,1
8053,dodge,ram_150,1991,regular_unleaded,125.0,6.0,unknown,rear_wheel_drive,2.0,,large,regular_cab_pickup,17,12,1851,2000,1
6174,gmc,jimmy,2000,regular_unleaded,190.0,6.0,unknown,four_wheel_drive,2.0,,compact,2dr_suv,18,14,549,2578,1
4691,pontiac,firebird,2000,regular_unleaded,305.0,8.0,unknown,rear_wheel_drive,2.0,"hatchback,performance",midsize,2dr_hatchback,23,15,210,6175,1
8048,dodge,ram_150,1991,regular_unleaded,125.0,6.0,unknown,rear_wheel_drive,2.0,,large,regular_cab_pickup,17,12,1851,2000,1
8047,dodge,ram_150,1991,regular_unleaded,125.0,6.0,unknown,rear_wheel_drive,2.0,,large,regular_cab_pickup,17,12,1851,2000,1
6160,gmc,jimmy,1999,regular_unleaded,190.0,6.0,unknown,four_wheel_drive,2.0,,compact,2dr_suv,19,14,549,2317,1
8046,dodge,ram_150,1991,regular_unleaded,125.0,6.0,unknown,rear_wheel_drive,2.0,,large,regular_cab_pickup,17,12,1851,2000,1
8051,dodge,ram_150,1991,regular_unleaded,125.0,6.0,unknown,rear_wheel_drive,2.0,,large,regular_cab_pickup,17,12,1851,2000,1
6368,chrysler,le_baron,1993,regular_unleaded,100.0,4.0,unknown,front_wheel_drive,2.0,,compact,convertible,24,18,1013,2000,1


## Feature Importance

### Mutual Information

In [15]:
from sklearn.metrics import mutual_info_score

def calculate_mi(series):
    """
    Calcula el puntaje de información mutua de las variables categóricas 
    con respecto a 'above_average'
    """
    return mutual_info_score(series, df_train_full.above_average)


df_train_full.dropna(inplace=True)
mi_scores = df_train_full[categorical].apply(calculate_mi)
mi_scores.sort_values(ascending=False)

model                0.513520
make                 0.253498
engine_fuel_type     0.132644
driven_wheels        0.108291
vehicle_style        0.107374
vehicle_size         0.039284
transmission_type    0.010569
dtype: float64

### Risk Ratio

In [17]:
manual_transmission = df_train_full[df_train_full.transmission_type == 'manual'].above_average.mean()
automatic_transmission = df_train_full[df_train_full.transmission_type == 'automatic'].above_average.mean()
manual_transmission, automatic_transmission

(0.7520058351568198, 0.6046157195732637)

In [19]:
compact_size = df_train_full[df_train_full.vehicle_size == 'compact'].above_average.mean()
mid_size = df_train_full[df_train_full.vehicle_size == 'midsize'].above_average.mean()
large_size = df_train_full[df_train_full.vehicle_size == 'large'].above_average.mean()
compact_silarge_size

0.41967871485943775

### Correlation Coefficient

In [None]:
df_train_full[numerical].corrwith(df_train_full.above_average).sort_values(ascending=True)

In [None]:
df_train_full[numerical].corrwith(df_train_full.msrp).sort_values(ascending=True)

## Fearure Engineering

In [None]:
# Millas por galón promedio
df_train_full['mpg_avg'] = df_train_full['highway_mpg'] + df_train_full['city_mpg'] / 2

# Reemplaza 'unknown' por 'manual' en variable 'transmission_type'
df_train_full['transmission_type'] = df_train_full.transmission_type.str.replace('unknown', 'manual')

categorical_features = ['make', 'model', 'transmission_type',]

numerical_features = ['year', 'engine_hp', 'engine_cylinders']

In [None]:
# Importa la clase Dict vectorizer para convertir datos categóricos en formato dicciionario a formato numérico
dv = DictVectorizer(sparse=False)

# Convierte el conjunto de entrenamiento en un formato de lista de diccionarios
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# Crea un modelo de regresión logística
model = LogisticRegression(solver='liblinear')

# Entrena el modelo de regresión logística
model.fit(X_train, y_train)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

y_pred = model.predict_proba(X_val)[:, 1]
y_pred

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

prob = (y_pred >= .5).astype(int)
acc = accuracy_score(y_val, prob).round(1)
prec = precision_score(y_val, prob).round(1)
rec = recall_score(y_val, prob).round(1)

print(acc)
print(prec)
print(rec)


In [None]:
thresholds = np.linspace(0, 1, 101)

accuracies = []
for t in thresholds:
    prob = y_pred >= t
    acc = accuracy_score(y_val, prob >= t)
    accuracies.append(acc)
    

plt.plot(thresholds, accuracies)

### Precision and Recall

In [None]:
from sklearn.metrics import precision_score, recall_score, precision_recall_curve

In [None]:
precisions, recalls, thresholds = precision_recall_curve(y_val, y_pred)

def plot_precision_recall_vs_thresholds(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recalls")


plot_precision_recall_vs_thresholds(precisions, recalls, thresholds)
plt.show()

In [None]:

plt.plot(thresholds, precisions[:-1], "b--", label="Precision")

In [None]:
plt.plot(recalls, precisions, "g-")
plt.title("Precision vs Recall")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.show()