In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [2]:
csv_path = os.path.join('..','datasets','raw','data.csv')
df = pd.read_csv(csv_path)
df.sample(10)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
9675,Mercedes-Benz,SLK-Class,2014,premium unleaded (required),201.0,4.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,32,22,617,43525
1240,Honda,Accord,2016,regular unleaded,185.0,4.0,AUTOMATIC,front wheel drive,2.0,,Midsize,Coupe,35,26,2202,27850
6884,Lincoln,MKX,2016,regular unleaded,303.0,6.0,AUTOMATIC,front wheel drive,4.0,"Crossover,Luxury",Midsize,4dr SUV,26,17,61,45315
7203,Volkswagen,New Beetle,2008,regular unleaded,150.0,5.0,AUTOMATIC,front wheel drive,2.0,Hatchback,Compact,2dr Hatchback,29,20,873,20990
865,Saab,9-5,2010,premium unleaded (recommended),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Luxury,Performance",Large,Sedan,27,16,376,49165
20,Audi,100,1992,regular unleaded,172.0,6.0,MANUAL,front wheel drive,4.0,Luxury,Midsize,Sedan,24,17,3105,2000
10226,Toyota,Tacoma,2017,regular unleaded,159.0,4.0,AUTOMATIC,rear wheel drive,4.0,,Compact,Crew Cab Pickup,23,19,2031,24950
2980,Chevrolet,Corvette,2015,premium unleaded (required),650.0,8.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,High-Performance",Compact,Coupe,22,15,1385,82270
5096,Infiniti,G Coupe,2013,premium unleaded (recommended),330.0,6.0,AUTOMATIC,rear wheel drive,2.0,"Luxury,Performance",Midsize,Coupe,27,19,190,40400
3581,Land Rover,Discovery Sport,2015,premium unleaded (recommended),240.0,4.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Luxury",Midsize,4dr SUV,26,20,258,37070


In [3]:
len(df)

11914

In [4]:
df.dtypes

Make                  object
Model                 object
Year                   int64
Engine Fuel Type      object
Engine HP            float64
Engine Cylinders     float64
Transmission Type     object
Driven_Wheels         object
Number of Doors      float64
Market Category       object
Vehicle Size          object
Vehicle Style         object
highway MPG            int64
city mpg               int64
Popularity             int64
MSRP                   int64
dtype: object

In [5]:
#my_variables = ['Make','Model','Year','Engine HP','Engine Cylinders','Transmission Type','Vehicle Style','highway MPG','city mpg','MSRP']
cars = df.copy()

# Lowercase the column names and replace spaces with underscores
cars.columns = cars.columns.str.lower().str.replace(' ','_')
# Lista de la variables categóricas
categorical = list(cars.dtypes[cars.dtypes == 'object'].index)
# Lista de las variables numéricas
numerical = list(cars.dtypes[cars.dtypes != 'object'].index)


for feature in categorical:
    cars[feature] = cars[feature].str.lower().str.replace(' ', '_')
    
cars.sample(5)

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
6772,acura,mdx,2016,premium_unleaded_(recommended),290.0,6.0,automatic,front_wheel_drive,4.0,"crossover,luxury",midsize,4dr_suv,27,20,204,53230
11111,volvo,v50,2010,premium_unleaded_(recommended),227.0,5.0,manual,all_wheel_drive,4.0,luxury,compact,wagon,26,20,870,33050
114,mazda,2,2012,regular_unleaded,100.0,4.0,automatic,front_wheel_drive,4.0,hatchback,compact,4dr_hatchback,34,28,586,16860
2130,volkswagen,cabrio,2001,regular_unleaded,115.0,4.0,manual,front_wheel_drive,2.0,,compact,convertible,28,21,873,19600
7209,volkswagen,new_beetle,2009,regular_unleaded,150.0,5.0,automatic,front_wheel_drive,2.0,,compact,convertible,28,20,873,26990


In [6]:
# Valores pérdidos
cars.isnull().sum()

make                    0
model                   0
year                    0
engine_fuel_type        3
engine_hp              69
engine_cylinders       30
transmission_type       0
driven_wheels           0
number_of_doors         6
market_category      3742
vehicle_size            0
vehicle_style           0
highway_mpg             0
city_mpg                0
popularity              0
msrp                    0
dtype: int64

In [7]:
# Relleno valores pérdidos con 0
cars[['engine_hp','engine_cylinders']] = cars[['engine_hp','engine_cylinders']].fillna(value=0)

categorical.remove('market_category')

In [8]:
categorical

['make',
 'model',
 'engine_fuel_type',
 'transmission_type',
 'driven_wheels',
 'vehicle_size',
 'vehicle_style']

In [18]:
# Creación de una nueva variables objetivo

# Valor de la media de msrp
msrp_mean = cars['msrp'].mean()

# Precio por debajo de la media
cars['above_average'] = (cars['msrp'] <= msrp_mean).astype(int)

# Distribución
cars['above_average'].value_counts()

above_average
1    6241
0    1925
Name: count, dtype: int64

In [10]:
cars[categorical].nunique()

make                  48
model                914
engine_fuel_type      10
transmission_type      5
driven_wheels          4
vehicle_size           3
vehicle_style         16
dtype: int64

In [21]:
df_train_full, df_test = train_test_split(cars, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=1)

In [49]:
for feature in categorical:
    if df_train_full[feature].nunique() <= 16:
        print(feature)
        print(df_train_full[feature].unique())
        print()

engine_fuel_type
['premium_unleaded_(recommended)' 'regular_unleaded'
 'premium_unleaded_(required)' 'flex-fuel_(unleaded/e85)'
 'flex-fuel_(premium_unleaded_required/e85)' 'electric' 'diesel'
 'flex-fuel_(premium_unleaded_recommended/e85)'
 'flex-fuel_(unleaded/natural_gas)']

transmission_type
['automatic' 'manual' 'automated_manual' 'direct_drive' 'unknown']

driven_wheels
['front_wheel_drive' 'all_wheel_drive' 'four_wheel_drive'
 'rear_wheel_drive']

vehicle_size
['midsize' 'compact' 'large']

vehicle_style
['sedan' '4dr_suv' '4dr_hatchback' '2dr_suv' '2dr_hatchback' 'coupe'
 'crew_cab_pickup' 'wagon' 'convertible' 'extended_cab_pickup'
 'regular_cab_pickup' 'cargo_van' 'cargo_minivan' 'passenger_van'
 'passenger_minivan' 'convertible_suv']



## Feature Importance

### Mutual Information

In [23]:
from sklearn.metrics import mutual_info_score

def calculate_mi(series):
    return mutual_info_score(series, df_train_full.above_average)

df_train_full.dropna(inplace=True)
mi_scores = df_train_full[categorical].apply(calculate_mi)
mi_scores.sort_values(ascending=False)

model                0.461924
make                 0.227155
engine_fuel_type     0.153522
driven_wheels        0.098708
vehicle_style        0.090020
vehicle_size         0.028881
transmission_type    0.005871
dtype: float64

### Correlation Coefficient

In [24]:
df_train_full[numerical].corrwith(df_train_full.above_average).sort_values(ascending=True)

engine_hp          -0.660259
engine_cylinders   -0.526025
msrp               -0.509839
year               -0.173952
popularity          0.017847
number_of_doors     0.144071
highway_mpg         0.180728
city_mpg            0.196060
dtype: float64

In [25]:
df_train_full[numerical].corrwith(df_train_full.msrp).sort_values(ascending=True)

highway_mpg        -0.199950
city_mpg           -0.189692
number_of_doors    -0.174963
popularity         -0.049570
year                0.125311
engine_cylinders    0.554486
engine_hp           0.628689
msrp                1.000000
dtype: float64

## Fearure Engineering

In [34]:
# Millas por galón promedio
df_train_full['mpg_avg'] = df_train_full['highway_mpg'] + df_train_full['city_mpg'] / 2


In [31]:
df_train_full['mpg_avg']

6727     35.5
8386     40.0
4714     54.0
10518    32.0
9832     35.0
         ... 
11618    35.5
7349     44.5
5675     41.5
262      32.5
7290     38.0
Name: mpg_avg, Length: 6532, dtype: float64

In [26]:
# Importa la clase Dict vectorizer para convertir datos categóricos en formato dicciionario a formato numérico
dv = DictVectorizer(sparse=False)

# Convierte el conjunto de entrenamiento en un formato de lista de diccionarios
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# Crea un modelo de regresión logística
model = LogisticRegression(solver='liblinear')

# Entrena el modelo de regresión logística
model.fit(X_train, y_train)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

y_pred = model.predict_proba(X_val)[:, 1]
y_pred

NameError: name 'y_train' is not defined

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

prob = (y_pred >= .5).astype(int)
acc = accuracy_score(y_val, prob).round(1)
prec = precision_score(y_val, prob).round(1)
rec = recall_score(y_val, prob).round(1)

print(acc)
print(prec)
print(rec)


In [None]:
thresholds = np.linspace(0, 1, 101)

accuracies = []
for t in thresholds:
    prob = y_pred >= t
    acc = accuracy_score(y_val, prob >= t)
    accuracies.append(acc)
    

plt.plot(thresholds, accuracies)

### Precision and Recall

In [None]:
from sklearn.metrics import precision_score, recall_score, precision_recall_curve

In [None]:
precisions, recalls, thresholds = precision_recall_curve(y_val, y_pred)

def plot_precision_recall_vs_thresholds(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recalls")


plot_precision_recall_vs_thresholds(precisions, recalls, thresholds)
plt.show()

In [None]:

plt.plot(thresholds, precisions[:-1], "b--", label="Precision")

In [None]:
plt.plot(recalls, precisions, "g-")
plt.title("Precision vs Recall")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.show()