In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
csv_path = os.path.join('..','datasets','raw','data.csv')
df = pd.read_csv(csv_path)
df.sample(10)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
9954,Audi,SQ5,2017,premium unleaded (required),354.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Factory Tuner,Luxury,Performance",Midsize,4dr SUV,24,17,3105,60800
1849,Volkswagen,Beetle Convertible,2015,premium unleaded (recommended),210.0,4.0,AUTOMATED_MANUAL,front wheel drive,2.0,Performance,Compact,Convertible,29,23,873,35975
4006,GMC,Envoy,2007,regular unleaded,291.0,6.0,AUTOMATIC,four wheel drive,4.0,,Midsize,4dr SUV,20,14,549,32960
6538,Buick,Lucerne,2010,flex-fuel (unleaded/E85),227.0,6.0,AUTOMATIC,front wheel drive,4.0,Flex Fuel,Large,Sedan,26,17,155,32730
9967,Cadillac,SRX,2015,regular unleaded,308.0,6.0,AUTOMATIC,front wheel drive,4.0,"Crossover,Luxury",Midsize,4dr SUV,24,17,1624,48920
9633,Chevrolet,Silverado 1500,2017,regular unleaded,355.0,8.0,AUTOMATIC,rear wheel drive,4.0,Flex Fuel,Large,Crew Cab Pickup,23,16,1385,37430
1858,Volkswagen,Beetle Convertible,2015,premium unleaded (recommended),210.0,4.0,MANUAL,front wheel drive,2.0,Performance,Compact,Convertible,31,23,873,31495
7501,Honda,Passport,2001,regular unleaded,205.0,6.0,AUTOMATIC,rear wheel drive,4.0,,Midsize,4dr SUV,20,15,2202,24150
4554,Ford,F-150,2017,flex-fuel (unleaded/E85),385.0,8.0,AUTOMATIC,four wheel drive,4.0,Flex Fuel,Large,Crew Cab Pickup,21,15,5657,56260
11094,Aston Martin,V12 Vanquish,2005,premium unleaded (required),520.0,12.0,AUTOMATED_MANUAL,rear wheel drive,2.0,"Exotic,Factory Tuner,High-Performance",Compact,Coupe,16,10,259,255000


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11911 non-null  object 
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11908 non-null  float64
 9   Market Category    8172 non-null   object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(5

In [4]:
my_variables = ['Make','Model','Year','Engine HP','Engine Cylinders','Transmission Type','Vehicle Style','highway MPG','city mpg','MSRP']
cars = df[my_variables].copy()

# Lowercase the column names and replace spaces with underscores
cars.columns = cars.columns.str.lower().str.replace(' ','_')
categorical_columns = list(cars.dtypes[cars.dtypes == 'object'].index)

for c in categorical_columns:
    cars[c] = cars[c].str.lower().str.replace(' ', '_')
    
cars.sample(5)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
4107,cadillac,escalade_esv,2015,420.0,8.0,automatic,4dr_suv,22,15,75970
506,bmw,5_series_gran_turismo,2015,445.0,8.0,automatic,4dr_hatchback,24,16,71400
7995,volkswagen,rabbit,2007,150.0,5.0,automatic,4dr_hatchback,28,19,18185
4143,cadillac,escalade_hybrid,2012,332.0,8.0,automatic,4dr_suv,23,20,83295
4972,nissan,frontier,2016,261.0,6.0,automatic,crew_cab_pickup,21,15,26750


In [5]:
# Lowercase the column names and replace spaces with underscores
cars.columns = cars.columns.str.lower().str.replace(' ','_')

In [6]:
# View missing values
cars.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [7]:
# FIll the missing values with 0
cars[['engine_hp','engine_cylinders']] = cars[['engine_hp','engine_cylinders']].fillna(value=0)

In [8]:
msrp_mean = cars['msrp'].mean()
bin_msrp = [0, msrp_mean, float('inf')]
cars['above_average'] = pd.cut(cars['msrp'], bins=bin_msrp, labels=[1,0])
cars['above_average']

0        0
1        0
2        1
3        1
4        1
        ..
11909    0
11910    0
11911    0
11912    0
11913    1
Name: above_average, Length: 11914, dtype: category
Categories (2, int64): [1 < 0]

In [9]:
#features = cars.drop('msrp', axis=1).copy()
#target = cars['msrp'].copy()

df_full_train, df_test = train_test_split(cars, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train['above_average']
del df_val)'abov

X_train.shape[0], X_test.shape[0], X_val.shape[0]

NameError: name 'features' is not defined