## 1. Importando e explorando os dados

In [1]:
import pandas as pd

# Raise pandas' display limits so wide frames are printed without truncation.
for option, value in (
    ("display.max_rows", 50),
    ("display.max_columns", 50),
    ("display.width", 500),
):
    pd.set_option(option, value)

In [2]:
# Path of the CSV file to load.
url = "imports-85.csv"

# Column labels, supplied explicitly because the file is read with header=None
# (i.e. its first line is treated as data, not as a header).
col_names = [
    'symboling', 'normalized-losses', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight',
    'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system',
    'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm',
    'city-mpg', 'highway-mpg', 'price',
]

# Cells containing "?" are parsed as missing values (NaN).
df = pd.read_csv(
    url,
    sep=',',
    header=None,
    names=col_names,
    na_values="?",
)

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(205, 25)

In [4]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,symboling,normalized-losses,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [5]:
# Per-column dtype and non-null count; a count below the number of rows
# means the column contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  164 non-null    float64
 2   fuel-type          205 non-null    object 
 3   aspiration         205 non-null    object 
 4   num-of-doors       203 non-null    object 
 5   body-style         205 non-null    object 
 6   drive-wheels       205 non-null    object 
 7   engine-location    205 non-null    object 
 8   wheel-base         205 non-null    float64
 9   length             205 non-null    float64
 10  width              205 non-null    float64
 11  height             205 non-null    float64
 12  curb-weight        205 non-null    int64  
 13  engine-type        205 non-null    object 
 14  num-of-cylinders   205 non-null    object 
 15  engine-size        205 non-null    int64  
 16  fuel-system        205 non

In [6]:
# List the column labels of the full frame.
df.columns

Index(['symboling', 'normalized-losses', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price'], dtype='object')

In [7]:
# Select every column holding text values (object dtype).
# The explicit .copy() makes df2 an independent frame: later cells add new
# columns to it, and those writes must never reach (or warn about) `df`.
df2 = df.select_dtypes("object").copy()
df2.columns

Index(['fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'engine-type', 'num-of-cylinders', 'fuel-system'], dtype='object')

## 2. Codificação de componentes ordinais

In [8]:
# Frequency of each door-count label (missing values are not counted).
df2['num-of-doors'].value_counts()

num-of-doors
four    114
two      89
Name: count, dtype: int64

In [9]:
# Spelled-out door count -> integer value.
doors_mapper = dict(two=2, four=4)

In [10]:
# Translate the spelled-out door count into a number. Rows whose label is
# missing map to NaN, so the resulting column is float rather than int.
door_counts = df2["num-of-doors"].map(doors_mapper)
df2["doors"] = door_counts
df2.head()

Unnamed: 0,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system,doors
0,gas,std,two,convertible,rwd,front,dohc,four,mpfi,2.0
1,gas,std,two,convertible,rwd,front,dohc,four,mpfi,2.0
2,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi,2.0
3,gas,std,four,sedan,fwd,front,ohc,four,mpfi,4.0
4,gas,std,four,sedan,4wd,front,ohc,five,mpfi,4.0


In [11]:
# Frequency of each cylinder-count label; used to decide which keys the
# cylinder mapper needs.
df2["num-of-cylinders"].value_counts()

num-of-cylinders
four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: count, dtype: int64

In [12]:
# Spelled-out cylinder count -> integer value.
cylinder_mapper = dict(two=2, three=3, four=4, five=5, six=6, eight=8, twelve=12)

In [13]:
# Translate the spelled-out cylinder count into its numeric value and store
# it in a new 'cylinders' column, then display the frame.
cylinder_counts = df2['num-of-cylinders'].map(cylinder_mapper)
df2['cylinders'] = cylinder_counts
df2

Unnamed: 0,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system,doors,cylinders
0,gas,std,two,convertible,rwd,front,dohc,four,mpfi,2.0,4
1,gas,std,two,convertible,rwd,front,dohc,four,mpfi,2.0,4
2,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi,2.0,6
3,gas,std,four,sedan,fwd,front,ohc,four,mpfi,4.0,4
4,gas,std,four,sedan,4wd,front,ohc,five,mpfi,4.0,5
...,...,...,...,...,...,...,...,...,...,...,...
200,gas,std,four,sedan,rwd,front,ohc,four,mpfi,4.0,4
201,gas,turbo,four,sedan,rwd,front,ohc,four,mpfi,4.0,4
202,gas,std,four,sedan,rwd,front,ohcv,six,mpfi,4.0,6
203,diesel,turbo,four,sedan,rwd,front,ohc,six,idi,4.0,6


## 3. Codificando dados categóricos não ordinais

### Conhecida como "dummificação" (one-hot encoding): atribuímos uma coluna separada a cada categoria e trabalhamos com valores binários

In [14]:
# Print the frequency table of every column of df2 in turn, each followed
# by a dotted separator line.
for column in df2:
    counts = df2[column].value_counts()
    print(counts)
    print("........................")

fuel-type
gas       185
diesel     20
Name: count, dtype: int64
........................
aspiration
std      168
turbo     37
Name: count, dtype: int64
........................
num-of-doors
four    114
two      89
Name: count, dtype: int64
........................
body-style
sedan          96
hatchback      70
wagon          25
hardtop         8
convertible     6
Name: count, dtype: int64
........................
drive-wheels
fwd    120
rwd     76
4wd      9
Name: count, dtype: int64
........................
engine-location
front    202
rear       3
Name: count, dtype: int64
........................
engine-type
ohc      148
ohcf      15
ohcv      13
dohc      12
l         12
rotor      4
dohcv      1
Name: count, dtype: int64
........................
num-of-cylinders
four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: count, dtype: int64
........................
fuel-system
mpfi    94
2bbl    66
idi     20
1bbl    11
spdi     9
4bbl  

In [19]:
# Columns of df2 before one-hot encoding (includes the added 'doors' and
# 'cylinders' columns).
df2.columns

Index(['fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'engine-type', 'num-of-cylinders', 'fuel-system', 'doors', 'cylinders'], dtype='object')

In [20]:
# One-hot encode the multi-valued nominal columns: each listed column is
# replaced by one boolean indicator column per category.
# NOTE: this cell rebinds df2 and consumes the listed columns, so it cannot be
# re-run on the already-encoded frame.
nominal_columns = ['drive-wheels', 'fuel-system', 'engine-type', 'body-style']
df2 = pd.get_dummies(df2, columns=nominal_columns)

In [21]:
# Preview the frame with the new indicator columns.
df2.head()

Unnamed: 0,fuel-type,aspiration,num-of-doors,engine-location,num-of-cylinders,doors,cylinders,drive-wheels_4wd,drive-wheels_fwd,drive-wheels_rwd,fuel-system_1bbl,fuel-system_2bbl,fuel-system_4bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi,fuel-system_spfi,engine-type_dohc,engine-type_dohcv,engine-type_l,engine-type_ohc,engine-type_ohcf,engine-type_ohcv,engine-type_rotor,body-style_convertible,body-style_hardtop,body-style_hatchback,body-style_sedan,body-style_wagon
0,gas,std,two,front,four,2.0,4,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False
1,gas,std,two,front,four,2.0,4,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False
2,gas,std,two,front,six,2.0,6,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False
3,gas,std,four,front,four,4.0,4,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False
4,gas,std,four,front,five,4.0,5,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False


### Para aspiration e fuel-type, como temos de fato só duas respostas possíveis (binárias), basta uma única coluna indicando se a característica é verdadeira ou não

In [26]:
# Encode the two-valued columns with a single indicator each.
# drop_first=True discards the indicator of the first category of every
# listed column ('std' for aspiration, 'diesel' for fuel-type), leaving only
# 'aspiration_turbo' and 'fuel-type_gas'.
binary_columns = ['aspiration', 'fuel-type']
df2 = pd.get_dummies(df2, columns=binary_columns, drop_first=True)

In [27]:
# Preview the frame after encoding the binary columns.
df2.head()

Unnamed: 0,num-of-doors,engine-location,num-of-cylinders,doors,cylinders,drive-wheels_4wd,drive-wheels_fwd,drive-wheels_rwd,fuel-system_1bbl,fuel-system_2bbl,fuel-system_4bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi,fuel-system_spfi,engine-type_dohc,engine-type_dohcv,engine-type_l,engine-type_ohc,engine-type_ohcf,engine-type_ohcv,engine-type_rotor,body-style_convertible,body-style_hardtop,body-style_hatchback,body-style_sedan,body-style_wagon,aspiration_turbo,fuel-type_gas
0,two,front,four,2.0,4,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True
1,two,front,four,2.0,4,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True
2,two,front,six,2.0,6,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True
3,four,front,four,4.0,4,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True
4,four,front,five,4.0,5,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True


In [28]:
# Show df2 without the remaining text columns. The result is only displayed:
# it is not assigned, so df2 itself still contains these three columns.
# NOTE(review): 'num-of-doors' and 'num-of-cylinders' were mapped to numeric
# columns earlier, but 'engine-location' is dropped without having been
# encoded — confirm that discarding it is intended.
df2.drop(columns= ['num-of-doors','engine-location','num-of-cylinders'])

Unnamed: 0,doors,cylinders,drive-wheels_4wd,drive-wheels_fwd,drive-wheels_rwd,fuel-system_1bbl,fuel-system_2bbl,fuel-system_4bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi,fuel-system_spfi,engine-type_dohc,engine-type_dohcv,engine-type_l,engine-type_ohc,engine-type_ohcf,engine-type_ohcv,engine-type_rotor,body-style_convertible,body-style_hardtop,body-style_hatchback,body-style_sedan,body-style_wagon,aspiration_turbo,fuel-type_gas
0,2.0,4,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True
1,2.0,4,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True
2,2.0,6,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True
3,4.0,4,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True
4,4.0,5,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,4.0,4,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True
201,4.0,4,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False,True,True
202,4.0,6,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,True
203,4.0,6,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,True,False
