# Introdução à Ciência de Dados - Projeto 1

### Equipe:
> Pedro Henrique Almeida Girão Peixinho (phagp)

> Victor Gabriel de Carvalho (vgc3)

### Tópicos Avançados em Gerenciamento de Dados e Informação - IF697 - 2024.1

## Imports e Configurações

In [100]:
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from fancyimpute import KNN

# from google.colab import drive
# drive.mount('/content/drive')

In [101]:
df = pd.read_csv('cars_backup.csv')
df.shape

(1165, 11)

---

## Pré-processamento dos Dados

In [102]:
df.head()

Unnamed: 0,page,car_name,car_price,car_km,car_year,car_desc,car_store,car_engine,car_gearbox,car_fuel,car_color
0,https://www.pedragonchevroletrecife.com.br/sem...,CHEVROLET ONIX,79990.0,3902.0,2024,1.0 FLEX MANUAL (Manual),Pedragon Av Norte,1.0,Manual,Flex,Branco
1,https://www.pedragonchevroletrecife.com.br/sem...,CHEVROLET ONIX,86000.0,3530.0,2024,1.0 TURBO FLEX LT MANUAL (Manual),Pedragon Afogados,1.0,Manual,Flex,Cinza
2,https://www.pedragonchevroletrecife.com.br/sem...,CHEVROLET ONIX,90000.0,1491.0,2024,1.0 TURBO FLEX LT MANUAL (Manual),Pedragon Afogados,1.0,Manual,Flex,Preto
3,https://www.pedragonchevroletrecife.com.br/sem...,CHEVROLET ONIX,93990.0,7738.0,2024,1.0 TURBO FLEX LT MANUAL (Manual),Pedragon Beberibe,1.0,Manual,Flex,Prata
4,https://www.pedragonchevroletrecife.com.br/sem...,CHEVROLET ONIX PLUS,96990.0,2312.0,2024,1.0 TURBO FLEX LT MANUAL (Manual),Pedragon Rui Barbosa,1.0,Manual,Flex,Cinza


In [103]:
df.drop_duplicates(inplace=True)
df.shape

(1161, 11)

In [104]:
df.drop(['page', 'car_desc'], axis=1, inplace=True)

In [105]:
df.describe()

Unnamed: 0,car_price,car_km,car_year,car_engine
count,1161.0,1161.0,1161.0,1142.0
mean,106730.24031,45117.399655,2020.11714,1.46331
std,101925.157921,33385.801495,3.039123,0.528266
min,24990.0,0.0,2000.0,1.0
25%,59990.0,24359.0,2019.0,1.0
50%,77990.0,43356.0,2021.0,1.4
75%,109990.0,61671.0,2022.0,1.8
max,968900.0,654000.0,2025.0,6.7


---

### Definição de Tipos

In [106]:
df.dtypes

car_name        object
car_price      float64
car_km         float64
car_year         int64
car_store       object
car_engine     float64
car_gearbox     object
car_fuel        object
car_color       object
dtype: object

In [107]:
df['car_year'] = df['car_year'].astype('int')
df['car_gearbox'] = df['car_gearbox'].astype('category')
df['car_fuel'] = df['car_fuel'].astype('category')
df['car_color'] = df['car_color'].astype('category')

df.dtypes

car_name         object
car_price       float64
car_km          float64
car_year          int32
car_store        object
car_engine      float64
car_gearbox    category
car_fuel       category
car_color      category
dtype: object

In [108]:
print(df['car_gearbox'].cat.categories)
print(df['car_fuel'].cat.categories)
print(df['car_color'].cat.categories)

Index(['Automatico', 'Manual'], dtype='object')
Index(['DIESEL', 'Diesel', 'ELÉTRICO', 'Elétrico', 'Flex', 'GNV', 'Gasolina',
       'Gasolina, alcool e gas natural', 'Híbrido'],
      dtype='object')
Index(['Azul', 'Bege', 'Branco', 'Cinza', 'Dourado', 'Laranja', 'Marrom',
       'Prata', 'Preto', 'Verde', 'Vermelho'],
      dtype='object')


---

### Tratamento de Dados Ausentes

In [109]:
df.isnull().sum()

car_name          0
car_price         0
car_km            0
car_year          0
car_store      1153
car_engine       19
car_gearbox       0
car_fuel         28
car_color       965
dtype: int64

In [110]:
df.dropna(thresh=0.7*len(df), axis=1, inplace=True)
df.columns

Index(['car_name', 'car_price', 'car_km', 'car_year', 'car_engine',
       'car_gearbox', 'car_fuel'],
      dtype='object')

In [111]:
df['car_gearbox'] = df['car_gearbox'].cat.codes
df['car_fuel'] = df['car_fuel'].cat.codes

In [112]:
df_knn = KNN(k=5).fit_transform(df.drop('car_name', axis=1))
df_knn.shape

Imputing row 1/1161 with 0 missing, elapsed time: 0.147
Imputing row 101/1161 with 0 missing, elapsed time: 0.148
Imputing row 201/1161 with 0 missing, elapsed time: 0.148
Imputing row 301/1161 with 0 missing, elapsed time: 0.149
Imputing row 401/1161 with 0 missing, elapsed time: 0.149
Imputing row 501/1161 with 0 missing, elapsed time: 0.149
Imputing row 601/1161 with 0 missing, elapsed time: 0.149
Imputing row 701/1161 with 0 missing, elapsed time: 0.149
Imputing row 801/1161 with 0 missing, elapsed time: 0.149
Imputing row 901/1161 with 0 missing, elapsed time: 0.149
Imputing row 1001/1161 with 0 missing, elapsed time: 0.149
Imputing row 1101/1161 with 0 missing, elapsed time: 0.149


(1161, 6)

In [113]:
df_fill = pd.DataFrame(data=df_knn[0:,0:],columns=[df.columns[1:]])
df_fill['car_name'] = df['car_name']
df_fill.head()

Unnamed: 0,car_price,car_km,car_year,car_engine,car_gearbox,car_fuel,car_name
0,79990.0,3902.0,2024.0,1.0,1.0,4.0,CHEVROLET ONIX
1,86000.0,3530.0,2024.0,1.0,1.0,4.0,CHEVROLET ONIX
2,90000.0,1491.0,2024.0,1.0,1.0,4.0,CHEVROLET ONIX
3,93990.0,7738.0,2024.0,1.0,1.0,4.0,CHEVROLET ONIX
4,96990.0,2312.0,2024.0,1.0,1.0,4.0,CHEVROLET ONIX PLUS


In [114]:
df_fill.isnull().sum()

car_price      0
car_km         0
car_year       0
car_engine     0
car_gearbox    0
car_fuel       0
car_name       4
dtype: int64

---

In [116]:
fig = make_subplots(rows=4, cols=1)

fig.append_trace(go.Histogram(x=df['car_price'], name='Price'), row=1, col=1)

fig.append_trace(go.Histogram(x=df['car_km'], name='Mileage'), row=2, col=1)

fig.append_trace(go.Histogram(x=df['car_year'], name='Release Year'), row=3, col=1)

fig.append_trace(go.Histogram(x=df['car_engine'], name='Engine Size'), row=4, col=1)

fig.update_layout(height=900, width=970, title_text="Numerical Columns Histogram")
fig.show()

---

## Referêncais

- https://plotly.com/python/histograms/
- https://plotly.com/python/legend/