In [193]:
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np

from datetime import datetime

In [194]:
# Location of the raw car-prices CSV (a gist pinned to a specific revision hash).
uri = "https://gist.githubusercontent.com/guilhermesilveira/4d1d4a16ccbf6ea4e0a64a38a24ec884/raw/afd05cb0c796d18f3f5a6537053ded308ba94bf7/car-prices.csv"

# Download and parse the CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=uri)

In [195]:
# Keep only the columns used by the rest of the notebook; anything else that
# came with the CSV is dropped here.
columns_to_keep = ['mileage_per_year', 'model_year', 'price', 'sold']
df = df[columns_to_keep]

# (rows, columns) of the trimmed frame.
df.shape

(10000, 4)

In [196]:
# Peek at the first five rows to check the column selection.
df.head(n=5)

Unnamed: 0,mileage_per_year,model_year,price,sold
0,21801,2000,30941.02,yes
1,7843,1998,40557.96,yes
2,7109,2006,89627.5,no
3,26823,2015,95276.14,no
4,7935,2014,117384.68,yes


In [197]:
# Encode the target label as an integer: 'no' -> 0, 'yes' -> 1.
change_sold = {
    'no': 0,
    'yes': 1
}

# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running this cell after `sold` has already been encoded would wipe the
# whole column. Apply the mapping only while the column is not yet made up
# entirely of the encoded values, which makes the cell safe to run repeatedly.
already_encoded = df['sold'].isin(list(change_sold.values())).all()
if not already_encoded:
    df['sold'] = df['sold'].map(change_sold)

df.head()

Unnamed: 0,mileage_per_year,model_year,price,sold
0,21801,2000,30941.02,1
1,7843,1998,40557.96,1
2,7109,2006,89627.5,0
3,26823,2015,95276.14,0
4,7935,2014,117384.68,1


In [198]:
# Reference year used to turn `model_year` into an age.
# It is pinned rather than read from the system clock: deriving it from
# datetime.now() makes `model_age` change every calendar year, so a fresh
# Restart & Run All would no longer reproduce the values shown in this
# notebook. 2023 is the year implied by the saved outputs (model_year 2000
# is displayed with model_age 23).
REFERENCE_YEAR = 2023

# Kept under its original name so any cell that reads `current_year` still works.
current_year = REFERENCE_YEAR

# Age of each car model, in years, relative to the reference year.
df['model_age'] = current_year - df['model_year']

In [199]:
# Show the first five rows, which now include the derived `model_age` column.
df.head(n=5)

Unnamed: 0,mileage_per_year,model_year,price,sold,model_age
0,21801,2000,30941.02,1,23
1,7843,1998,40557.96,1,25
2,7109,2006,89627.5,0,17
3,26823,2015,95276.14,0,8
4,7935,2014,117384.68,1,9


In [200]:
# Kilometres in one mile; the conversion below treats `mileage_per_year` as miles.
KM_PER_MILE = 1.60934

# Yearly distance expressed in kilometres.
df['km_per_year'] = df['mileage_per_year'] * KM_PER_MILE

df.head()

Unnamed: 0,mileage_per_year,model_year,price,sold,model_age,km_per_year
0,21801,2000,30941.02,1,23,35085.22134
1,7843,1998,40557.96,1,25,12622.05362
2,7109,2006,89627.5,0,17,11440.79806
3,26823,2015,95276.14,0,8,43167.32682
4,7935,2014,117384.68,1,9,12770.1129


In [201]:
# Narrow view with the target plus the three modelling columns (price, age in
# years, distance in km); the original `model_year` / `mileage_per_year`
# columns are left out.
# NOTE(review): the feature/target split visible later in this notebook reads
# from `df`, not from `df_n` - confirm whether `df_n` is still needed.
modelling_columns = ['price', 'sold', 'model_age', 'km_per_year']
df_n = df[modelling_columns]

df_n.head()

Unnamed: 0,price,sold,model_age,km_per_year
0,30941.02,1,23,35085.22134
1,40557.96,1,25,12622.05362
2,89627.5,0,17,11440.79806
3,95276.14,0,8,43167.32682
4,117384.68,1,9,12770.1129


In [202]:
# Feature matrix: the three predictor columns.
feature_names = ['price', 'model_age', 'km_per_year']
x = df[feature_names]

# Target vector: the `sold` column.
y = df['sold']

In [203]:
# Single seed shared by every source of randomness in the notebook.
SEED = 5

# Seed NumPy's legacy global generator as well, so code that draws from the
# global state (rather than an explicit random_state) is repeatable too.
np.random.seed(SEED)

# Hold out 25% of the rows for testing. `stratify=y` keeps the class
# proportions of `y` the same in both partitions, and `random_state` fixes
# the shuffle.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.25,
    stratify=y,
    random_state=SEED,
)

In [212]:
# Baseline model: ignores the features and predicts from the class distribution
# of the training labels alone. The strategy is spelled out because
# DummyClassifier's default changed from "stratified" to "prior" in
# scikit-learn 0.24; pinning it keeps this baseline (and the accuracy reported
# below) the same regardless of the installed version.
dummy = DummyClassifier(strategy="prior")
dummy.fit(train_x, train_y)

# score() predicts on the test features and returns the mean accuracy directly,
# so no separate predict() + accuracy_score() step is needed.
accuracy = dummy.score(test_x, test_y) * 100

print(f"Treinaremos com {len(train_x)} e testaremos com {len(test_x)} elementos")
print("Acurácia do modelo foi: %.2f%%" % accuracy)

Treinaremos com 7500 e testaremos com 2500 elementos
Acurácia do modelo foi: 58.00%
