In [6]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Seed NumPy's legacy global RNG once, up front, so that later draws from
# np.random (e.g. the np.random.randint call further down) are repeatable.
SEED = 20
np.random.seed(SEED)

In [7]:
# Download the car-sales simulation CSV and discard its 'Unnamed: 0' column
# in one chained expression.
DATA_URL = 'https://gist.githubusercontent.com/guilhermesilveira/e99a526b2e7ccc6c3b70f53db43a87d2/raw/1605fc74aa778066bf2e6695e24d53cf65f2f447/machine-learning-carros-simulacao.csv'
df = pd.read_csv(DATA_URL).drop(columns=['Unnamed: 0'])

In [8]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,preco,vendido,idade_do_modelo,km_por_ano
0,30941.02,1,18,35085.22134
1,40557.96,1,20,12622.05362
2,89627.5,0,12,11440.79806
3,95276.14,0,3,43167.32682
4,117384.68,1,4,12770.1129


In [9]:
# Features: every column of df except the target 'vendido'.
X = df.drop(columns=['vendido'])
# Target: the 'vendido' column.
y = df['vendido']

# Hold-out split consumed by the single-split cells further down
# (DummyClassifier, DecisionTreeClassifier, StandardScaler, SVC). Those cells
# read X_train / X_test / y_train / y_test, but no cell in this notebook
# created them, so a fresh "Restart & Run All" stopped with a NameError.
# NOTE(review): the original split cell is missing from the notebook;
# test_size=0.25 and stratify=y are a reconstruction - confirm they match the
# intended setup. No random_state is passed, so the split draws from the
# global NumPy RNG seeded in the imports cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y
)

In [10]:
# Baseline: fit a DummyClassifier (library-default strategy) on the training
# split and print its accuracy on the test split.
dummy = DummyClassifier().fit(X_train, y_train)
d_pred = dummy.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=d_pred))

0.5192




In [11]:
# Depth-2 decision tree, trained on the training split and scored on the test
# split; the accuracy is the cell's displayed value.
model = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7584

In [12]:
# 5-fold cross-validation of the current `model` on the full X / y, reported
# as mean test score with a band of two standard deviations on either side.
results = cross_validate(estimator=model, X=X, y=y, cv=5)
avg = results['test_score'].mean()
standard_deviation = results['test_score'].std()
margin = 2 * standard_deviation
print(f'{(avg - margin)*100:.2f}% > {avg*100:.2f}% > {(avg + margin)*100:.2f}%')

75.21% > 75.78% > 76.35%


## Aleatoriedade no cross validate

In [13]:
def print_results(results):
    """Print the mean test score with a two-standard-deviation band.

    Parameters
    ----------
    results : mapping
        Must expose ``results['test_score']`` as an object with ``.mean()``
        and ``.std()`` methods (e.g. a NumPy array of per-fold scores).

    Prints
    ------
    One line of the form ``"<low>% > <mean>% > <high>%"`` where
    ``low = mean - 2*std`` and ``high = mean + 2*std``, each multiplied by
    100 and formatted with two decimals. Returns ``None``.
    """
    scores = results['test_score']
    avg = scores.mean()
    spread = 2 * scores.std()
    # Order matters: lower bound, mean, upper bound.
    bounds = (avg - spread, avg, avg + spread)
    print(' > '.join(f'{value*100:.2f}%' for value in bounds))

In [14]:
# 10-fold KFold (no shuffle argument given) with a fresh depth-2 tree.
model = DecisionTreeClassifier(max_depth=2)
cv = KFold(n_splits=10)
results = cross_validate(estimator=model, X=X, y=y, cv=cv)
print_results(results)

74.37% > 75.78% > 77.19%


In [15]:
# Same experiment, but the rows are shuffled before being cut into 10 folds.
model = DecisionTreeClassifier(max_depth=2)
cv = KFold(n_splits=10, shuffle=True)
results = cross_validate(estimator=model, X=X, y=y, cv=cv)
print_results(results)

73.33% > 75.78% > 78.23%


In [16]:
# Shuffled 10-fold split using StratifiedKFold instead of plain KFold.
model = DecisionTreeClassifier(max_depth=2)
cv = StratifiedKFold(n_splits=10, shuffle=True)
results = cross_validate(estimator=model, X=X, y=y, cv=cv)
print_results(results)

73.72% > 75.78% > 77.84%


In [17]:
# Synthetic grouping column: model age plus a random integer offset per row.
# np.random.randint excludes `high`, so the offsets fall in {-2, -1, 0, 1}.
df['modelo_aleatorio'] = df['idade_do_modelo'] + np.random.randint(low=-2, high=2, size=len(df))

In [18]:
# Replace the column by its absolute value so it holds no negative entries.
df['modelo_aleatorio'] = df['modelo_aleatorio'].abs()

In [19]:
# Number of rows per synthetic group.
df['modelo_aleatorio'].value_counts()

18    900
17    831
16    825
15    722
19    692
14    670
13    658
12    630
11    556
10    550
20    491
9     441
8     386
7     321
6     310
21    254
5     252
4     191
3     165
2      73
1      59
0      23
Name: modelo_aleatorio, dtype: int64

In [20]:
# 10-fold GroupKFold with the synthetic 'modelo_aleatorio' column passed as
# the groups; the column is not part of X.
model = DecisionTreeClassifier(max_depth=2)
cv = GroupKFold(n_splits=10)
results = cross_validate(estimator=model, X=X, y=y, groups=df['modelo_aleatorio'], cv=cv)
print_results(results)

71.20% > 75.78% > 80.36%


In [21]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both the training and the test split.
scaler = StandardScaler().fit(X_train)
S_train, S_test = scaler.transform(X_train), scaler.transform(X_test)


In [22]:
# SVC with default settings, trained on the scaled training split and scored
# on the scaled test split; the accuracy is the cell's displayed value.
model = SVC().fit(S_train, y_train)
s_pred = model.predict(S_test)
accuracy_score(y_true=y_test, y_pred=s_pred)

0.7592

In [23]:
# Scaler and SVC are chained in a Pipeline and the pipeline itself is
# cross-validated with 10-fold GroupKFold on the synthetic groups.
scaler, model = StandardScaler(), SVC()
pipeline = Pipeline(steps=[('transformacao', scaler), ('estimador', model)])

cv = GroupKFold(n_splits=10)
results = cross_validate(estimator=pipeline, X=X, y=y, groups=df['modelo_aleatorio'], cv=cv)
print_results(results)

71.76% > 76.66% > 81.56%
