In [29]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [20]:
# Load the Wisconsin breast cancer dataset bundled with scikit-learn.
cancer = load_breast_cancer()

# Hold out a test set. `stratify` keeps the class proportions the same in
# both parts, and `random_state` makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    stratify=cancer.target,
    random_state=42,
)

# Baseline: a single, unpruned decision tree. The seed is fixed because the
# tree breaks ties between equally good splits at random; without it the
# numbers below can change from run to run.
tree = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

# `score` is the mean accuracy on the given data.
print(f'Tree score train: {tree.score(x_train, y_train)}')
print(f'Tree score test: {tree.score(x_test, y_test)}')

# Confusion matrix layout: rows are the true classes, columns the predicted
# classes. In this dataset class 0 is malignant and class 1 is benign, so the
# top-right cell counts malignant tumours that were predicted as benign.
print(f'Confusion matix train:\n {confusion_matrix(y_train, tree.predict(x_train))}')
print(f'Confusion matix test:\n {confusion_matrix(y_test, tree.predict(x_test))}')

Tree score train: 1.0
Tree score test: 0.9300699300699301
Confusion matix train:
 [[159   0]
 [  0 267]]
Confusion matix test:
 [[49  4]
 [ 6 84]]


Selv om treet sannsynligvis er overtilpasset (overfitted), ser det ut til å virke ganske godt på testsettet. Kan vi få det til å virke enda bedre? Vi får også 4 falske negativer på testsettet, det vil si ondartede svulster som blir klassifisert som godartede (i datasettet er klasse 0 ondartet og klasse 1 godartet). Det betyr at 4 personer ville fått vite at de ikke hadde kreft selv om de hadde det, noe som er svært bekymringsverdig.

In [27]:
# Fit two tree ensembles on the same training split. Both are seeded so that
# the scores and confusion matrices are reproducible on a fresh kernel;
# unseeded, the bootstrap samples and feature subsets differ on every run.
rfc = RandomForestClassifier(n_jobs=-1, random_state=42).fit(x_train, y_train)
gbc = GradientBoostingClassifier(random_state=42).fit(x_train, y_train)

# Report both models with one loop instead of two copy-pasted sections.
# The printed labels are identical to the ones used before.
for position, (label, model) in enumerate([('Random forest', rfc), ('Gradient boosting', gbc)]):
    if position > 0:
        # Visual separator between the two reports.
        print(f'\n')

    # Mean accuracy on the training and test data.
    print(f'{label} score train: {model.score(x_train, y_train)}')
    print(f'{label} score test: {model.score(x_test, y_test)}')

    # Rows are true classes, columns are predicted classes.
    print(f'Confusion matix {label.lower()} train:\n {confusion_matrix(y_train, model.predict(x_train))}')
    print(f'Confusion matix {label.lower()} test:\n {confusion_matrix(y_test, model.predict(x_test))}')

Random forest score train: 1.0
Random forest score test: 0.951048951048951
Confusion matix random forest train:
 [[159   0]
 [  0 267]]
Confusion matix random forest test:
 [[49  4]
 [ 3 87]]


Gradient boosting score train: 1.0
Gradient boosting score test: 0.958041958041958
Confusion matix gradient boosting train:
 [[159   0]
 [  0 267]]
Confusion matix gradient boosting test:
 [[48  5]
 [ 1 89]]


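Mer presise resultater på testsettet, men antageligvis fortsatt litt overtilpasning (overfitting), siden treningsscoren er 1.0. Antallet ondartede svulster som blir klassifisert som godartede (falske negativer) er dessuten like høyt eller høyere enn for enkelttreet: 4 for random forest og 5 for gradient boosting, mot 4 for treet.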

In [34]:
# Candidate ensemble sizes to compare with cross-validation.
parm_grid = {'n_estimators': [200, 400, 600, 800, 1000]}

# Grid search uses cross-validation on the training data only and then refits
# the best configuration on the whole training set. The base estimators are
# seeded so that the selected `n_estimators` and the scores are reproducible;
# unseeded, run-to-run noise can change which candidate wins.
grid_rfc = GridSearchCV(
    RandomForestClassifier(random_state=42), parm_grid, n_jobs=-1
).fit(x_train, y_train)
grid_gbc = GridSearchCV(
    GradientBoostingClassifier(random_state=42), parm_grid, n_jobs=-1
).fit(x_train, y_train)

# Report both searches with one loop instead of two copy-pasted sections.
# The printed labels are identical to the ones used before.
searches = [
    ('RFC', 'random forest', grid_rfc),
    ('GBC', 'gradient boosting', grid_gbc),
]
for position, (short_name, long_name, search) in enumerate(searches):
    if position > 0:
        # Visual separator between the two reports.
        print(f'\n')

    # `score` and `predict` on the search object use the refitted best model.
    print(f'{short_name} train score: {search.score(x_train, y_train)}')
    print(f'{short_name} test score: {search.score(x_test, y_test)}')
    print(f'{short_name} best parameter: {search.best_params_}')

    # Rows are true classes, columns are predicted classes.
    print(f'Confusion matix {long_name} train:\n {confusion_matrix(y_train, search.predict(x_train))}')
    print(f'Confusion matix {long_name} test:\n {confusion_matrix(y_test, search.predict(x_test))}')


RFC train score: 1.0
RFC test score: 0.951048951048951
RFC best parameter: {'n_estimators': 200}
Confusion matix random forest train:
 [[159   0]
 [  0 267]]
Confusion matix random forest test:
 [[49  4]
 [ 3 87]]


GBC train score: 1.0
GBC test score: 0.958041958041958
GBC best parameter: {'n_estimators': 600}
Confusion matix gradient boosting train:
 [[159   0]
 [  0 267]]
Confusion matix gradient boosting test:
 [[48  5]
 [ 1 89]]


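Found the best number of estimators within the searched range for both GBC and RFC, but the test results did not improve: I'm still getting malignant tumours classified as benign (false negatives), 4 for the random forest and 5 for gradient boosting.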