In [1]:
import pandas as pd
import dask.dataframe as dd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
import graphviz

from py.consts import DATA_DIR


initial_sample_size = 1000

In [2]:

num_partitions = 4
parquet_files = [f"/Users/simonverhoek/Documents/programming/projects/flitsers/notebook/data/partition_{i}.parquet" for i in range(4)]
print(parquet_files)

['/Users/simonverhoek/Documents/programming/projects/flitsers/notebook/data/partition_0.parquet', '/Users/simonverhoek/Documents/programming/projects/flitsers/notebook/data/partition_1.parquet', '/Users/simonverhoek/Documents/programming/projects/flitsers/notebook/data/partition_2.parquet', '/Users/simonverhoek/Documents/programming/projects/flitsers/notebook/data/partition_3.parquet']


In [3]:
data = pd.read_parquet(DATA_DIR / f"encoded_test.parquet")

print(f"{len(data)} rows")
data.head()

4385206 rows


Unnamed: 0_level_0,y,zijde_links,zijde_rechts,month_August,month_December,month_February,month_January,month_July,month_June,month_March,...,stop_hour_14,stop_hour_15,stop_hour_16,stop_hour_17,stop_hour_18,stop_hour_19,stop_hour_20,stop_hour_21,stop_hour_22,stop_hour_23
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
53100,1,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
91510,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
50223,1,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4439,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
34728,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [4]:
X = data.drop("y", axis=1)
y = data[["y"]]

In [5]:
X_train, X_, y_train, y_ = train_test_split(X, y, test_size=0.4)
X_cv, X_test, y_cv, y_test = train_test_split(X_, y_, test_size=0.50)

print("y_train")
print(f"length: {len(y_train)}")
print(f"positive: {len(y_train[y_train.y == 1])} ({len(y_train[y_train.y == 1]) / len(y_train) * 100:.2f}%) negative: {len(y_train[y_train.y == 0])}")
print()
print("y_")
print(f"length: {len(y_)}")
print(f"positive: {len(y_[y_.y == 1])} ({len(y_[y_.y == 1]) / len(y_) * 100:.2f}%) negative: {len(y_[y_.y == 0])}")
print()
print("y_cv")
print(f"length: {len(y_cv)}")
print(f"positive: {len(y_cv[y_cv.y == 1])} ({len(y_cv[y_cv.y == 1]) / len(y_cv) * 100:.2f}%) negative: {len(y_cv[y_cv.y == 0])}")
print()
print("y_test")
print(f"length: {len(y_test)}")
print(f"positive: {len(y_test[y_test.y == 1])} ({len(y_test[y_test.y == 1]) / len(y_test) * 100:.2f}%) negative: {len(y_test[y_test.y == 0])}")

y_train
length: 2631123
positive: 27955 (1.06%) negative: 2603168

y_
length: 1754083
positive: 18751 (1.07%) negative: 1735332

y_cv
length: 877041
positive: 9370 (1.07%) negative: 867671

y_test
length: 877042
positive: 9381 (1.07%) negative: 867661


In [7]:
"""
DecisiontreeClassifier
"""
# param_grid = {
#     'max_depth': [1, 2, 3],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }
# 
# grid_search = GridSearchCV(DecisionTreeClassifier(criterion="entropy"), param_grid, cv=2, verbose=3, scoring="accuracy")
# grid_search.fit(X_train, y_train)
"""
RandomForestClassifier
"""
param_grid = {
    'n_estimators': [100, 200, 300],
    "criterion": ["gini", "entropy", "log_loss"],
    'max_depth': [None, 1, 2, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, scoring="accuracy", cv=3, verbose=3)
# grid_search.fit(X_train, y_train.values.ravel())
# print(grid_search.feature_importances)

clf_model = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_split=2, min_samples_leaf=2)
clf_model.fit(X_train, y_train.values.ravel())

print("Done training!")

AttributeError: 'RandomForestClassifier' object has no attribute 'feature_importances'

In [10]:
print(clf_model.feature_names_in_)
print(clf_model.feature_importances_)
# Access the best hyperparameters
print(grid_search.best_params_)
print(grid_search.best_estimator_)

['zijde_links' 'zijde_rechts' 'month_August' 'month_December'
 'month_February' 'month_January' 'month_July' 'month_June' 'month_March'
 'month_May' 'month_November' 'month_October' 'month_September'
 'day_Monday' 'day_Saturday' 'day_Sunday' 'day_Thursday' 'day_Tuesday'
 'day_Wednesday' 'wegnummer_A10' 'wegnummer_A12' 'wegnummer_A13'
 'wegnummer_A15' 'wegnummer_A16' 'wegnummer_A17' 'wegnummer_A18'
 'wegnummer_A2' 'wegnummer_A20' 'wegnummer_A200' 'wegnummer_A208'
 'wegnummer_A22' 'wegnummer_A256' 'wegnummer_A27' 'wegnummer_A270'
 'wegnummer_A28' 'wegnummer_A29' 'wegnummer_A30' 'wegnummer_A31'
 'wegnummer_A32' 'wegnummer_A325' 'wegnummer_A326' 'wegnummer_A348'
 'wegnummer_A35' 'wegnummer_A37' 'wegnummer_A4' 'wegnummer_A44'
 'wegnummer_A5' 'wegnummer_A50' 'wegnummer_A58' 'wegnummer_A59'
 'wegnummer_A6' 'wegnummer_A65' 'wegnummer_A67' 'wegnummer_A7'
 'wegnummer_A73' 'wegnummer_A74' 'wegnummer_A76' 'wegnummer_A77'
 'wegnummer_A79' 'wegnummer_A8' 'wegnummer_A9' 'wegnummer_N11'
 'wegnummer_N1

NameError: name 'grid_search' is not defined

In [None]:
# clf_model = DecisionTreeClassifier(criterion="entropy", **grid_search.best_params_)
clf_model = grid_search.best_estimator_

In [11]:
import pickle
with open(DATA_DIR / "model.pkl", "wb") as f:
    pickle.dump(clf_model,f)

In [None]:
clf_model.fit(X_train, y_train)

In [28]:
y_predict = clf_model.predict(X_)

In [29]:
accuracy_score(y_, y_predict)

0.9920596687842024

In [32]:
class_names = list([str(v) for v in y.y.unique()])
feature_names = list(X.columns)

In [34]:
# cross validation
y_cv_predict = clf_model.predict(X_cv)
print(f"cross-validation dataset accuracy: {accuracy_score(y_cv, y_cv_predict)}")

# Generate a classification report
report = classification_report(y_, y_predict)
print("\nClassification Report:\n", report)

# Compute the confusion matrix
cm = confusion_matrix(y_, y_predict)
print("\nConfusion Matrix:\n", cm)

scores = cross_val_score(clf_model, X_train, y_train, cv=2)  # 5-fold cross-validation
print("\nCross-Validation Scores:", scores)

cross-validation dataset accuracy: 0.9919433641072652

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00   1735432
           1       0.99      0.26      0.41     18651

    accuracy                           0.99   1754083
   macro avg       0.99      0.63      0.70   1754083
weighted avg       0.99      0.99      0.99   1754083


Confusion Matrix:
 [[1735386      46]
 [  13882    4769]]

Cross-Validation Scores: [0.99201938 0.99200645]


In [35]:
import seaborn as sns
import matplotlib.pyplot as plt

results = pd.DataFrame(grid_search.cv_results_)
pivot_table = results.pivot_table(index='param_max_depth', columns='param_min_samples_split', values='mean_test_score')
sns.heatmap(pivot_table, annot=True, fmt=".3f", cmap="YlGnBu")
plt.xlabel('min_samples_split')
plt.ylabel('max_depth')
plt.show()

ModuleNotFoundError: No module named 'seaborn'

In [16]:
plot_tree(clf_model, feature_names=feature_names, class_names=class_names)

ModuleNotFoundError: No module named 'matplotlib'

In [17]:
target = list([str(v) for v in y.y.unique()])
feature_names = list(X.columns)
dot_data = tree.export_graphviz(clf_model,
                                out_file=None, 
                      feature_names=feature_names,  
                      class_names=target,  
                      filled=True, rounded=True,  
                      special_characters=True)  
graph = graphviz.Source(dot_data)  

graph

ExecutableNotFound: failed to execute PosixPath('dot'), make sure the Graphviz executables are on your systems' PATH

<graphviz.sources.Source at 0x2976df050>

In [18]:
from sklearn.tree import export_text
r = export_text(clf_model, feature_names=feature_names)
print(r)

|--- stop_hour_23 <= 0.50
|   |--- start_hour_8 <= 0.50
|   |   |--- start_hour_9 <= 0.50
|   |   |   |--- start_hour_10 <= 0.50
|   |   |   |   |--- start_hour_15 <= 0.50
|   |   |   |   |   |--- start_hour_14 <= 0.50
|   |   |   |   |   |   |--- start_hour_11 <= 0.50
|   |   |   |   |   |   |   |--- start_hour_7 <= 0.50
|   |   |   |   |   |   |   |   |--- start_hour_13 <= 0.50
|   |   |   |   |   |   |   |   |   |--- start_hour_16 <= 0.50
|   |   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |   |--- start_hour_16 >  0.50
|   |   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |--- start_hour_13 >  0.50
|   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |--- start_hour_7 >  0.50
|   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |--- start_hour_11 >  0.50
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- start_hour_14 >  0.50
|   |   |   |   |   |   |--- class