# Description

In this notebook I am building a model with ExtraTreesClassifier, tuning it with GridSearchCV.

After that, I use a validation set to analyse the quality of the model's predictions and try removing some features to improve it.


In [65]:
import pandas as pd
import os
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
import multiprocessing
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import MinMaxScaler
import joblib

In [2]:
# Show every column when a DataFrame is displayed (None disables truncation).
pd.options.display.max_columns = None

In [49]:
def _read_features(folder):
    """Read the CSV stored at ``./images/<folder>/<folder>.csv`` into a DataFrame."""
    return pd.read_csv(f"./images/{folder}/{folder}.csv")


# One table per image folder. Note that the `regulars` frame comes from the
# folder named "regular" (singular).
screenshots = _read_features("screenshots")
screenshots_ph = _read_features("screenshots_ph")
receipts = _read_features("receipts")
regulars = _read_features("regular")
not_good = _read_features("not_good")
superb = _read_features("superb")
docs = _read_features("docs")

# Loaded separately from the class folders above.
test = _read_features("test")


In [None]:
# Name of the image class this run treats as the positive target (1); every
# other class is labelled 0. It must be one of the keys of `class_frames`.
label = "docs"

# Class name -> loaded frame, listed in the order the frames are concatenated.
class_frames = {
    "regular": regulars,
    "not_good": not_good,
    "superb": superb,
    "screenshots": screenshots,
    "docs": docs,
    "receipts": receipts,
    "screenshots_ph": screenshots_ph,
}
if label not in class_frames:
    raise ValueError(f"Unknown label {label!r}; expected one of {sorted(class_frames)}")

# `assign` returns a labelled copy, so the loaded frames are never modified.
# Writing the target into them in place would leave a stale target column
# behind after `label` is changed and this cell is re-run, and that column
# would then be picked up as an ordinary feature.
whole_docs = pd.concat(
    [
        frame.assign(**{label: int(class_name == label)})
        for class_name, frame in class_frames.items()
    ],
    ignore_index=True,
)

In [50]:
# The file name is dropped before the frame is used as model input.
whole_docs = whole_docs.drop("filename", axis="columns")

In [51]:
# Split the target column off; `whole_docs` itself loses the column, so what
# remains in it afterwards is the feature matrix.
y = whole_docs[label]
del whole_docs[label]

In [52]:
# Hold out 15% of the rows; the fixed seed keeps the split repeatable.
# NOTE(review): X_test / y_test are not referenced by any other cell visible
# in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    whole_docs,
    y,
    test_size=0.15,
    random_state=42,
)

In [53]:
# Impute missing values, scale every feature to [0, 1], then classify.
# The classifier is seeded (same seed as the train/test split) so that the
# grid-search scores and the chosen parameters are reproducible between runs.
full_pipeline = make_pipeline(
    SimpleImputer(),
    MinMaxScaler(),
    ExtraTreesClassifier(random_state=42),
)

In [54]:
# Hyper-parameter grid for the pipeline's `extratreesclassifier` step
# (make_pipeline names each step after its lower-cased class, hence the
# `extratreesclassifier__` prefix). 6 x 9 x 6 = 324 combinations.
param_grid = dict(
    extratreesclassifier__n_estimators=range(100, 400, 50),
    extratreesclassifier__min_samples_split=range(2, 19, 2),
    extratreesclassifier__min_samples_leaf=range(1, 13, 2),
)


In [55]:
# Exhaustive 10-fold cross-validated search over `param_grid`.
# One CPU is left free, but the worker count is clamped to at least 1:
# on a single-CPU machine `cpu_count() - 1` is 0, which joblib rejects.
search_scr = GridSearchCV(
    full_pipeline,
    param_grid,
    cv=10,
    verbose=1,
    n_jobs=max(1, multiprocessing.cpu_count() - 1),
)

In [56]:
%%time
# Run the grid search on the training split only (timed by the cell magic above).
search_scr.fit(X_train, y_train)

Fitting 10 folds for each of 324 candidates, totalling 3240 fits
CPU times: user 9.06 s, sys: 3.09 s, total: 12.1 s
Wall time: 2min 54s


In [57]:
# Mean cross-validated score of the best parameter combination; no `scoring`
# was passed to GridSearchCV, so the pipeline's default scorer is used.
search_scr.best_score_

0.9656491228070176

In [58]:
# Parameter combination that achieved the best cross-validated score.
search_scr.best_params_

{'extratreesclassifier__min_samples_leaf': 1,
 'extratreesclassifier__min_samples_split': 4,
 'extratreesclassifier__n_estimators': 300}

In [59]:
# Keep the file names for reporting and remove them from `test` in place, so
# that `test` can be passed to the model afterwards.
filenames = test["filename"]
del test["filename"]

In [60]:
# Predict with the fitted search object on the separate test table
# (its "filename" column has already been removed).
prediction = search_scr.predict(test)

In [61]:
# One row per test file: its name next to the predicted class.
result = pd.DataFrame({"filename": filenames, "prediction": prediction})

In [62]:
# Substring that marks a positive-class file in the test filenames.
# It is derived from `label` so the evaluation always targets the class the
# model was trained on; a hard-coded "screen" silently produces meaningless
# metrics once `label` is switched to another class.
# The recorded "screenshots" run used the shorter marker "screen", which is
# kept here as an override.
# NOTE(review): for every other label this assumes the test filenames contain
# the label text itself - verify against the files in ./images/test.
FILENAME_MARKERS = {"screenshots": "screen"}
name_to_test = FILENAME_MARKERS.get(label, label)

In [63]:
# Ground truth comes from the file name: a file is a real positive when its
# name contains `name_to_test`.
contains_target = result.filename.str.contains(name_to_test)
predicted_positive = result.prediction == 1
predicted_negative = result.prediction == 0

# Tag every row with its confusion-matrix cell.
result.loc[predicted_positive & contains_target, "metric"] = "true"        # true positive
result.loc[predicted_negative & contains_target, "metric"] = "FN"          # missed positive
result.loc[predicted_negative & ~contains_target, "metric"] = "not true"   # true negative
result.loc[predicted_positive & ~contains_target, "metric"] = "FP"         # false alarm

In [64]:
# Number of test files in each confusion-matrix cell.
result.groupby("metric").count()

Unnamed: 0_level_0,filename,prediction
metric,Unnamed: 1_level_1,Unnamed: 2_level_1
FP,7,7
not true,54,54
true,30,30


In [66]:
# Inspect the rows tagged as false positives.
result[result["metric"] == "FP"]

Unnamed: 0,filename,prediction,metric
3,screenshot_ph1.JPG,0,FP
10,screenshots_ph.JPG,0,FP
16,screenshots_ph (9).JPG,0,FP
29,screenshots_ph777.JPG,0,FP
41,screenshot_ph.JPG,0,FP
76,screenshots_ph (6).JPG,0,FP
82,screenshot_1.PNG,0,FP


## Saving the model

In [None]:
# Show the current label: it becomes part of the saved model's file name.
label

In [67]:
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
os.makedirs("./models", exist_ok=True)

# The whole fitted GridSearchCV object is persisted, named after the label.
joblib.dump(search_scr, f"./models/{label}_et.model")

['./models/screenshots_et.model']

In [20]:
from lofo import LOFOImportance, Dataset, plot_importance
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import get_scorer_names

  from tqdm.autonotebook import tqdm


In [21]:
# Shuffled, seeded 10-fold splitter used for the LOFO importance run.
cv = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [22]:
# LOFO takes a single frame holding features and target together, so build a
# copy of the feature matrix with the target column added back.
test_lofo = whole_docs.assign(**{label: y.copy()})

In [23]:
# Every column except the target is a candidate feature.
lofo_feature_names = [name for name in test_lofo.columns if name != label]
dataset = Dataset(df=test_lofo, target=label, features=lofo_feature_names)


In [24]:
%%time
# Configure the leave-one-feature-out run: the (untuned) pipeline is scored
# by accuracy on the KFold splitter defined earlier.
lofo_imp = LOFOImportance(
    dataset,
    cv=cv,
    model=full_pipeline,
    scoring="accuracy",
)

CPU times: user 6 µs, sys: 1e+03 ns, total: 7 µs
Wall time: 10 µs


In [29]:
%%time
# Compute the LOFO importances; later cells read the "feature" and
# "importance_mean" columns of the returned frame.
importance_df = lofo_imp.get_importance()

  0%|          | 0/50 [00:00<?, ?it/s]

CPU times: user 39.6 s, sys: 149 ms, total: 39.7 s
Wall time: 39.8 s


In [30]:
# NOTE(review): the saved output of this cell is an AttributeError
# (`matplotlib.cbook` has no attribute `_safe_first_finite`) followed by an
# empty figure, so the plot was never rendered. Presumably a library version
# mismatch in the kernel environment - confirm, fix the environment, re-run.
plot_importance(importance_df, figsize=(12, 20))


AttributeError: module 'matplotlib.cbook' has no attribute '_safe_first_finite'

<Figure size 1200x2000 with 0 Axes>

In [None]:
# Sort ascending by mean importance and keep the names of the first 11
# features, i.e. the ones with the lowest mean LOFO importance.
ranked_by_importance = importance_df.sort_values("importance_mean")
features_to_drop = ranked_by_importance.head(11)["feature"]

In [None]:
# Feature matrix without the selected low-importance columns.
whole_doc_less_features = whole_docs.drop(columns=list(features_to_drop))

In [None]:
%%time
# Same test_size and random_state as the full-feature split.
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(
    whole_doc_less_features,
    y,
    test_size=0.15,
    random_state=42,
)

# Same steps as the full-feature pipeline. The classifier is seeded so the
# reduced-feature result is reproducible between runs.
full_pipeline_l = make_pipeline(
    SimpleImputer(),
    MinMaxScaler(),
    ExtraTreesClassifier(random_state=42),
)

# Grid for the reduced-feature search (n_estimators covers 50..450 here).
param_grid_l = {
    'extratreesclassifier__n_estimators': range(50,500,50),
    'extratreesclassifier__min_samples_split': range(2,19,2),
    'extratreesclassifier__min_samples_leaf': range(1,13,2),
}

# One CPU is left free, but the worker count is clamped to at least 1:
# on a single-CPU machine `cpu_count() - 1` is 0, which joblib rejects.
search_scr_l = GridSearchCV(
    full_pipeline_l,
    param_grid_l,
    cv=10,
    verbose=1,
    n_jobs=max(1, multiprocessing.cpu_count() - 1),
)

search_scr_l.fit(X_train_l, y_train_l)

# Best mean cross-validated score of the reduced-feature search.
search_scr_l.best_score_

In [None]:
# Best parameters of the reduced-feature search. This cell used to show the
# full-feature search's parameters (`search_scr`), a copy-paste slip.
search_scr_l.best_params_

In [None]:
# Keep only the columns the reduced model was trained on.
test_l = test.drop(columns=features_to_drop)

prediction_l = search_scr_l.predict(test_l)

result_l = pd.DataFrame()
result_l['filename'] = filenames
# Store the reduced model's predictions. The full model's `prediction` was
# stored here before, so this table never actually evaluated `search_scr_l`.
result_l['prediction'] = prediction_l

# Substring that marks a positive-class file, derived from `label` so the
# evaluation follows the class the model was trained on. The recorded
# "screenshots" run used the shorter marker "screen".
# NOTE(review): for other labels this assumes the test filenames contain the
# label text itself - verify against the files in ./images/test.
target_marker = {"screenshots": "screen"}.get(label, label)
is_target_l = result_l.filename.str.contains(target_marker)

# Confusion-matrix cell per row. "FN" is a real positive predicted as 0 and
# "FP" is a real negative predicted as 1 (these two tags were swapped before).
result_l.loc[(result_l.prediction == 1) & is_target_l, "metric"] = "true"
result_l.loc[(result_l.prediction == 0) & is_target_l, "metric"] = "FN"
result_l.loc[(result_l.prediction == 0) & ~is_target_l, "metric"] = "not true"
result_l.loc[(result_l.prediction == 1) & ~is_target_l, "metric"] = "FP"

result_l.groupby("metric").count()