# Prepare the DataFrame

In [1]:
import os
import sys

# Resolve the directory two levels above the current working directory and
# make sure it is on sys.path (presumably so the project package imported in
# the next cell can be found -- confirm).
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Echo the resolved path as the cell output.
module_path

'/home/tjanicek/thesis/thoth-issue-predictor'

In [2]:
import graphviz
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from thoth_issue_predictor.utils.grid_search import grid_search
from catboost import Pool, CatBoostClassifier, cv

from thoth_issue_predictor.utils.preprocessing import (
    prepare_df,
    create_python_version_packege_df,
)

In [3]:
# Path of the dataset archive, relative to the notebook's working directory.
FILE_NAME = "../datasets/dataset.zip"
# Load the archive through the project helper; the result is indexed by
# column name below, so it is presumably a pandas DataFrame -- confirm.
inspections_df = prepare_df(FILE_NAME)
# Keep the raw exit-code column; the next cell attaches it to the feature
# frame as the label.
exit_codes = inspections_df["exit_code"]

In [4]:
# Build the feature frame. The helper returns three values; the second one is
# not needed in this notebook.
(
    issues_df,
    _,
    python_indexes,
) = create_python_version_packege_df(inspections_df=inspections_df)

# Read the label straight from inspections_df rather than from the
# notebook-level `exit_codes` variable: that name is rebound further down to
# the filtered labels, so re-running this cell afterwards would align against
# a reduced index and silently introduce NaN labels.
# Assumes issues_df shares its row index with inspections_df -- TODO confirm.
issues_df["exit_code"] = inspections_df["exit_code"].astype("int")

# Keep only rows with at least one non-zero feature value, then remove exact
# duplicate rows (label included).
feature_columns = issues_df.columns != "exit_code"
has_any_feature = (issues_df.loc[:, feature_columns] != 0).any(axis=1)
issues_df = issues_df.loc[has_any_feature].drop_duplicates()

In [5]:
# Show the index names returned by create_python_version_packege_df.
# NOTE(review): presumably the position in this list is the integer code
# stored in the *_index columns -- confirm against the helper.
python_indexes

['unknown', 'pypi-org-simple', 'pypi-org', 'aicoe', 'pypi']

In [6]:
# Report how many rows remain after the filtering and de-duplication above,
# then preview the first five rows.
print(f"Length of DF is {len(issues_df)}")
issues_df.head(5)

Length of DF is 1155


Unnamed: 0,python,tensorflow_major,tensorflow_minor,tensorflow_patch,tensorflow_index,flatbuffers_major,flatbuffers_minor,flatbuffers_patch,flatbuffers_index,six_major,...,keras_index,pyyaml_major,pyyaml_minor,pyyaml_patch,pyyaml_index,tensorflow-cpu_major,tensorflow-cpu_minor,tensorflow-cpu_patch,tensorflow-cpu_index,exit_code
0,38,2,4,1,1,1,12,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,38,2,4,1,1,1,12,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,38,2,4,1,1,1,12,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,36,2,1,0,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,38,2,4,1,1,1,12,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Print a summary (index range, column count, dtypes, memory usage) of the frame.
issues_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1155 entries, 0 to 1200
Columns: 210 entries, python to exit_code
dtypes: int64(210)
memory usage: 1.9 MB


# Failed inspections

In [8]:
# Select the rows whose exit code is 1, report how many there are and preview
# the first few.
is_failed = issues_df["exit_code"] == 1
failed_inspections = issues_df[is_failed]
print(f"Number of failed inspections: {len(failed_inspections)}")
failed_inspections.head()

Number of failed inspections: 102


Unnamed: 0,python,tensorflow_major,tensorflow_minor,tensorflow_patch,tensorflow_index,flatbuffers_major,flatbuffers_minor,flatbuffers_patch,flatbuffers_index,six_major,...,keras_index,pyyaml_major,pyyaml_minor,pyyaml_patch,pyyaml_index,tensorflow-cpu_major,tensorflow-cpu_minor,tensorflow-cpu_patch,tensorflow-cpu_index,exit_code
1097,36,2,1,0,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1098,36,2,1,0,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1099,36,2,1,0,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1100,36,2,1,0,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1101,36,2,1,0,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [9]:
# Separate the feature matrix from the target column.
# Note: `exit_codes` previously held the unfiltered column taken from
# inspections_df; from here on it holds the labels aligned with issue_x.
issue_x = issues_df.drop(columns="exit_code")
exit_codes = issues_df["exit_code"]

In [10]:
# Hold out a third of the rows for evaluation with a fixed seed.
# The split is stratified on the label because failed inspections are a small
# minority of the rows; an unstratified split can leave the train and test
# partitions with noticeably different failure rates.
X_train, X_test, y_train, y_test = train_test_split(
    issue_x,
    exit_codes,
    test_size=0.33,
    random_state=42,
    stratify=exit_codes,
)

# Decision Trees

In [11]:
# Hyper-parameter search is currently switched off; uncomment the next line to
# run it.
# best_params = grid_search(issue_x, exit_codes)
# An empty dict means the classifier below is built with its default settings.
best_params = {}

In [12]:
# Pin the tree's random state so repeated runs give the same scores and the
# same exported tree. Entries in best_params (including an explicit
# random_state) still take precedence over this default.
clf = DecisionTreeClassifier(**{"random_state": 42, **best_params})
# 10-fold cross-validated scores for this estimator on the full dataset.
cross_val_score(clf, issue_x, exit_codes, cv=10)

array([1.        , 1.        , 1.        , 1.        , 0.99137931,
       1.        , 0.99130435, 1.        , 0.99130435, 0.99130435])

In [13]:
# Fit the tree on the complete dataset; this fitted estimator is the one
# exported to graphviz below.
clf.fit(issue_x, exit_codes)

DecisionTreeClassifier()

In [14]:
# Pass plain lists to export_graphviz: some scikit-learn releases validate
# feature_names strictly and reject a pandas Index.
feature_names = list(issue_x.columns)
# Display names for the two classes, listed in ascending label order (0, 1).
target_names = ["successful", "failed"]
dot_data = export_graphviz(
    clf,
    out_file=None,  # return the DOT source as a string instead of writing it
    feature_names=feature_names,
    class_names=target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
# Render to trees/ThothIssuePredictor.pdf; the returned path is the cell output.
graph.render("trees/ThothIssuePredictor", format="pdf")

'trees/ThothIssuePredictor.pdf'

## CatBoost Decision tree model

In [15]:
# Columns ending in "_index" are handed to CatBoost as categorical features
# (presumably they hold index identifiers rather than quantities -- confirm).
# The names are read from issue_x itself, not from a variable created in the
# plotting cell, and are matched by suffix so that a package whose name merely
# contains "_index" does not have its version columns treated as categorical.
cat_features = [column for column in issue_x.columns if column.endswith("_index")]

# One pool for cross-validation on all rows, plus the train/test pair.
issue_pool = Pool(issue_x, exit_codes, cat_features=cat_features)
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

In [16]:
# CatBoost settings; the same dict is passed to cv() further down.
params = dict(
    iterations=1000,
    loss_function="Logloss",
    eval_metric="AUC",
    verbose=False,
    max_depth=4,
    use_best_model=True,
)
# NOTE(review): test_pool doubles as the eval_set that use_best_model relies
# on, so scores measured on test_pool afterwards are optimistic -- consider a
# separate validation split.
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=test_pool)

<catboost.core.CatBoostClassifier at 0x7fbfff04a9d0>

In [17]:
# plot_tree returns a graphviz Digraph for the tree with index 5.
dot_data = model.plot_tree(5, pool=train_pool)
# graphviz.Source expects DOT text, so hand it the Digraph's source string
# rather than the Digraph object itself (the latter only works on graphviz
# releases that stringify the source implicitly).
graph = graphviz.Source(dot_data.source)
# Render to trees/ThothIssuePredictorBoost.pdf; the path is the cell output.
graph.render("trees/ThothIssuePredictorBoost", format="pdf")

'trees/ThothIssuePredictorBoost.pdf'

In [18]:
# Cross-validate the same parameter set on the full pool using two folds.
scores = cv(issue_pool, params, fold_count=2)

In [19]:
# Show the per-iteration cross-validation metrics (mean and std across folds).
scores

Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.954960,0.007816,0.623425,0.001968,0.622717,0.000376
1,1,0.974542,0.004537,0.565738,0.012062,0.565798,0.009917
2,2,0.982789,0.008002,0.512249,0.009236,0.511424,0.007609
3,3,0.985406,0.006432,0.468349,0.018789,0.467764,0.017121
4,4,0.987592,0.009734,0.421707,0.017859,0.421007,0.016913
...,...,...,...,...,...,...,...
995,995,0.995266,0.006695,0.028982,0.031662,0.000451,0.000385
996,996,0.995266,0.006695,0.028979,0.031666,0.000451,0.000384
997,997,0.995266,0.006695,0.028981,0.031664,0.000451,0.000384
998,998,0.995266,0.006695,0.028978,0.031668,0.000450,0.000384
