# Notebook error process 

### filter out error notebooks with uninteresting exception types and error notebooks not using any of the top ML libraries

### 1 mark only the ones using the selected libraries -> is_MLnb


### 1.1 nbs from github

In [1]:
import pandas as pd
import util
import config

# Load the processed GitHub error table and add the "is_MLnb" flag, which marks
# rows whose notebook uses one of the selected ML libraries.
github_errors_xlsx = config.path_github_error_process + '/nberror_g_all_p.xlsx'
df2_err = pd.read_excel(github_errors_xlsx)
# util.lib_alias_isML is defined outside this notebook; presumably it returns a
# boolean for each lib_alias value -- TODO confirm.
df2_err["is_MLnb"] = df2_err["lib_alias"].apply(util.lib_alias_isML)

In [7]:
# Write the GitHub table back to the workbook it was loaded from, without the
# DataFrame index.
github_errors_xlsx = config.path_github_error_process + '/nberror_g_all_p.xlsx'
df2_err.to_excel(github_errors_xlsx, index=False, engine="xlsxwriter")

In [5]:
# Share of distinct error notebooks (identified by fname) flagged as ML notebooks.
github_nb_flags = df2_err[["fname", "is_MLnb"]].drop_duplicates()
github_ml_share = sum(github_nb_flags.is_MLnb) / df2_err.fname.nunique()
print("{0:.2%} of all the python GitHub notebooks(containing errors) use the selected ML libraries".format(github_ml_share))

80.32% of all the python GitHub notebooks(containing errors) use the selected ML libraries


### 1.2 nbs from kaggle

In [1]:
import pandas as pd
import util
import config

# Load the processed Kaggle error table and add the "is_MLnb" flag, which marks
# rows whose notebook uses one of the selected ML libraries.
kaggle_errors_xlsx = config.path_kaggle_error_process + '/nberror_k_p.xlsx'
df_err = pd.read_excel(kaggle_errors_xlsx)
# util.lib_alias_isML is defined outside this notebook; presumably it returns a
# boolean for each lib_alias value -- TODO confirm.
df_err["is_MLnb"] = df_err["lib_alias"].apply(util.lib_alias_isML)

In [3]:
# Share of distinct error notebooks (identified by fname) flagged as ML notebooks.
kaggle_nb_flags = df_err[["fname", "is_MLnb"]].drop_duplicates()
kaggle_ml_share = sum(kaggle_nb_flags.is_MLnb) / df_err.fname.nunique()
print("There are {0:.2%} of all error Kaggle notebooks using the selected ML libraries".format(kaggle_ml_share))

There are 93.45% of all error Kaggle notebooks using the selected ML libraries


In [4]:
# Write the Kaggle table back to the workbook it was loaded from, without the
# DataFrame index.
kaggle_errors_xlsx = config.path_kaggle_error_process + '/nberror_k_p.xlsx'
df_err.to_excel(kaggle_errors_xlsx, index=False, engine="xlsxwriter")

In [5]:
# Show the Kaggle error table as the cell output; the recorded output below is
# a truncated preview (first and last rows only).
df_err

Unnamed: 0,fname,ename,evalue,traceback,ename_mapped,imports,lib_alias,is_MLnb
0,aaronalbrecht_hardness-contest.ipynb,valueerror,The feature names should match those that were...,['\x1b[1;31m----------------------------------...,valueerror,"{('', 'numpy', 'np'), ('sklearn.model_selectio...","[['numpy', 'np'], ['sklearn', 'cross_val_score...",True
1,aaryaamoharir_resnet-50-my-version.ipynb,keyboardinterrupt,,['\x1b[0;31m----------------------------------...,keyboardinterrupt,"{('tensorflow.keras.applications.resnet50', 'R...","[['tensorflow', 'ResNet50'], ['numpy', 'np'], ...",True
2,aaryaamoharir_resnet-50-version-2.ipynb,keyboardinterrupt,,['\x1b[0;31m----------------------------------...,keyboardinterrupt,"{('tensorflow.keras.applications.resnet50', 'R...","[['tensorflow', 'ResNet50'], ['numpy', 'np'], ...",True
3,achintyabhat_activation-maximization.ipynb,typeerror,'AxesSubplot' object is not subscriptable,['\x1b[0;31m----------------------------------...,typeerror,{('tf_keras_vis.activation_maximization.input_...,"[['tf_keras_vis', 'Jitter,'], ['numpy', 'np'],...",True
4,adityabajaj03_dr-cnn.ipynb,keyerror,'val_categorical_accuracy',['\x1b[0;31m----------------------------------...,keyerror,"{('', 'warnings', ''), ('glob', 'glob', ''), (...","[['warnings', 'warnings'], ['glob', 'glob'], [...",True
...,...,...,...,...,...,...,...,...
6771,yousifadel_notebook4e3ed6988b.ipynb,valueerror,"in user code:\n\n File ""/opt/conda/lib/pyth...",['\x1b[0;31m----------------------------------...,valueerror,"{('', 'warnings', ''), ('sklearn.naive_bayes',...","[['warnings', 'warnings'], ['sklearn', 'Multin...",True
6772,yyogita_exception-handling.ipynb,valueerror,invalid literal for int() with base 10: 'R',['\x1b[0;31m----------------------------------...,valueerror,set(),[],False
6773,yyogita_exception-handling.ipynb,typeerror,unsupported operand type(s) for +: 'int' and '...,['\x1b[0;31m----------------------------------...,typeerror,set(),[],False
6774,zalyildirim_kurs-proje1.ipynb,syntaxerror,"invalid syntax (247201396.py, line 1)","['\x1b[0;36m File \x1b[0;32m""/tmp/ipykernel_2...",syntaxerror,"{('', 'numpy', 'np'), ('', 'os', ''), ('', 'pa...","[['numpy', 'np'], ['os', 'os'], ['pandas', 'pd']]",True


### 2 mark the ones with uninteresting exception types -> is_relevant

### 2.1 First, let us gather all the exception types that appear in either error dataset and select a list of uninteresting ones:

https://liuonline-my.sharepoint.com/:x:/r/personal/yirwa29_liu_se/_layouts/15/Doc.aspx?sourcedoc=%7B55EB4974-57AF-46DF-A27E-83FEBAB67B69%7D&file=nberror_exception_types.xlsx&action=default&mobileredirect=true

Once the uninteresting exception types have been selected, write them to **config.builtin_exps_excluded**.

In [1]:
import pandas as pd
import util
import config

# Reload the processed error tables: df2_err holds the GitHub notebooks and
# df_err the Kaggle notebooks.
github_xlsx = config.path_github_error_process+'/nberror_g_all_p.xlsx'
kaggle_xlsx = config.path_kaggle_error_process + '/nberror_k_p.xlsx'
df2_err = pd.read_excel(github_xlsx)
df_err = pd.read_excel(kaggle_xlsx)

In [23]:
# Distinct (ename, ename_mapped) pairs seen in each dataset, ordered by mapped name.
type_cols = ["ename", "ename_mapped"]
df_tmp1 = df_err[type_cols].drop_duplicates().sort_values(by=['ename_mapped'])
df_tmp2 = df2_err[type_cols].drop_duplicates().sort_values(by=['ename_mapped'])

# Outer-join on the raw exception name so that types seen in only one dataset
# are kept. The Kaggle mapping (suffix _x) wins; the GitHub mapping (suffix _y)
# fills in where the Kaggle side has none.
df_tmp3 = (
    df_tmp1.merge(df_tmp2, on="ename", how="outer")
    .assign(ename_mapped=lambda merged: merged['ename_mapped_x'].fillna(merged['ename_mapped_y']))
    .loc[:, type_cols]
)

# Export the combined list to a spreadsheet.
df_tmp3.to_excel(config.path_default + '/nberror_exception_types.xlsx', index=False, engine="xlsxwriter")

### 2.2 process nbs from kaggle and github

In [29]:
# A row is relevant unless its mapped exception type is on the exclusion list
# held in config.builtin_exps_excluded.
excluded_types = config.builtin_exps_excluded
for errors in (df_err, df2_err):
    errors["is_relevant"] = ~errors["ename_mapped"].isin(excluded_types)

In [32]:
# Fraction of error rows in each dataset whose exception type is relevant.
kaggle_relevant_share = sum(df_err.is_relevant) / len(df_err)
print("There are {0:.2%} of all errors in Kaggle notebooks have relevant exception types".format(kaggle_relevant_share))
github_relevant_share = sum(df2_err.is_relevant) / len(df2_err)
print("There are {0:.2%} of all errors in GitHub notebooks have relevant exception types".format(github_relevant_share))

There are 52.91% of all errors in Kaggle notebooks have relevant exception types
There are 57.26% of all errors in GitHub notebooks have relevant exception types


In [33]:
# Write both tables (Kaggle first, then GitHub) back to the workbooks they were
# loaded from, without the DataFrame index.
for frame, target_xlsx in (
    (df_err, config.path_kaggle_error_process + '/nberror_k_p.xlsx'),
    (df2_err, config.path_github_error_process+'/nberror_g_all_p.xlsx'),
):
    frame.to_excel(target_xlsx, index=False, engine="xlsxwriter")

### 3 Statistics after filtering out all errors that are not from ML notebooks or whose exception types are not relevant

In [None]:
import pandas as pd
import util
import config

# Reload the processed error tables: df2_err holds the GitHub notebooks and
# df_err the Kaggle notebooks. The following cells filter on the "is_MLnb" and
# "is_relevant" columns of these tables.
df2_err = pd.read_excel(config.path_github_error_process+'/nberror_g_all_p.xlsx')
# Typo fix: `pd.read_ezxcel` does not exist and raised AttributeError, so this
# cell could not run; the intended call is `pd.read_excel`.
df_err = pd.read_excel(config.path_kaggle_error_process + '/nberror_k_p.xlsx')

In [41]:
# Keep only error rows that are both from an ML notebook and have a relevant
# exception type.
github_keep = df2_err["is_MLnb"] & df2_err["is_relevant"]
df2_err_filtered = df2_err[github_keep]
kaggle_keep = df_err["is_MLnb"] & df_err["is_relevant"]
df_err_filtered = df_err[kaggle_keep]

In [43]:
# For each dataset: share of error rows surviving the filter, the absolute row
# count, and the number of distinct notebooks those rows come from.
kaggle_kept = len(df_err_filtered)
print("There are {0:.2%}({1}, {2} notebooks) of all errors in Kaggle notebooks remains".format(kaggle_kept/len(df_err), kaggle_kept, df_err_filtered.fname.nunique()))
github_kept = len(df2_err_filtered)
print("There are {0:.2%}({1}, {2} notebooks)  of all errors in GitHub notebooks remains".format(github_kept/len(df2_err), github_kept, df2_err_filtered.fname.nunique()))

There are 47.00%(3185, 2268 notebooks) of all errors in Kaggle notebooks remains
There are 41.55%(73659, 50665 notebooks)  of all errors in GitHub notebooks remains


In [44]:
# Show the column labels of the Kaggle error table as the cell output.
df_err.columns

Index(['fname', 'ename', 'evalue', 'traceback', 'ename_mapped', 'imports',
       'lib_alias', 'is_MLnb', 'is_relevant'],
      dtype='object')