In [21]:
from pymongo import MongoClient

connection = MongoClient(
    host="research.cassee.dev",
    username="read-shark",
    password="msr2021shark")

db = connection.smartshark_2_1


'''
We connect to the database and run the following query. This querys selects
the issue_type and issue_type_verified of all issues for which issue_type_verified exists.

This ensures that we only consider issues for which issue_type_verified has been set
'''
issues = list(db.issue.find({'issue_type_verified': {'$exists': True }}, {'issue_type': 1, 'issue_type_verified' : 1}))

print(f"There are {len(issues)} issues")


SyntaxError: EOF while scanning triple-quoted string literal (2731736867.py, line 17)

In [12]:
import pandas as pd

'''
We transform the obtained issues into a matrix, where each row is an issue,
and the first column is the issue_type, the second the issue_type_verified
'''
rows = [[issue["issue_type"], issue["issue_type_verified"]] for issue in issues]


'''
We convert this matrix into a pandas dataframe, for easy data processing
'''
df = pd.DataFrame(rows, columns = ["issue_type", "verified"])


'''
We use the apply function to capitalize the first letter of each value 
in issue_type_verified

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html
'''
df['verified'] = df['verified'].apply(lambda val: val.capitalize())

df.head()

Unnamed: 0,issue_type,verified
0,Bug,Bug
1,Bug,Bug
2,Bug,Improvement
3,Bug,Bug
4,Bug,Improvement


In [13]:
df.issue_type.value_counts()

Bug               13865
Improvement        1219
New Feature         183
Task                117
Sub-task             90
Test                 21
Wish                 21
Technical task        1
Name: issue_type, dtype: int64

In [14]:
df.verified.value_counts()

Bug                8020
Improvement        4345
Other              1227
Documentation       774
Feature_request     549
Test                510
Refactoring          92
Name: verified, dtype: int64

In [15]:
pd.crosstab(df["issue_type"], df["verified"])

verified,Bug,Documentation,Feature_request,Improvement,Other,Refactoring,Test
issue_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bug,7974,678,53,3757,904,8,491
Improvement,39,71,278,520,226,77,8
New Feature,1,3,160,6,11,0,2
Sub-task,2,2,52,16,14,2,2
Task,2,17,2,38,47,4,7
Technical task,0,0,0,1,0,0,0
Test,0,1,0,2,18,0,0
Wish,2,2,4,5,7,1,0


In [16]:
'''
Again we use the apply function to transform a column
'''

df['verified'] = df['verified'].apply(lambda val: "New Feature" if val == "Feature_request" else val) 

In [17]:
'''
Now that we merged two categories, we can reinspect the crosstab
'''

pd.crosstab(df["issue_type"], df["verified"])

verified,Bug,Documentation,Improvement,New Feature,Other,Refactoring,Test
issue_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bug,7974,678,3757,53,904,8,491
Improvement,39,71,520,278,226,77,8
New Feature,1,3,6,160,11,0,2
Sub-task,2,2,16,52,14,2,2
Task,2,17,38,2,47,4,7
Technical task,0,0,1,0,0,0,0
Test,0,1,2,0,18,0,0
Wish,2,2,5,4,7,1,0


In [19]:
from sklearn.metrics import cohen_kappa_score

'''
We use cohen kappa from scikit learn to actually investigate the agreement between the two columns

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.cohen_kappa_score.html
'''


cohen_kappa_score(df["issue_type"], df["verified"])

0.17356791456542675

In [20]:
'''
We can also investigate what would happen if we only consider the few large labels.

However, this does not influence agreement much, and we would have to justify our 
choice for only selection these values
'''

cohen_kappa_score(df["issue_type"], df["verified"], labels=["Bug", "Improvement", "New Feature", "Test"])

0.17356791456542675