In [1]:
import numpy as np
import pandas as pd
import pprint
from sklearn import preprocessing
import re
import pickle
import time, datetime

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_val_score

# Shared pretty-printer (4-space indent); the training loop uses it to print label lists.
pp = pprint.PrettyPrinter(indent=4)

## Ignore warnings
import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments, does nothing."""
    return None


# Replace the module-level function so code that calls ``warnings.warn(...)``
# through the module attribute emits nothing.
warnings.warn = warn
# ``np.warnings`` was only an alias of the standard-library ``warnings`` module and
# is not available in newer NumPy releases (1.24+), where the old spelling raises
# AttributeError. Calling the stdlib module directly has the same effect everywhere.
warnings.filterwarnings('ignore')

from tld.linktypes import fine_linktype_map

In [10]:
# Repositories to process. load_data() lower-cases each name to build the
# CSV file names it reads.
SOURCES = [
    'Hyperledger',
    'IntelDAOS',
    'JFrog',
    'Jira',
    'JiraEcosystem',
    'MariaDB',
    'Mindville',
    'MongoDB',
    'Qt',
    'RedHat',
    'Sakai',
    'SecondLife',
    'Sonatype',
    'Spring',
    'Apache',
    'Mojang',
]

# Subset of SOURCES, presumably kept for running these repositories separately: 'Jira', 'RedHat', 'Sonatype', 'Apache'

In [30]:
# load_data returns (issue_df, link_df): x is the Qt issue table, y the Qt link table.
# NOTE: load_data is defined in a cell further down this file; that cell must run first.
x, y = load_data('Qt')

In [31]:
# Number of rows per distinct value of the 'type' column in x.
x['type'].value_counts()

Bug               106804
Suggestion         15723
Task               12015
Technical task      4830
Sub-task            4792
User Story          3401
Epic                 793
Change Request       209
Improvement           11
Research               1
Name: type, dtype: int64

In [11]:
def load_data(source):
    """Read the processed issue and link tables of one repository.

    Parameters
    ----------
    source : str
        Repository name; it is lower-cased to build the two CSV file names.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The issue table, followed by the link table. The link table has
        duplicate rows removed and carries an added ``mappedtype`` column:
        ``linktype`` mapped through ``fine_linktype_map``.
    """
    repo_key = source.lower()
    # Options shared by both reads.
    csv_options = dict(encoding="UTF-8", low_memory=False, sep=';')

    # Issues are read as-is.
    issues = pd.read_csv('data/processed/issues_' + repo_key + '.csv', **csv_options)

    # Links use the first CSV column as index; rows with identical values are dropped.
    raw_links = pd.read_csv('data/processed/links_plus_' + repo_key + '.csv', index_col=[0], **csv_options)
    links = raw_links.drop_duplicates()

    # Link types missing from the map become NaN in 'mappedtype'.
    links['mappedtype'] = links['linktype'].map(fine_linktype_map)

    return issues, links

In [38]:
# Cache of TF-IDF feature frames keyed by repository name. The training loop looks a
# repository up here before vectorizing and stores every frame it builds.
# Kept in its own cell, presumably so re-running the loop does not clear the cache.
tfidfvect_dict = {}

In [39]:
# Per-repository results. A repository is appended to all four lists together,
# only after its three scores exist, so the lists always have equal length even
# if the loop is interrupted.
repos = []
DTs = []
RFs = []
SVCs = []

# Link types that make up less than this share of a repository's links are dropped.
MIN_LINKTYPE_SHARE = 0.01

# To run only a subset, iterate over e.g. ['Jira', 'RedHat', 'Sonatype', 'Apache'] instead.
for s in SOURCES:
    print(s.upper())
    issue_df, link_df = load_data(s)

    # Assign the filled columns back: fillna(inplace=True) on a column selection is
    # chained in-place mutation, which newer pandas warns about and which no longer
    # updates issue_df under copy-on-write.
    issue_df['title'] = issue_df['title'].fillna(' ')
    issue_df['description'] = issue_df['description'].fillna(' ')

    # Keep only issues that take part in at least one link.
    linked_issues = set(link_df['issue_id_1']).union(link_df['issue_id_2'])
    # .copy() so that adding 'text' does not write into a slice of issue_df.
    linked_issue_df = issue_df[issue_df['issue_id'].isin(linked_issues)].copy()
    linked_issue_df['text'] = linked_issue_df['title'] + " " + linked_issue_df['description']

    # Reuse the cached TF-IDF frame for this repository when there is one.
    if s in tfidfvect_dict:
        df_tfidfvect = tfidfvect_dict[s]
    else:
        tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=0.01, ngram_range=(1,2))
        tfidf_wm = tfidf_vectorizer.fit_transform(list(linked_issue_df['text'].values))
        # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
        # on versions that predate get_feature_names_out().
        if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
            tfidf_tokens = tfidf_vectorizer.get_feature_names_out()
        else:
            tfidf_tokens = tfidf_vectorizer.get_feature_names()
        # One row per linked issue, indexed by issue id; the index name is what the
        # merges below join on.
        df_tfidfvect = pd.DataFrame(
            data=tfidf_wm.toarray(),
            index=pd.Index(linked_issue_df['issue_id'], name='issue_id'),
            columns=tfidf_tokens,
        )
        tfidfvect_dict[s] = df_tfidfvect

    # Keep link types that account for at least MIN_LINKTYPE_SHARE of all links.
    # value_counts() ignores NaN, so unmapped link types are dropped here as well.
    type_counts = link_df['mappedtype'].value_counts()
    valid_types = set(type_counts[type_counts >= len(link_df) * MIN_LINKTYPE_SHARE].index)
    link_df = link_df[link_df['mappedtype'].isin(valid_types)]

    # Carry only the two join keys and the label into the feature table. The previous
    # drop() call discarded its result, so link columns such as 'issues' or 'name'
    # stayed in the frame and could collide with TF-IDF tokens of the same name.
    pair_df = link_df[['issue_id_1', 'issue_id_2', 'mappedtype']].rename(columns={'mappedtype': 'label'})

    # Prefix the token columns per side so that no token name can clash with a link
    # column or with the same token on the other side of the pair.
    all_data = pair_df.merge(df_tfidfvect.add_prefix('src_'), how='inner', left_on='issue_id_1', right_on='issue_id')
    all_data = all_data.merge(df_tfidfvect.add_prefix('tgt_'), how='inner', left_on='issue_id_2', right_on='issue_id')

    # 1-D target; a one-column DataFrame makes scikit-learn emit a conversion warning.
    y = all_data['label']
    X = all_data.drop(columns=['issue_id_1', 'issue_id_2', 'label'])

    # Sorted so the printed label list is the same on every run.
    labels = sorted(set(y))
    print("Number of unique labels: "+str(len(labels)))
    pp.pprint(labels)

    # Mean macro-F1 over 5-fold cross-validation for each baseline classifier.
    clf = DecisionTreeClassifier(random_state=0, class_weight='balanced')
    dt_f1 = np.mean(cross_val_score(clf, X, y, cv=5, scoring='f1_macro'))
    print(dt_f1)

    clf = RandomForestClassifier(random_state=0, class_weight='balanced')
    rf_f1 = np.mean(cross_val_score(clf, X, y, cv=5, scoring='f1_macro'))
    print(rf_f1)

    clf = LinearSVC(random_state=0, tol=1e-5, class_weight='balanced')
    svc_f1 = np.mean(cross_val_score(clf, X, y, cv=5, scoring='f1_macro'))
    print(svc_f1)

    # Record the repository only now that all three scores are available.
    repos.append(s)
    DTs.append(dt_f1)
    RFs.append(rf_f1)
    SVCs.append(svc_f1)

HYPERLEDGER
Number of unique labels: 7
['Relate', 'Clone', 'Non-Link', 'Epic', 'Subtask', 'Block', 'Duplicate']
0.22763241060649037
0.29273612862012277
0.27551289168331805
INTELDAOS
Number of unique labels: 8
[   'Relate',
    'Clone',
    'Subtask',
    'Non-Link',
    'Block',
    'Duplicate',
    'finish-start',
    'Verify']
0.344674709392614
0.352268218658219
0.3700802280450356
JFROG
Number of unique labels: 7
[   'Relate',
    'Depend',
    'Non-Link',
    'Subtask',
    'Trigger',
    'Incorporate',
    'Duplicate']
0.2601961274996564
0.25636893949322576
0.257806468175614
JIRA
Number of unique labels: 8
[   'Relate',
    'Detail',
    'Cause',
    'Non-Link',
    'Subtask',
    'Incorporate',
    'Duplicate',
    'Clone']
0.2687384019713899
0.2852229432116216
0.26909365372649
JIRAECOSYSTEM
Number of unique labels: 11
[   'Relate',
    'Split',
    'Depend',
    'Cause',
    'Subtask',
    'Non-Link',
    'Epic',
    'Incorporate',
    'Duplicate',
    'Block',
    'Clone']
0.151

In [None]:
# Collect the per-repository scores into one table.
# If the training loop was interrupted, `repos` can hold one more entry than the
# score lists, and pd.DataFrame would raise on columns of unequal length. Only
# repositories with all three scores are therefore included.
n_complete = min(len(repos), len(DTs), len(RFs), len(SVCs))

baseline_dict = {
            'Repository': repos[:n_complete],
            'Decision Tree' : DTs[:n_complete],
            'Random Forest': RFs[:n_complete],
            'Linear SVM': SVCs[:n_complete],
          }

baseline_df = pd.DataFrame(baseline_dict)   
baseline_df

In [18]:
# Collect the per-repository scores into one table.
# If the training loop was interrupted, `repos` can hold one more entry than the
# score lists, and pd.DataFrame would raise on columns of unequal length. Only
# repositories with all three scores are therefore included.
n_complete = min(len(repos), len(DTs), len(RFs), len(SVCs))

baseline_dict = {
            'Repository': repos[:n_complete],
            'Decision Tree' : DTs[:n_complete],
            'Random Forest': RFs[:n_complete],
            'Linear SVM': SVCs[:n_complete],
          }

baseline_df = pd.DataFrame(baseline_dict)   
baseline_df

Unnamed: 0,Repository,Decision Tree,Random Forest,Linear SVM
0,Hyperledger,0.23678,0.298068,0.272101
1,IntelDAOS,0.361208,0.365818,0.348201
2,JFrog,0.261597,0.250166,0.266638
3,Jira,,,
4,JiraEcosystem,0.157573,0.173077,0.170041
5,MariaDB,0.257631,0.267917,0.329494
6,Mindville,0.213849,0.163511,0.199524
7,MongoDB,0.27282,0.314926,0.248385
8,Qt,0.233372,0.267493,0.28574
9,RedHat,,,
