# MLCQ Dataset Preprocessing

In [1]:
import numpy as np 
import pandas as pd
import os

def sevrity_to_numeric(severity):
    """Map an MLCQ severity label to its ordinal score.

    Args:
        severity: One of 'none', 'minor', 'major' or 'critical'.

    Returns:
        int: 0 for 'none', 1 for 'minor', 2 for 'major', 3 for 'critical'.

    Raises:
        ValueError: If ``severity`` is not one of the four known labels.
            ValueError is a subclass of Exception, so handlers written as
            ``except Exception`` keep working.
    """
    scores = {'none': 0, 'minor': 1, 'major': 2, 'critical': 3}
    try:
        return scores[severity]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs (e.g. a list), which cannot be
        # looked up in the table at all; they are unknown labels too.
        raise ValueError(f'unknown severity: {severity!r}') from None
def combine_severities(df, sample):
    """Collect every severity value recorded for one sample.

    Args:
        df: DataFrame with ``sample_id`` and ``severity`` columns.
        sample: The ``sample_id`` value whose rows are selected.

    Returns:
        list: The ``severity`` values of the matching rows, in row order
        (an empty list when no row matches).
    """
    matches = df["sample_id"] == sample
    return list(df.loc[matches, "severity"].array)

In [2]:
# Load every review, ordered by sample so reviews of the same sample are adjacent.
mlcq = pd.read_csv('mlcq.csv').sort_values(by="sample_id")
samples = mlcq.drop_duplicates("sample_id").sample_id
# Keep only the columns shared by all reviews of a sample (everything except severity).
mlcq = mlcq.drop(columns=["reviewer_id", "review_timestamp"])
mlcq["severity"] = mlcq["severity"].map(sevrity_to_numeric)
# A single groupby pass gathers the per-sample severity lists. Filtering the
# whole frame once per sample gives the same lists but rescans every row for
# each sample. The result is a Series indexed by sample_id.
severities = mlcq.groupby("sample_id", sort=False)["severity"].agg(list)

# Collapse to one row per sample (the first review row) and attach its list.
mlcq = mlcq.drop(columns=["severity"]).drop_duplicates(["sample_id"])
mlcq["severities"] = mlcq["sample_id"].map(severities)
mlcq["avg"] = mlcq["severities"].map(np.mean)
mlcq

Unnamed: 0,id,sample_id,smell,type,code_name,repository,commit_hash,path,start_line,end_line,link,is_from_industry_relevant_project,severities,avg
1364,1902,3698323,data class,class,com.amazon.ask.dispatcher.request.handler.impl...,git@github.com:alexa/alexa-skills-kit-sdk-for-...,bf1e9ccc50d1f3f8408f887f70197ee288fd4bd9,/ask-sdk-core/src/com/amazon/ask/dispatcher/re...,26,59,https://github.com/alexa/alexa-skills-kit-sdk-...,1.0,"[0, 0]",0.000000
1357,1895,3698602,feature envy,function,com.amazon.ask.request.mapper.impl.BaseRequest...,git@github.com:alexa/alexa-skills-kit-sdk-for-...,bf1e9ccc50d1f3f8408f887f70197ee288fd4bd9,/ask-sdk-runtime/src/com/amazon/ask/request/ma...,79,81,https://github.com/alexa/alexa-skills-kit-sdk-...,1.0,"[0, 0]",0.000000
1359,1897,3698665,feature envy,function,com.amazon.ask.builder.impl.AbstractSkillBuild...,git@github.com:alexa/alexa-skills-kit-sdk-for-...,bf1e9ccc50d1f3f8408f887f70197ee288fd4bd9,/ask-sdk-runtime/src/com/amazon/ask/builder/im...,91,94,https://github.com/alexa/alexa-skills-kit-sdk-...,1.0,"[0, 0]",0.000000
1355,1893,3698860,long method,function,com.alibaba.android.arouter.facade.model.Route...,git@github.com:alibaba/ARouter.git,93b328569bbdbf75e4aa87f0ecf48c69600591b2,/arouter-annotation/src/main/java/com/alibaba/...,200,213,https://github.com/alibaba/ARouter/blob/93b328...,1.0,"[0, 0]",0.000000
1362,1900,3699227,long method,function,com.alibaba.android.arouter.facade.enums.#getC...,git@github.com:alibaba/ARouter.git,93b328569bbdbf75e4aa87f0ecf48c69600591b2,/arouter-annotation/src/main/java/com/alibaba/...,32,34,https://github.com/alibaba/ARouter/blob/93b328...,1.0,"[0, 0]",0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7110,7689,9540354,blob,class,org.apache.hadoop.yarn.server.nodemanager.cont...,git@github.com:apache/hadoop.git,128dd91e10080bdcbcd7d555fa3c4105e55a6b51,/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-s...,24,39,https://github.com/apache/hadoop/blob/128dd91e...,1.0,"[0, 0]",0.000000
10206,10808,9542671,data class,class,org.apache.hadoop.yarn.webapp.hamlet2.HamletSp...,git@github.com:apache/hadoop.git,128dd91e10080bdcbcd7d555fa3c4105e55a6b51,/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-c...,1772,1820,https://github.com/apache/hadoop/blob/128dd91e...,1.0,"[0, 0, 0, 2, 0, 0]",0.333333
4781,5352,9542718,feature envy,function,org.apache.hadoop.mapred.NotRunningJob#killTas...,git@github.com:apache/hadoop.git,128dd91e10080bdcbcd7d555fa3c4105e55a6b51,/hadoop-mapreduce-project/hadoop-mapreduce-cli...,213,219,https://github.com/apache/hadoop/blob/128dd91e...,1.0,"[0, 0]",0.000000
9135,9733,9548458,blob,class,org.apache.hadoop.metrics2.sink.ganglia.Gangli...,git@github.com:apache/hadoop.git,128dd91e10080bdcbcd7d555fa3c4105e55a6b51,/hadoop-common-project/hadoop-common/src/main/...,46,253,https://github.com/apache/hadoop/blob/128dd91e...,1.0,"[0, 0]",0.000000


## Long-method (LM) smell ratio by industry relevance

In [3]:
def print_ratio(lnum, ldom, msg=''):
    """Print ``len(lnum)`` as a percentage of ``len(ldom)``.

    The line has the form ``<msg><pct>% (<len(lnum)>/<len(ldom)>)`` with the
    percentage shown to one decimal place. When ``ldom`` is empty the
    percentage is reported as ``n/a``.

    Args:
        lnum: Sized collection counted as the numerator.
        ldom: Sized collection counted as the denominator.
        msg: Text printed in front of the percentage.
    """
    num, dom = len(lnum), len(ldom)
    if dom == 0:
        # No ratio is defined for an empty denominator; say so rather than
        # aborting the cell with ZeroDivisionError.
        print(f'{msg}n/a ({num}/{dom})')
        return
    # Fixed-point spec: a type-less '2.3' spec behaves like 'g' and switches
    # to exponent notation at three integer digits, printing 100% as '1e+02%'.
    print(f'{msg}{num / dom * 100:.1f}% ({num}/{dom})')

In [4]:
# Keep only the long-method samples and drop the columns that are no longer needed.
long_method_mask = mlcq["smell"] == "long method"
data = mlcq[long_method_mask].drop(columns=["id", "sample_id", "smell", "type"])
print_ratio(data[data["avg"] > 0], data, 'long methods : ')

# Same ratio, split by the industry-relevance level of the source project.
relevance = data["is_from_industry_relevant_project"]
ir, semi_ir, non_ir = (data[relevance == level] for level in (1.0, 0.5, 0.0))
for subset, label in ((ir, "IR=1.0 : "), (semi_ir, "IR=0.5 : "), (non_ir, "IR=0.0 : ")):
    print_ratio(subset[subset["avg"] > 0], subset, label)


long methods : 19.5% (245/1258)
IR=1.0 : 19.5% (213/1093)
IR=0.5 : 20.3% (16/79)
IR=0.0 : 18.6% (16/86)


In [5]:
# Relabel the rows 0..n-1 so that row labels coincide with row positions.
data.index = list(range(len(data)))

In [6]:
def raw_url(url):
    """Convert a GitHub ``/blob/`` page link into the matching raw-content URL.

    Args:
        url: Link of the form
            ``https://github.com/<owner>/<repo>/blob/<ref>/<path>``, optionally
            followed by a line fragment such as ``/#L10-L20`` or ``#L10``.

    Returns:
        str: The same location on ``raw.githubusercontent.com`` with the
        fragment removed.
    """
    # Strip the fragment first, both the '/#...' form and a bare '#...'.
    page = url.split('/#')[0].split('#')[0]
    # count=1: rewrite only the host and the '/blob/' marker that follows the
    # repository name. A directory literally named 'blob' deeper in the file
    # path must survive, otherwise the raw URL points at a different file.
    return (
        page
        .replace('//github.com/', '//raw.githubusercontent.com/', 1)
        .replace('/blob/', '/', 1)
    )

In [7]:

import wget

# Every download is written with out="data"; make sure that target exists as
# a directory before the first call.
# NOTE(review): assumes "data" is meant to be a directory — confirm.
os.makedirs("data", exist_ok=True)

# files[i] is the name returned by wget for row i, or "ERROR" when the
# download failed; errors collects the positions of the failed rows.
files = []
errors = []
for i, link in enumerate(data["link"]):
    try:
        fname = wget.download(raw_url(link), out="data")
    except Exception:
        # Best effort: record the failure and carry on with the next link.
        # Catching Exception (not a bare except) lets Ctrl-C interrupt the loop.
        print(i)
        errors.append(i)
        files.append("ERROR")
    else:
        files.append(fname)
# Assign the list positionally; wrapping it in a Series would align on index
# labels and only works when data's index is exactly 0..n-1.
data["file"] = files
data.head()

80
83
419
427
452
600
601
861
955
956
957


NameError: name 'fnames' is not defined

In [16]:
# Write the frame, including its "file" column, to data.csv.
data.to_csv("data.csv")

In [26]:
# Reload the saved frame; index_col=0 uses the first CSV column as the row index.
dt = pd.read_csv("data.csv", index_col=0)

In [28]:
# Remove the rows whose "file" is the "ERROR" marker, printing the row count
# before and after the filter.
print(len(dt))
dt = dt.loc[dt["file"] != "ERROR"]
print(len(dt))

1258
1247


In [29]:
# Write the filtered frame to no_errors.csv (to_csv also writes the row index
# as the first column).
dt.to_csv("no_errors.csv")

In [3]:
# Resume point: re-import so this cell also runs on a fresh kernel.
import pandas as pd
import numpy as np
# no_errors.csv was written with its row index as the first column; read it
# back as the index so it does not turn into a stray "Unnamed: 0" column.
dt = pd.read_csv("no_errors.csv", index_col=0)

In [25]:
# Number of rows in the reloaded frame.
len(dt)

1247

In [22]:
# Load the PMD report, discard the descriptive columns and rename the four
# that remain.
pmd = pd.read_csv("pmd.csv").drop(columns=["Priority", "Description", "Rule set", "Rule"])
pmd.columns = ['Problem', 'Package', 'file', 'Line']
# Normalise each path: cut its first 8 characters and switch backslashes to
# forward slashes.
# NOTE(review): the 8 presumably matches a fixed path prefix in the PMD
# export — confirm against pmd.csv.
pmd["file"] = pmd["file"].map(lambda f: f[8:].replace('\\', '/'))
pmd

Unnamed: 0,Problem,Package,file,Line
0,1,org.apache.royale.compiler.clients,data/ASC.java,756
1,2,org.apache.royale.compiler.clients,data/ASC.java,1291
2,1,org.apache.tajo.org.objectweb.asm.util,data/ASMifier.java,822
3,1,org.apache.maven.plugins.javadoc,data/AbstractJavadocMojo.java,1951
4,2,org.apache.maven.plugins.javadoc,data/AbstractJavadocMojo.java,3622
...,...,...,...,...
489,1,com.sun.org.apache.xerces.internal.impl.xs,data/XMLSchemaLoader.java,759
490,2,com.sun.org.apache.xerces.internal.impl.xs,data/XMLSchemaLoader.java,1000
491,1,org.apache.ode.bpel.elang.xquery10.compiler,data/XQuery10ExpressionCompilerImpl.java,147
492,1,org.eclipse.xtext.xbase.validation,data/XbaseValidator.java,630


In [31]:
# Attach PMD findings to the samples by file name; how="left" keeps samples
# whose file has no PMD row.
joined = dt.merge(pmd, on="file", how="left")
# Percentage of joined rows where the PMD line equals the sample's start line.
# NOTE(review): a sample whose file has several PMD rows appears several times
# in `joined`, so the denominator counts joined rows, not samples — confirm
# this is the intended metric.
same_start = joined["start_line"] == joined["Line"]
len(joined[same_start]) / len(joined) * 100

0.6422607578676942