In [1]:
import os
import xml.etree.ElementTree as elemTree
import json

In [2]:
def get_br(gp):
    path = f"{fpath}/{gp}"
    brs = os.listdir(path)
    return brs

# BR 정보, SF (commit)정보
def openxml(xmldir):
    #print(xmldir)
    tree = elemTree.parse(xmldir)
    root = tree.getroot()
    assert len(root) == 2
    bug = root[0]
    commit = root[1]
    return bug,commit

def getbugtext(bug):
    summary = bug.find('summary').text
    description = bug.find('description').text
    return summary, description

def getbuginfo(bug):
    bid = bug.attrib['id']
    author = bug.attrib['author']
    openT = bug.attrib['open_date']
    closeT = bug.attrib['closed_time']
    return bid, author, openT, closeT

def getCtext(comment):
    return comment.text

def getCinfo(comment):
    Cid = int(comment.attrib['id'])
    Cauthor = comment.attrib['author']
    Cdate = comment.attrib['date']
    return Cid, Cauthor, Cdate

# return dictionary
def getCdict(bug):
    comments = list(bug.find('comments'))
    Cdict = {int(C.attrib['id']):{} for C in comments}
    for C in comments:
        Ctext = getCtext(C)
        Cid, Cauthor, Cdate = getCinfo(C)
        Cdict[Cid]['text'] = Ctext
        Cdict[Cid]['author'] = Cauthor
        Cdict[Cid]['date'] = Cdate
    return Cdict


In [3]:
# commits
def get_dmm(commit):
    dmm = commit.find('dmm_unit')
    NLOC = dmm.attrib['size']
    CCN = dmm.attrib['complexity']
    NPRAM = dmm.attrib['interfacing']
    return NLOC, CCN, NPRAM

def get_commitinfo(commit):
    commitID = commit.attrib['id']
    committer=commit.attrib['author']
    commitDate=commit.attrib['date']
    ccomplexity = commit.find('dmm_unit')
    dmm = commit.find('dmm_unit')
    cNLOC = dmm.attrib['size']
    cCCN = dmm.attrib['complexity']
    cNPRAM = dmm.attrib['interfacing']
    return commitID, committer, commitDate, cNLOC, cCCN, cNPRAM
    
def get_files(commit):
    files = [x for x in list(commit) if x.tag!='dmm_unit' ]
    return files

#fixed files
def get_fileinfo(file):
    type_=file.attrib['change_type']
    Ofname=file.attrib['old_name']
    Nfname=file.attrib['new_name']
    fcomplexity = file.find('file_info')
    fNLOC = fcomplexity.attrib['nloc']
    fCCN = fcomplexity.attrib['complexity']
    fNTOKEN = fcomplexity.attrib['token_count']
    # modified method 수
    children = [x.tag for x in list(file)]
    children = [x for x in children if x=='method']
    Nmm = len(children)
    return type_, Ofname, Nfname, fNLOC,fCCN,fNTOKEN, Nmm

#method 아닌거
def get_Nmethodinfo(file):
    aline=file.find('added_lines').text
    dline=file.find('deleted_lines').text

# method
def get_methodinfo(method):
    mName = method.attrib['name']
    mParams = method.attrib['parameters']
    aline = method.find('added_lines').text
    dline = method.find('deleted_lines').text
    return mName, mParams, aline, dline

def get_methodattribs(method):
    infos = method.find('method_info')
    nloc = infos.attrib['nloc']
    complexity=infos.attrib['complexity']
    toks=infos.attrib['token_count']
    nlevel=infos.attrib['nesting_level']
    sline=infos.attrib['start_line']
    eline = infos.attrib['end_line']
    return nloc, complexity, toks, nlevel, sline, eline

In [4]:
fpath = "./Generator/data/final_dataset/full/"
gps = os.listdir(fpath)
print(len(gps),'projects')

BRs = get_br(gps[0])
print(len(BRs), BRs[0])

147 projects
3 229.xml


In [6]:
# Complex version
for GroupProject in gps:
    BRs = get_br(GroupProject)
    print(f'[{gps.index(GroupProject)}/{len(gps)}]',GroupProject, len(BRs))
    for BR in BRs:
        print(GroupProject, BR)
        finaljson = {'BR':{},'commit':{}}
        
        xmldir = f"{fpath}/{GroupProject}/{BR}"
        bug,commit = openxml(xmldir)
        # BR
        summary, description = getbugtext(bug)
        bid, author, openT, closeT = getbuginfo(bug)
        finaljson['BR']={'BR_id':bid,
                         'BR_author':author,
                         'BRopenT':openT,
                         'BRcloseT':closeT,
                         'BR_text':{'BRsummary':summary,
                                    'BRdescription':description
                                   },
                         'comments':{}
                        }

        #Comment
        comments = list(bug.find('comments'))
        cdict = {}
        for i,C in enumerate(comments):
            Ctext = getCtext(C)
            Cid, Cauthor, Cdate = getCinfo(C)

            key = f'comments_{i}'
            dict_ = {'comment_id':Cid,
                     'comment_author':Cauthor,
                     'commentT':Cdate,
                     'comment_text':Ctext
                    }
            
            cdict[key] = dict_
        finaljson['BR']['comments'] = cdict
            
              
        # commit
        commitID, committer, commitDate, cNLOC, cCCN, cNPRAM = get_commitinfo(commit)
        finaljson['commit']={'commit_id':commitID,
                             'commit_author':committer,
                             'commitT':commitDate,
                             'commit_complexity':{'commit_NLOC':cNLOC,
                                                  'commit_CCN':cCCN,
                                                  'commit_Nprams':cNPRAM
                                                 },
                             'changed_files':{}
                            }

        
        #file별로
        files = get_files(commit)
        fdict={}
        for i,file in enumerate(files):
            type_, Ofname, Nfname,fNLOC,fCCN,fNTOKEN,Nmm = get_fileinfo(file)
            key = f'file_{i}'
            fdict[key]={'file_change_type':type_,
                        'file_Nmethod':Nmm,
                        'file_old_name':Ofname,
                        'file_new_name':Nfname,
                        'file_complexity':{'file_NLOC':fNLOC,
                                           'file_CCN':fCCN,
                                           'file_NToken':fNTOKEN
                                          }
                       }

            
            if type_=='ADD' or type_=='DELETE' or type_=='RENAME':
                continue      
            
            # MODIFIED
            fdict[key]['hunks']={}
            if Nmm==0:
                hkey = f'hunk_0'
                modified_lines = file.find('modified_lines')
                aline=modified_lines.find('added_lines').text
                dline=modified_lines.find('deleted_lines').text
                fdict[key]['hunks'][hkey]={'Ismethod':0,
                                           'added_lines':aline,
                                           'deleted_lines':dline
                                          }
            else:
                methods = file.findall('method')
                for k,method in enumerate(methods):
                    hkey=f'hunk_{k}'
                    mName, mParams, aline, dline = get_methodinfo(method)
                    nloc, complexity, toks, nlevel, sline, eline = get_methodattribs(method)
                    fdict[key]['hunks'][hkey]={'Ismethod':1,
                                               'added_lines':aline,
                                               'deleted_lines':dline,
                                               'method_info':{'method_name':mName,
                                                              'method_params':mParams,
                                                              'method_startline':sline,
                                                              'method_endline':eline,
                                                              'method_complexity':{'method_NLOC':nloc,
                                                                                   'method_CCN':complexity,
                                                                                   'method_NToken':toks,
                                                                                   'method_nesting_level':nlevel
                                                                                  }
                                                             }
                                              }
            
        finaljson['commit']['changed_files']=fdict
        
        jsonpath = f'./JSonSet/fulljson/{GroupProject}/'
        if not os.path.isdir(jsonpath):
            os.makedirs(jsonpath)
        fname = BR.replace('.xml','.json')
        jsonpath = f'./JSonSet/fulljson/{GroupProject}/{fname}'
        with open(jsonpath,'w') as jf:
            json.dump(finaljson,jf)


[0/147] 1adrianb+face-alignment 3
229.xml
241.xml
243.xml
[1/147] aleju+imgaug 2
646.xml
669.xml
[2/147] alexemg+deeplabcut 1
1018.xml
[3/147] allenai+allennlp 33
292.xml
3426.xml
3465.xml
4255.xml
4281.xml
4318.xml
4319.xml
4330.xml
4357.xml
4360.xml
4378.xml
4393.xml
4427.xml
4428.xml
4494.xml
4501.xml
4504.xml
4567.xml
4612.xml
4646.xml
4649.xml
4653.xml
4666.xml
4715.xml
4739.xml
4750.xml
4757.xml
4810.xml
4819.xml
4825.xml
4839.xml
572.xml
610.xml
[4/147] apache+incubator-mxnet 250
10011.xml
10026.xml
10037.xml
10224.xml
10235.xml
10431.xml
10438.xml
10520.xml
10738.xml
10858.xml
10866.xml
10868.xml
10901.xml
11057.xml
11060.xml
11077.xml
11084.xml
11108.xml
11115.xml
11160.xml
11241.xml
11271.xml
11331.xml
11339.xml
11352.xml
11353.xml
11411.xml
11430.xml
11504.xml
11599.xml
11639.xml
11841.xml
11849.xml
11961.xml
12024.xml
12046.xml
12061.xml
12087.xml
12179.xml
12404.xml
12431.xml
12473.xml
12524.xml
12528.xml
12613.xml
12627.xml
12662.xml
12778.xml
12783.xml
12787.xml
12868.xm

923.xml
924.xml
928.xml
929.xml
93.xml
934.xml
942.xml
948.xml
950.xml
952.xml
961.xml
983.xml
986.xml
991.xml
992.xml
994.xml
997.xml
[24/147] deepmind+dm_control 6
123.xml
149.xml
158.xml
162.xml
34.xml
51.xml
[25/147] deepset-ai+farm 44
113.xml
148.xml
193.xml
21.xml
222.xml
229.xml
235.xml
238.xml
251.xml
261.xml
272.xml
298.xml
307.xml
310.xml
312.xml
316.xml
326.xml
359.xml
362.xml
366.xml
381.xml
39.xml
423.xml
436.xml
452.xml
454.xml
457.xml
462.xml
47.xml
485.xml
49.xml
501.xml
519.xml
520.xml
525.xml
533.xml
551.xml
553.xml
558.xml
602.xml
608.xml
611.xml
70.xml
71.xml
[26/147] denizyuret+autograd.jl 4
101.xml
34.xml
80.xml
87.xml
[27/147] developmentseed+label-maker 1
138.xml
[28/147] dmlc+dgl 35
1018.xml
1036.xml
1046.xml
1081.xml
1264.xml
1288.xml
1356.xml
1409.xml
1421.xml
1563.xml
1641.xml
1754.xml
1837.xml
2087.xml
2098.xml
2118.xml
2128.xml
2157.xml
2161.xml
2166.xml
2175.xml
2409.xml
2424.xml
2473.xml
2483.xml
2484.xml
2500.xml
2528.xml
354.xml
412.xml
438.xml
538.xml

AttributeError: 'NoneType' object has no attribute 'find'

In [15]:
### simple version ######
for GroupProject in gps:
    BRs = get_br(GroupProject)
    print(f'[{gps.index(GroupProject)}/{len(gps)}]',GroupProject, len(BRs))
    for BR in BRs:
        finaljson = {'BR':{},'commit':{}}
        
        xmldir = f"{fpath}/{GroupProject}/{BR}"
        bug,commit = openxml(xmldir)
        # BR
        summary, description = getbugtext(bug)
        bid, author, openT, closeT = getbuginfo(bug)
        finaljson['BR']={'BR_id':bid,
                         'BR_author':author,
                         'BRopenT':openT,
                         'BRcloseT':closeT,
                         'BR_text':{'BRsummary':summary,
                                    'BRdescription':description
                                   },
                         'comments':{}
                        }

        #Comment
        comments = list(bug.find('comments'))
        cdict = {}
        for i,C in enumerate(comments):
            Ctext = getCtext(C)
            Cid, Cauthor, Cdate = getCinfo(C)

            key = f'comments_{i}'
            dict_ = {'comment_id':Cid,
                     'comment_author':Cauthor,
                     'commentT':Cdate,
                     'comment_text':Ctext
                    }
            
            cdict[key] = dict_
        finaljson['BR']['comments'] = cdict
            
              
        # commit
        commitID, committer, commitDate, cNLOC, cCCN, cNPRAM = get_commitinfo(commit)
        finaljson['commit']={'commit_id':commitID,
                             'commit_author':committer,
                             'commitT':commitDate,
                             'changed_files':{}
                            }

        
        #file별로
        files = get_files(commit)
        fdict={}
        for i,file in enumerate(files):
            type_, Ofname, Nfname,fNLOC,fCCN,fNTOKEN,Nmm = get_fileinfo(file)
            key = f'file_{i}'
            fdict[key]={'file_change_type':type_,
                        'file_Nmethod':Nmm,
                        'file_old_name':Ofname,
                        'file_new_name':Nfname
                       }

            
            if type_=='ADD' or type_=='DELETE' or type_=='RENAME':
                continue      
            
            # MODIFIED
            fdict[key]['hunks']={}
            if Nmm==0:
                hkey = f'hunk_0'
                modified_lines = file.find('modified_lines')
                aline=modified_lines.find('added_lines').text
                dline=modified_lines.find('deleted_lines').text
                fdict[key]['hunks'][hkey]={'Ismethod':0,
                                           'added_lines':aline,
                                           'deleted_lines':dline
                                          }
            else:
                methods = file.findall('method')
                for k,method in enumerate(methods):
                    hkey=f'hunk_{k}'
                    mName, mParams, aline, dline = get_methodinfo(method)
                    nloc, complexity, toks, nlevel, sline, eline = get_methodattribs(method)
                    fdict[key]['hunks'][hkey]={'Ismethod':1,
                                               'added_lines':aline,
                                               'deleted_lines':dline,
                                               'method_info':{'method_name':mName,
                                                              'method_params':mParams,
                                                              'method_startline':sline,
                                                              'method_endline':eline
                                                             }
                                              }
            
        finaljson['commit']['changed_files']=fdict
        
        jsonpath = f'./JSonSet/simplejson/{GroupProject}/'
        if not os.path.isdir(jsonpath):
            os.makedirs(jsonpath)
        fname = BR.replace('.xml','.json')
        jsonpath = f'./JSonSet/simplejson/{GroupProject}/{fname}'
        with open(jsonpath,'w') as jf:
            json.dump(finaljson,jf)

[0/53] allenai+allennlp 32
[1/53] alluxio+alluxio 32
[2/53] apache+incubator-mxnet 250
[3/53] apple+turicreate 196
[4/53] awslabs+gluon-ts 42
[5/53] bvlc+caffe 1
[6/53] carla-simulator+carla 20
[7/53] chainer+chainer 7
[8/53] cmu-perceptual-computing-lab+openpose 11
[9/53] cornellius-gp+gpytorch 13
[10/53] deepset-ai+farm 44
[11/53] dmlc+gluon-nlp 31
[12/53] eclipse+deeplearning4j 51
[13/53] espnet+espnet 8
[14/53] evilsocket+pwnagotchi 16
[15/53] explosion+spacy 174
[16/53] flairnlp+flair 6
[17/53] heartexlabs+label-studio 6
[18/53] horovod+horovod 48
[19/53] intel-isl+open3d 45
[20/53] jina-ai+jina 13
[21/53] keras-team+autokeras 23
[22/53] kubeflow+kubeflow 81
[23/53] microsoft+nni 50
[24/53] microsoft+onnxruntime 58
[25/53] microsoft+pai 45
[26/53] microsoft+recommenders 5
[27/53] mindsdb+mindsdb 9
[28/53] mlpack+mlpack 13
[29/53] mozilla+deepspeech 24
[30/53] nvidia+digits 15
[31/53] oneapi-src+onednn 42
[32/53] opencv+opencv 155
[33/53] openmined+pysyft 33
[34/53] openvinotoolkit