In [12]:
import os
import pandas as pd
import numpy as np
from os.path import join as PJOIN

In [13]:
"""
Configuration parameters to change:
FEATURES_DIR - Folder containing the feature sheets (20 textual and code correlation features per comment) generated for a GitHub project
ANNOTATIONS_FILE - The extended annotations file, which also has the three added columns 'Calculated Score New', 'Matching Rules' and 'Comment Type'
PROJECT_NAME - The name of the project
AUTHOR_NAME - The (unique) annotator name; it is used in the names of the output files
OUTPUT_DIR - The directory where the output files will be generated (containing the 20 features + annotated quality label + calculated quality label)
OUTPUT_SUFFIX - Selects which label is written to the Y output file: '_cal', '_int', '_calint' or '_commentType'
INTUITIVE_INDEX - Column index of the manually annotated quality score or label (PU, U, NU) from annotators
CALCULATED_INDEX - Column index of the calculated quality score or label (PU, U, NU) based on ground truth generation rules
COMMENT_TYPE_INDEX - The column index which has triplets for multilabel classification
""" 
# --- Input locations ---------------------------------------------------------
FEATURES_DIR = "DATA/libpng_comment_probe/"
ANNOTATIONS_FILE = "DATA/GENERATED/comments_libpng_all_marked.xlsx"

# --- Project / annotator identity (both end up in the output file name) ------
PROJECT_NAME = "libpng"
AUTHOR_NAME = "dewang"

# --- Output location and naming ----------------------------------------------
OUTPUT_DIR = "DATA/GENERATED/TRAIN/"
OUTPUT_SUFFIX = "_cal"
OUTPUT_FILENAME = f"train_{PROJECT_NAME}_{AUTHOR_NAME}{OUTPUT_SUFFIX}.csv"

# --- Label encoding: first character of a quality label -> numeric class -----
ANNOTATIONS = {
    'N': 1,
    'P': 2,
    'U': 3,
    'n': 1,
}

# --- Positions of the label columns inside an annotation row -----------------
INTUITIVE_INDEX = 46
CALCULATED_INDEX = 47
COMMENT_TYPE_INDEX = 49

In [14]:
# Load the annotation sheet: a '$'-delimited text file when the path ends in
# "csv", an Excel workbook otherwise. The preview of the first rows is the
# cell's displayed output.
annotations_file = (
    pd.read_csv(ANNOTATIONS_FILE, delimiter='$')
    if ANNOTATIONS_FILE.endswith('csv')
    else pd.read_excel(ANNOTATIONS_FILE)
)
annotations_file.head()

Unnamed: 0,Filename,Comment text,Start line,End line,No. of words,Program Domain Concepts,Problem Domain Concepts,Copyright/License,Bug/Fix/Patch/Version,Build,...,C25,C26,C27,C28,C29,C30,Score,Calculated Score New,Matching Rules,Comment Type
0,repos/libpng-code/pngset.c,pngset.c - storage of image information into i...,2,17,116,"{'storage': ['storag', 'Time Complexity / Spac...","['image', 'libpng']",True,False,False,...,,,,1.0,,,U,P,2161722.0,"[1, 1, 1]"
1,repos/libpng-code/pngset.c,override with app values,62,61,4,"{'values': ['valu', 'Data-Structure and its Co...",[],False,False,False,...,,,,,,,P,P,,"[1, 0, 0]"
2,repos/libpng-code/pngset.c,FLOATING_POINT,133,133,1,{},[],False,False,False,...,,,,,,,N,N,1215.0,"[0, 0, 0]"
3,repos/libpng-code/pngset.c,cHRM,135,135,1,{},[],False,False,False,...,,,,,,,P,N,12.0,"[0, 0, 0]"
4,repos/libpng-code/pngset.c,eXIf,182,182,1,{},[],False,False,False,...,,,,,,,P,N,12.0,"[0, 0, 0]"


In [15]:
# Convert the sheet to a NumPy array and list every column position next to the
# value it holds in the first row (handy for choosing the *_INDEX constants).
annotations_np = np.array(annotations_file)
first_row = annotations_np[0]
for position in range(len(first_row)):
    print(position," : ",first_row[position])

0  :  repos/libpng-code/pngset.c
1  :  pngset.c - storage of image information into info struct
 *
 * Copyright (c) 2018 Cosmin Truta
 * Copyright (c) 1998-2018 Glenn Randers-Pehrson
 * Copyright (c) 1996-1997 Andreas Dilger
 * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 *
 * The functions here are used during reads to store data from the file
 * into the info struct, and during writes to store application data
 * into the info struct for writing into the file.  This abstracts the
 * info struct and allows us to change the structure in the future.
2  :  2
3  :  17
4  :  116
5  :  {'storage': ['storag', 'Time Complexity / Space Complexity/ Memory/ Exception'], 'struct': ['structur', 'Data-Structure and its Components'], 'group': ['group', 'Operations as part of Data structure'], 'for': ['for', 'Operations as part of Algorithms'], 'dis

In [16]:
# Group the annotation rows by file name (column 0):
#   annotations_map[<file name>] -> list of all rows belonging to that file
annotations_map = {}
for row in annotations_np:
    source_path = row[0]
    # NaN is the only value that is not equal to itself, so this prints the
    # rows whose file name is missing (they are still stored under that key).
    if source_path != source_path:
        print(row)
    annotations_map.setdefault(source_path, []).append(row)

In [17]:
"""
For a GitHub project, there will be multiple constituent C files.
A feature sheet is generated for each C file (named <c_file>_train.csv, e.g. pngimage.c_train.csv, pngwutil.c_train.csv).
In this cell, the constituent file names of the project are collected and stored in the list all_files.
""" 
# Collect, for every feature sheet of this project found in FEATURES_DIR, the
# code-file name encoded in the sheet's file name.
all_files = []

# Feature sheets are named "<...PROJECT_NAME>_<encoded path>_train.csv".
# The same suffix is used for filtering and for slicing, so a file that merely
# ends in "train.csv" (without the underscore) can no longer be mis-sliced.
train_suffix = "_train.csv"

# os.listdir returns entries in arbitrary order; sort for reproducible runs.
for file in sorted(os.listdir(FEATURES_DIR)):
    if not file.endswith(train_suffix):
        continue
    if PROJECT_NAME not in file:
        continue
    # Keep what lies between "<PROJECT_NAME>_" and the "_train.csv" suffix.
    name_start = file.find(PROJECT_NAME) + len(PROJECT_NAME) + 1
    fName = file[name_start:-len(train_suffix)]
    print(fName)
    # Every "_" is treated as a path separator (this also affects file names
    # that genuinely contain an underscore).
    fName = fName.replace("_","/")
    print(fName)
    all_files.append(fName)

contrib_libtests_pngimage.c_pngimage.c
contrib/libtests/pngimage.c/pngimage.c


In [18]:
"""
We have two lists of files : 
comments_file : keys of annotations_map [built from annotation file]
all_files : list of files obtained from FEATURES_DIR (the outputs of COMMENT_PROBE)

For each filename in comments_file, we check if it is also present in all_files. Based on this, we construct two lists - found and not found.
In order to ensure correct checking, for some projects this cell may need to be edited so that found and not_found lists are correct

This cell also builds CODENAME_TO_COMMENTSFILENAME, which is a map from name in all_files to name in annotations_map keys

This code removes any discrepancy in the relative path of a C file between the annotation sheet and the feature sheet of a GitHub project.
The relative path is made the same for the annotation sheet and the feature sheet for a project.
This needs to be checked and corrected manually for any new project.


An example :

Suppose the paths mentioned in the Annotations sheet are of the form repos/libpng-code/<path to code in libpng project>.
Suppose the code file under consideration is contrib/libtests/pngimage.c in the libpng project
Then, variable comments_file would be : repos/libpng-code/contrib/libtests/pngimage.c
On the other hand, CommentProbe would generate the train csv with the following name - libpng_contrib_libtests_pngimage.c_pngimage.c_train.csv
Therefore, the name in the all_files list (project name prefix and _train.csv suffix removed, _ replaced by /) would be - contrib/libtests/pngimage.c/pngimage.c

Then, with the following change in the loop over annotations_map.items():
    if comments_file.startswith("repos/libpng-code/"):
        bname = comments_file.split("/")[-1]
        fName = comments_file[len('repos/libpng-code/'):] + "_" + bname

bname would be pngimage.c
fName after replacement of _ to / would be - contrib/libtests/pngimage.c/pngimage.c
Note that the reason for adding filename again is because the output of smartKT is placed in a folder of same name as the filename, which causes path to the file's path in smartKT output folder to have the filename twice (the first is actually the name of folder, and the second is for the actual file)
For example - for smartKT output for libpng/contrib/libtests/pngimage.c
libpng/contrib/libtests/pngimage.c is the folder
libpng/contrib/libtests/pngimage.c/pngimage.c is the code file

The fName would now be present in all_files
The aim of path correction is to put a transformation for file paths mentioned in annotations sheet such that they become the paths as per all_files variable, which is inferred from the name of the train csv
"""
# Map every annotated file path onto the naming scheme used by all_files and
# record whether a feature sheet exists for it.
#   found / not_found            - corrected names with / without a feature sheet
#   CODENAME_TO_COMMENTSFILENAME - corrected name -> original annotation path
not_found, found = [], []
CODENAME_TO_COMMENTSFILENAME = {}

for comments_file, annos in annotations_map.items():
    print("CF:",comments_file)

    # A NaN key (row without a file name) is the only value unequal to itself.
    if comments_file != comments_file:
        continue

    # Project-specific path correction. The prefixes are mutually exclusive,
    # so a single chain selects the one rule that applies.
    if comments_file.startswith("repos/libpng-code/"):
        ## change for libpng
        bname = comments_file.split("/")[-1]
        fName = comments_file[len('repos/libpng-code/'):] + "_" + bname
    elif comments_file.startswith('repos/dealii/'):
        bname = comments_file.split("/")[-1]
        fName = comments_file[len('repos/dealii/'):] + "_" + bname
    elif comments_file.startswith("server_mariadb_winter_annotation/"):
        fName = comments_file[len("server_mariadb_winter_annotation/"):]
    elif comments_file.startswith("Stockfish-sf_11/"):
        fName = comments_file[len("Stockfish-sf_11/"):]
    else:
        # Default rule: keep whatever follows "<PROJECT_NAME>" plus one separator.
        fName = comments_file[comments_file.find(PROJECT_NAME) + len(PROJECT_NAME) +1:]

    print("CF2:",fName)

    # Same "_" -> "/" convention that is applied to the feature sheet names.
    fName = fName.replace("_","/")
    if fName.startswith('mariadb'):
        fName = fName[8:]
    CODENAME_TO_COMMENTSFILENAME[fName] = comments_file

    print(fName)
    bucket = found if fName in all_files else not_found
    bucket.append(fName)

print("========Not FOund=========")
print(not_found)
print("========FOUND=======")
print(found)

CF: repos/libpng-code/pngset.c
CF2: pngset.c_pngset.c
pngset.c/pngset.c
CF: repos/libpng-code/pngwrite.c
CF2: pngwrite.c_pngwrite.c
pngwrite.c/pngwrite.c
CF: repos/libpng-code/pngtrans.c
CF2: pngtrans.c_pngtrans.c
pngtrans.c/pngtrans.c
CF: repos/libpng-code/pngwutil.c
CF2: pngwutil.c_pngwutil.c
pngwutil.c/pngwutil.c
CF: repos/libpng-code/pngerror.c
CF2: pngerror.c_pngerror.c
pngerror.c/pngerror.c
CF: repos/libpng-code/pngwtran.c
CF2: pngwtran.c_pngwtran.c
pngwtran.c/pngwtran.c
CF: repos/libpng-code/pngrtran.c
CF2: pngrtran.c_pngrtran.c
pngrtran.c/pngrtran.c
CF: repos/libpng-code/pngtest.c
CF2: pngtest.c_pngtest.c
pngtest.c/pngtest.c
CF: repos/libpng-code/png.c
CF2: png.c_png.c
png.c/png.c
CF: repos/libpng-code/pngrutil.c
CF2: pngrutil.c_pngrutil.c
pngrutil.c/pngrutil.c
CF: repos/libpng-code/pngwio.c
CF2: pngwio.c_pngwio.c
pngwio.c/pngwio.c
CF: repos/libpng-code/pngget.c
CF2: pngget.c_pngget.c
pngget.c/pngget.c
CF: repos/libpng-code/pngread.c
CF2: pngread.c_pngread.c
pngread.c/pngread.c

In [19]:
"""
Here, we read the feature sheets of the files in found, and add the relevant information to the lists X, Y and Z.
In X, we put the feature-related information (output of COMMENT_PROBE).
In Y, we put the labels (which label depends on the output suffix - e.g. _cal means the calculated comment quality score, _int means the manually annotated comment quality score).
In Z, we put the complete annotation rows of the comments that were kept.
"""
# Build the three aligned output lists:
#   X - feature vector of each kept comment (taken from the feature sheets)
#   Y - label of each kept comment (which one depends on OUTPUT_SUFFIX)
#   Z - full annotation row of each kept comment
# cnf counts annotated comments whose text has no row in the feature sheet.
X = []
Y = []
Z = []
cnf = 0
fdone = set()

# OUTPUT_SUFFIX -> (column of the label written to Y,
#                   column that must be non-empty for the row to be kept)
label_columns = {
    '_cal': (CALCULATED_INDEX, CALCULATED_INDEX),
    '_int': (INTUITIVE_INDEX, INTUITIVE_INDEX),
    '_calint': (CALCULATED_INDEX, INTUITIVE_INDEX),
    '_commentType': (COMMENT_TYPE_INDEX, INTUITIVE_INDEX),
}
# Fail early on a misconfigured suffix instead of hitting a NameError (or
# silently reusing labels left over from an earlier run of this cell).
if OUTPUT_SUFFIX not in label_columns:
    raise ValueError(
        "Unsupported OUTPUT_SUFFIX %r; expected one of %s"
        % (OUTPUT_SUFFIX, sorted(label_columns))
    )
used_index, compared_index = label_columns[OUTPUT_SUFFIX]

# The same suffix is used for filtering and slicing the feature sheet names.
train_suffix = "_train.csv"
found_names = set(found)

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of X / Y / Z reproducible between runs and machines.
for file in sorted(os.listdir(FEATURES_DIR)):
    if not file.endswith(train_suffix):
        continue
    if PROJECT_NAME not in file:
        continue
    fName = file[file.find(PROJECT_NAME) + 1 + len(PROJECT_NAME) : -len(train_suffix)]
    fName = fName.replace("_","/")
    print("fName: ",fName)
    if fName not in found_names:
        print("LEFT: ",fName)
        continue
    print(fName)
    # Two sheet names can collapse to the same code name; use the first only.
    if fName in fdone:
        print("#########################################WTH",fName)
        continue
    fdone.add(fName)

    anno_data = annotations_map[CODENAME_TO_COMMENTSFILENAME[fName]]
    # Encoding name spelled with a plain ASCII hyphen (the original literal
    # contained an en dash).
    features_file = pd.read_csv(PJOIN(FEATURES_DIR,file),header=None,encoding="ISO-8859-1")
    features_np = np.array(features_file)

    # Column 1 holds the comment text, columns 2..21 the 20 feature values.
    # A comment text that occurs more than once keeps its last row.
    features_map = {feat[1]: feat[2:22] for feat in features_np}

    for comments_data in anno_data:
        comment_text = comments_data[1]
        if comment_text not in features_map:
            cnf += 1
            print("NF:",comment_text)
            continue
        label_used = comments_data[used_index]
        label_compared = comments_data[compared_index]
        # NaN != NaN: skip rows whose reference label is missing.
        if label_compared != label_compared:
            continue
        X.append(features_map[comment_text])
        Z.append(comments_data)
        if OUTPUT_SUFFIX == '_commentType':
            Y.append(label_used)
        else:
            # Quality labels are encoded by their first character.
            Y.append(ANNOTATIONS[label_used[0]])

fName:  contrib/libtests/pngimage.c/pngimage.c
contrib/libtests/pngimage.c/pngimage.c
NF: pngimage.c
 *
 * Copyright (c) 2015,2016 John Cunningham Bowler
 *
 * Last changed in libpng 1.6.24 [August 4, 2016]
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 *
 * Test the png_read_png and png_write_png interfaces.  Given a PNG file load it
 * using png_read_png and then write with png_write_png.  Test all possible
 * transforms.
NF: Relicate twice


In [21]:
# Sanity check: number of feature rows, number of labels, and number of
# annotated comments that had no matching row in the feature sheets.
(len(X), len(Y), cnf)

(218, 218, 2)

In [22]:
# Write the lists X, Y and Z to three separate files in OUTPUT_DIR:
#   X_<OUTPUT_FILENAME> - one comma-separated feature vector per line
#   Y_<OUTPUT_FILENAME> - one label per line
#   Z_<OUTPUT_FILENAME> - one tab-separated annotation row per record
import csv

# Make sure the target directory exists before opening the files.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# newline='' is required for files handed to csv.writer: without it, platforms
# that translate line endings write an extra '\r' per row and alter newlines
# embedded in quoted fields (Z contains multi-line comment text).
with open(PJOIN(OUTPUT_DIR,"X_"+OUTPUT_FILENAME),'w',newline='') as f:
    writer = csv.writer(f)
    writer.writerows(X)

with open(PJOIN(OUTPUT_DIR,"Y_"+OUTPUT_FILENAME),'w',newline='') as f:
    writer = csv.writer(f)
    # Each label becomes a single-column row.
    writer.writerows([y] for y in Y)

with open(PJOIN(OUTPUT_DIR,"Z_"+OUTPUT_FILENAME),'w',newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerows(Z)
