In [1]:
import pandas as pd
import numpy as np
from os.path import join as PJOIN
import os

In [2]:
"""
Configuration parameters to be changed - 
PROJECT_NAME - the name of project
FILE_PATH - Path to the annotated comment sheet for the project. There will be multiple annotation sheets for a Github project.
ANNOTATION_CLASS_START - The column index number (indexing from 0) from which the annotation labels + annotated comment quality score (PU, U, NU) start 
ANNOTATION_CLASS_NUM - Number of annotation labels + annotated comment quality score (PU, U, NU)
OUTPUT_FILE_PATH - The output file corresponding to an annotation sheet, which is the concatenated version for 
                   annotation labels (calculated using ground truth generation rules) 
                   + annotated comment quality score (PU, U, NU) ( from annotators) + the annotation labels, 
                   is stored in the following path DATA/GENERATED 
"""
PREFIX = "comments"
PROJECT_NAME = "libpng"
SUFFIX = "all_marked"
FILE_PATH = "DATA/ANNOTATED/comments_libpng_all_marked.xlsx"
OUTPUT_FILE_PATH = PJOIN("DATA","GENERATED",os.path.basename(FILE_PATH))
MAP = {'U':'U', 'PU':'P', 'NU':'N'}
THRESHOLD = 10
ANNOTATION_CLASS_START = 16
ANNOTATION_CLASS_NUM = 30

In [3]:
"""
lambda functions for different rules for classification of comments to 
U - useful, PU - partially useful, NU - not useful ( which are provided by consulting with developers in the company based studies)
Input to these lambda functions is a row of annotation sheet with only the comment and annotation labels,
The rules are related to the values annotated for the labels (boolean in nature)
Along with the lambda function, the label and rule number are stored as triplets in 'rules' list
The length of c would be 31 (comment and 30 annotation labels, C1 to C30, although only 27 used in rules, rest are redundant) 
""" 
rules = []

rules.append((lambda c:(c[9] or c[11]) and not (c[1] or c[2] or c[4] or c[5]),'U',1))
rules.append((lambda c:(c[9] or c[11]) and (c[4] or c[5]),'PU',2))
rules.append((lambda c:(c[9] or c[11]) and (c[1] or c[2]), 'NU',3))
rules.append((lambda c:c[10] and not (c[1] or c[2] or c[4] or c[5]) and c[15],'U',4))
rules.append((lambda c:c[10] and not (c[1] or c[2] or c[4] or c[5]) and c[14],'PU',5))
rules.append((lambda c:c[8] and not (c[7] or c[10] or c[11]) and not (c[1] or c[2] or c[4] or c[5]) and c[15],'PU',6))
rules.append((lambda c:c[8] and not (c[7] or c[10] or c[11]) and not (c[1] or c[2] or c[4] or c[5]) and c[14],'NU',7))
rules.append((lambda c:c[10] and (c[1] or c[2]),'NU',8))
rules.append((lambda c:c[10] and  (c[4] or c[5]) and c[15],'PU',9))
rules.append((lambda c:(c[10] or c[8]) and (c[4] or c[5]) and c[14],'NU',10))
rules.append((lambda c:c[8] and  (c[1] or c[2] or c[4] or c[5]),'NU',11))
rules.append((lambda c:c[6] and c[7],'NU',12))
rules.append((lambda c:c[10] and c[9] and  (c[4] or c[5]) and c[14],'PU',13))
rules.append((lambda c:c[12],'NU',14))
rules.append((lambda c:c[1] or c[2],'NU',15))

rules.append((lambda c:c[16] or c[17],'PU',16))
rules.append((lambda c:c[18] or c[19] or c[21],'PU',17))
rules.append((lambda c:c[20],'U',18))
rules.append((lambda c:c[22],'U',19))
rules.append((lambda c:c[23] or c[24],'PU',20))
rules.append((lambda c:c[25] or c[29] or c[26],'U',21))
rules.append((lambda c:c[27] or c[28],'PU',22))
rules.append((lambda c:c[30],'NU',23))

# Rules

In [4]:
# Throughout this cell, c is one row of the annotation data: a vector of size 31,
# [comment text, C1, C2, ......., C30]
CTS0CT = 0       # legacy debugging counter; its value is never read or updated in this notebook
"""
This function takes a row (c), and applies all rules and stores the class and rule number of the rules which apply (i.e. return True)
"""
def get_label_all(c): #FOR ANALYSIS ONLY
    labels = []
    for rule in rules:
        if rule[0](c):
            labels.append([rule[1],rule[2]])
    return labels

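# Design notes on combining the first-stage ("information content") rules with the later rules:
# Information content - any Useful -> useful
#                     - only NU - Later PU or NU -> NU
#                               - Later U -> U
# Information content - at least 1 PU - Later at least 1 PU -> PU
#                                     - Later at least 1 U -> U
# NOTE(review): in the "only NU" case get_label below returns 'PU' when a later U or PU rule
# matches, which differs from these notes - confirm which behaviour is intended.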

"""
This function takes a comment row and returns only the rule numbers which apply (rule numbers returned as string)
"""
def get_matching_rules(c):
    res = []
    for i in range(23): # since there are 23 rules
        if rules[i][0](c):
            res.append(rules[i][2])
    res = [str(x) for x in res]
    return res


"""
This function applies first 15 rules, and stores the count of classes U, PU and NU in cts for rules which return True
Then, it applies remaining rules and store the counts in cts_late
Then, it follows specific conditions on old and new rule count to determine the actual class.
The rule application process is applied in two steps. 
The first 15 rules are based on a specific set of annotated labels (count of matched rules in cts)
and the next set of rules contains the remaining labels (count of matched rules in cts_late). 
Finally the last decision step is based on higher level rules which is based on the count of rules matched 
to arrive at the final quality label - U, PU, NU 
"""
def get_label(c):
    global CTS0CT
    # determining count for first 15 rules
    cts = {'U':0,'PU':0,'NU':0}
    for i in range(15):
        if rules[i][0](c):
            cts[rules[i][1]] += 1
    
    # determining count for remaining rules
    cts_late = {'U':0,'PU':0,'NU':0}
    for i in range(15, len(rules)):
        if rules[i][0](c):
            cts_late[rules[i][1]] += 1
        
    # conditions to determine the final class 
    if cts['U'] > 0:
        return 'U'
    if cts['PU'] > 0 or cts['NU'] == 0:
        if cts_late['U'] > 0: 
            return 'U'
        
        return 'PU'
    
    if cts_late['U'] > 0 or cts_late['PU'] >0:
        return 'PU'
    return 'NU'
    
    
            
    if cts['PU'] > 0:
        if cts_late['U'] > 0:
            return 'U'
        if cts_late['PU'] > 0 :
            return 'PU'
        return 'NU'
    else:
        if cts_late['U'] > 0:
            return 'PU'
        if cts_late['NU'] > 0:
            return 'NU'
        return 'NU'
        
        
def get_label_comment_classification(c):
    """
    Return the multilabel classification triplet for the comment row `c`.

    The result is a list of three 0/1 flags:
      [0] 1 when c[8] or c[9] is set
      [1] 1 when c[10] or c[11] is set
      [2] 1 when any of c[17] .. c[29] is set
    NOTE(review): the slice 17:30 leaves out c[16] and c[30] - confirm this is intended.
    """
    first_flag = 1 if (c[8] or c[9]) else 0
    second_flag = 1 if (c[10] or c[11]) else 0
    third_flag = 1 if np.any(c[17:30]) else 0
    return [first_flag, second_flag, third_flag]
    
        


In [5]:
# Load the annotation sheet: a path ending in 'csv' is parsed as a '$'-delimited csv file,
# any other path is handed to the excel reader.
exl_file = (
    pd.read_csv(FILE_PATH, delimiter='$')
    if FILE_PATH.endswith('csv')
    else pd.read_excel(FILE_PATH)
)
exl_file.head()

Unnamed: 0,Filename,Comment text,Start line,End line,No. of words,Program Domain Concepts,Problem Domain Concepts,Copyright/License,Bug/Fix/Patch/Version,Build,...,C22,C23,C24,C25,C26,C27,C28,C29,C30,Score
0,repos/libpng-code/pngset.c,pngset.c - storage of image information into i...,2,17,116,"{'storage': ['storag', 'Time Complexity / Spac...","['image', 'libpng']",True,False,False,...,,,,,,,1.0,,,U
1,repos/libpng-code/pngset.c,override with app values,62,61,4,"{'values': ['valu', 'Data-Structure and its Co...",[],False,False,False,...,,,,,,,,,,P
2,repos/libpng-code/pngset.c,FLOATING_POINT,133,133,1,{},[],False,False,False,...,,,,,,,,,,N
3,repos/libpng-code/pngset.c,cHRM,135,135,1,{},[],False,False,False,...,,,,,,,,,,P
4,repos/libpng-code/pngset.c,eXIf,182,182,1,{},[],False,False,False,...,,,,,,,,,,P


In [6]:
# Convert the sheet to a numpy array and list every field of the first row next to its
# column position (useful for checking ANNOTATION_CLASS_START against the real layout).
exl_np = np.array(exl_file)
for position, value in enumerate(exl_np[0]):
    print(position, value, sep="  :  ")

0  :  repos/libpng-code/pngset.c
1  :  pngset.c - storage of image information into info struct
 *
 * Copyright (c) 2018 Cosmin Truta
 * Copyright (c) 1998-2018 Glenn Randers-Pehrson
 * Copyright (c) 1996-1997 Andreas Dilger
 * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 *
 * The functions here are used during reads to store data from the file
 * into the info struct, and during writes to store application data
 * into the info struct for writing into the file.  This abstracts the
 * info struct and allows us to change the structure in the future.
2  :  2
3  :  17
4  :  116
5  :  {'storage': ['storag', 'Time Complexity / Space Complexity/ Memory/ Exception'], 'struct': ['structur', 'Data-Structure and its Components'], 'group': ['group', 'Operations as part of Data structure'], 'for': ['for', 'Operations as part of Algorithms'], 'dis

In [7]:
# Keep only the columns the rules need: column 1 (the comment text) followed by the
# ANNOTATION_CLASS_NUM annotation columns that start at ANNOTATION_CLASS_START.
annotation_columns = range(ANNOTATION_CLASS_START, ANNOTATION_CLASS_START + ANNOTATION_CLASS_NUM)
classes = exl_np[:, [1, *annotation_columns]]
classes[0]

array(['pngset.c - storage of image information into info struct\n *\n * Copyright (c) 2018 Cosmin Truta\n * Copyright (c) 1998-2018 Glenn Randers-Pehrson\n * Copyright (c) 1996-1997 Andreas Dilger\n * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.\n *\n * This code is released under the libpng license.\n * For conditions of distribution and use, see the disclaimer\n * and license in png.h\n *\n * The functions here are used during reads to store data from the file\n * into the info struct, and during writes to store application data\n * into the info struct for writing into the file.  This abstracts the\n * info struct and allows us to change the structure in the future.',
       nan, nan, nan, 1.0, nan, nan, nan, nan, 1.0, nan, 1.0, nan, nan,
       1.0, nan, 1.0, nan, nan, nan, nan, 1.0, nan, nan, nan, nan, nan,
       nan, 1.0, nan, nan], dtype=object)

In [8]:
'''
Cases to rectify the manually annotated sheet data
for eg. treating empty string or \n as False, treating 1 as True

Every label cell (positions 1 .. end of each row of `classes`; position 0 is the comment text)
is replaced in place by a boolean. A cell that cannot be interpreted is reported
(row index, column index, error) and left unchanged.
'''
def _annotation_to_bool(value):
    """
    Interpret one annotated cell as a boolean.

    NaN, blank / whitespace-only strings and 'false' (any letter case) give False,
    'true' (any letter case) gives True; anything else must be numeric and is True
    only when its integer part is 1.
    Raises ValueError / TypeError / OverflowError for values that cannot be interpreted.
    """
    if value != value:  # NaN is the only value that differs from itself -> empty cell
        return False
    if isinstance(value, str):
        text = value.strip().lower()
        if text in ('', 'false'):
            return False
        if text == 'true':
            return True
        value = text
    return int(float(value)) == 1


for j, c in enumerate(classes):
    # the row width is taken from the data so it follows ANNOTATION_CLASS_NUM
    for i in range(1, len(c)):
        try:
            c[i] = _annotation_to_bool(c[i])
        except Exception as e:
            # best effort: report the cell and keep its raw value
            print(j, i, e)

In [9]:
# Spot-check one row after the clean-up loop above: the label cells should now hold True/False.
classes[6]

array(['TODO: validate format of calibration name and unit name', False,
       False, True, False, False, False, False, True, False, True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False], dtype=object)

In [10]:
'''
Apply the rules to every comment row in `classes` and collect three parallel lists:
labels                         - the final class of each comment, translated through MAP
matching_rules                 - the numbers of the rules that were True, joined with ','
labels_comment_classification  - the multilabel classification triplet of each comment
'''
labels = [MAP[get_label(row)] for row in classes]
matching_rules = [','.join(get_matching_rules(row)) for row in classes]
labels_comment_classification = [get_label_comment_classification(row) for row in classes]

In [11]:
# Attach the results computed from the ground truth generation rules to the annotation sheet.
# The quality score given by the annotators is already part of the sheet; the columns below
# are added next to it, in this order.
new_columns = {
    'Calculated Score New': labels,
    'Matching Rules': matching_rules,
    'Comment Type': labels_comment_classification,
}
for column_name, column_values in new_columns.items():
    exl_file[column_name] = column_values

In [12]:
# Saving the excel data
# Changes done to annotation sheet - 3 new columns were added : 'Calculated Score New', 'Matching Rules', 'Comment Type'
# The output folder is created first so that saving does not fail on a fresh checkout.
output_dir = os.path.dirname(OUTPUT_FILE_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
# The output format follows the input: '$'-separated csv for a csv sheet, excel otherwise.
if FILE_PATH[-3:] == 'csv':
    exl_file.to_csv(OUTPUT_FILE_PATH,index=False,sep='$')
else:
    exl_file.to_excel(OUTPUT_FILE_PATH, index=False)