In [1]:
import copy
import csv
import os
from os.path import join as PJOIN

import numpy as np
import pandas as pd

In [2]:
"""
Notebook configuration - adjust these values before running the cells below.

DATA_DIR    : directory that holds the X, Y and Z outputs of the PrepareTrainingData notebook
DATA_FILES  : names of the output files produced by the PrepareTrainingData notebook
OUTPUT_FILE : base name of the merged final output file

Two sheets are generated:
1. Z_<OUTPUT_FILE>.csv - the file name and the comment for every row
2. <OUTPUT_FILE>.csv   - the 20 features plus the calculated comment quality score / label,
                         in the same row order as Z_<OUTPUT_FILE>.csv
"""
DATA_DIR = "DATA/GENERATED/TRAIN/"

# NOTE(review): this flag is not read by any of the cells visible in this notebook.
commentTypeClassification = False

# Earlier file selections, kept for reference:
#
# DATA_FILES = ["train_libpng_int.csv","train_dealii_int.csv","train_server_int.csv",
#              "train_curl_Deepesh_commentType.csv", "train_curl_Saket_commentType.csv",
#               "train_curl_Saloni_commentType.csv", "train_curl_Yash_commentType.csv",
#              "train_server_Saket_commentType.csv", "train_server_Shubhanan_commentType.csv",
#              "train_server_Yash_commentType.csv"]
#
# DATA_FILES = ["train_dealii_Srinidhi_int.csv", "train_libpng_Someone_int.csv", "train_server_Someone_int.csv",
#              "train_curl_Deepesh_commentType.csv", "train_curl_Saket_commentType.csv",
#               "train_curl_Saloni_commentType.csv", "train_curl_Yash_commentType.csv",
#              "train_server_Saket_commentType.csv", "train_server_Shubhanan_commentType.csv",
#              "train_server_Yash_commentType.csv"]
#
# DATA_FILES = ['train_curl_Deepesh_cal.csv', 'train_curl_Saket_cal.csv', 'train_curl_Saloni_cal.csv',
#              'train_curl_Yash_cal.csv', 'train_PLplot_Shubhanan_cal.csv', 'train_PLplot_Saloni_cal.csv',
#              'train_server_Saket_cal.csv','train_server_Yash_cal.csv','train_server_Shubhanan_cal.csv',
#              'train_dealii_Srinidhi_cal.csv','train_server_3folders_cal.csv','train_libpng-code_allmarked_cal.csv']

# Active selection.
DATA_FILES = ['train_libpng_billgates_cal.csv']
OUTPUT_FILE = "TEST_SEPARATE"

# Column titles for the merged sheet: 20 feature columns followed by the label column.
# The leading spaces in two of the titles are kept exactly as they were, because the
# strings are written verbatim into the output header row.
HEADERS = [
    "count of comment tokens",
    "software development count",
    " application specific entities count",
    " descriptive",
    "operational / conditional",
    "non DT IN words",
    "coherence inconsistent",
    "coherence redundant",
    "scope score",
    "application specific",
    "developer details",
    "junk/copyright",
    'dataset description',
    'working summary',
    'working summary - design',
    'exceptions',
    'build instructions',
    'project management',
    'construct names in comment',
    'comment placements',
    'Class',
]

In [3]:
# Shapes of every Z frame read by get_all_training_data(), accumulated across calls.
SHAPES = set()


def get_all_training_data(data_dir=None, data_files=None):
    """
    Read all data files and concatenate their contents into all_x, all_y and all_z.

    For every file name ``f`` three files are read from the data directory:
    ``X_f`` and ``Y_f`` (comma separated, no header row) and ``Z_f`` (tab
    separated, no header row).  Columns 1 and 0 of the Z file - in that order -
    are kept; in the sample output of this notebook column 1 holds the comment
    text and column 0 the source file path.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the X_/Y_/Z_ files.  Defaults to the module-level
        ``DATA_DIR``.
    data_files : list of str, optional
        File names without the ``X_``/``Y_``/``Z_`` prefix.  If the first entry
        is ``'all'``, every file in ``data_dir`` whose name starts with ``X_``
        is used, in sorted order.  Defaults to the module-level ``DATA_FILES``.

    Returns
    -------
    (all_x, all_y, all_z) : tuple of numpy arrays
        Row-wise concatenation over all files.  ``all_y`` is reshaped to one
        dimension, which requires the Y files to contain a single column.

    Raises
    ------
    ValueError
        If no data file is selected.

    Side effects
    ------------
    Prints the selected Z columns of each file and the merged X/Y shapes, and
    records each Z frame's shape in the module-level ``SHAPES`` set.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    if data_files is None:
        data_files = DATA_FILES

    if len(data_files) > 0 and data_files[0] == 'all':
        # os.listdir returns entries in arbitrary order; sort so that the merged
        # row order is reproducible from run to run.
        names = sorted(
            entry[2:] for entry in os.listdir(data_dir) if entry.startswith('X_')
        )
    else:
        names = list(data_files)

    if not names:
        raise ValueError(
            "No training data files selected: DATA_FILES is empty or no 'X_*' "
            "files were found in " + repr(data_dir)
        )

    x_parts = []
    y_parts = []
    z_parts = []

    for name in names:
        x_frame = pd.read_csv(PJOIN(data_dir, "X_" + name), header=None)
        y_frame = pd.read_csv(PJOIN(data_dir, "Y_" + name), header=None)
        z_frame = pd.read_csv(PJOIN(data_dir, "Z_" + name), header=None, delimiter='\t')

        # Swap the two Z columns so column 1 comes first.
        z_selected = z_frame[[1, 0]]
        print(z_selected)
        SHAPES.add(z_frame.shape)

        x_parts.append(np.asarray(x_frame))
        y_parts.append(np.asarray(y_frame))
        z_parts.append(np.asarray(z_selected))

    all_x = np.concatenate(x_parts)
    all_y = np.concatenate(y_parts)
    all_z = np.concatenate(z_parts)
    print(all_x.shape, all_y.shape)

    # Flatten the (n, 1) label column into a vector of length n.
    all_y = all_y.reshape(all_y.shape[0])
    return all_x, all_y, all_z

In [4]:
def process_y_commentType(y):
    """
    Convert the label data (Y) into a numpy array.

    Each element of ``y`` is a string such as "[1,0,0]": the surrounding square
    brackets are stripped, the remainder is split on commas, and every piece is
    converted with int().  One list of ints is produced per element, and the
    collected lists are returned as ``np.array``.
    """
    parsed_rows = [
        [int(piece) for piece in raw_label.strip('][').split(',')]
        for raw_label in y
    ]
    return np.array(parsed_rows)

In [5]:
# Load and merge every configured X/Y/Z file. The call also prints the selected
# Z columns of each file and the merged X/Y shapes, which is the output shown below.
train_x, train_y, train_z = get_all_training_data()

                                                     1  \
0    Define the following to use this test against ...   
1                 because png.h did *not* include this   
2    1.6.1 added support for the configure test har...   
3         READ_PNG and WRITE_PNG were not defined, so:   
4                                      SEQUENTIAL_READ   
..                                                 ...   
213                                      Not an option   
214                    abort on user or internal error   
215  Here on any return, including failures, except...   
216                           Release allocated memory   
217                                          !READ_PNG   

                                                 0  
0    repos/libpng-code/contrib/libtests/pngimage.c  
1    repos/libpng-code/contrib/libtests/pngimage.c  
2    repos/libpng-code/contrib/libtests/pngimage.c  
3    repos/libpng-code/contrib/libtests/pngimage.c  
4    repos/libpng-code/contrib/libtest

In [13]:
## Writing the merged Z output in file Z_OUTPUT_FILE
# The csv module requires files to be opened with newline='' - without it, extra
# blank rows appear on platforms that translate '\n' to '\r\n'.  The encoding is
# pinned to UTF-8 so that writing does not depend on the platform default.
z_output_path = PJOIN(DATA_DIR, "Z_" + OUTPUT_FILE + ".csv")
with open(z_output_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='\t')
    # Column order follows train_z, whose rows are built from Z columns [1, 0].
    writer.writerow(['F2', 'FILES'])
    writer.writerows(train_z)

In [14]:
## Writing the merged X and Y output in file OUTPUT_FILE
# Every feature row needs exactly one label; fail loudly rather than writing a
# truncated or partially written sheet.
if len(train_x) != len(train_y):
    raise ValueError(
        "train_x and train_y have different lengths: %d vs %d"
        % (len(train_x), len(train_y))
    )

# The csv module requires files to be opened with newline='' - without it, extra
# blank rows appear on platforms that translate '\n' to '\r\n'.  The encoding is
# pinned to UTF-8 so that writing does not depend on the platform default.
merged_output_path = PJOIN(DATA_DIR, OUTPUT_FILE + ".csv")
with open(merged_output_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(HEADERS)
    for feature_row, label in zip(train_x, train_y):
        # Append the label as the last column of the feature row.
        writer.writerow(np.append(feature_row, label))