In [6]:
import pandas as pd
import numpy as np
import math
import os
import time
import re

In [27]:
class Tarantula(object):
    """Tarantula spectrum-based fault localization over coverage-matrix CSV files.

    Each coverage matrix is a DataFrame whose rows are test cases. Every column
    except the last is treated as one statement's coverage flag (a value of 1
    means the test executed the statement); the column named ``r`` holds the
    test outcome. As in ``calculateOrigFailAndPass``, a truthy ``r`` is counted
    as a passing test and a falsy ``r`` as a failing test.

    NOTE(review): the original comment on ``__F`` described it as "failing test
    cases", which reads as the opposite polarity - confirm against the data.

    Suspiciousness of a statement ``s`` is

        failRatio(s) / (failRatio(s) + passRatio(s))

    where ``failRatio(s)`` is the share of failing tests that executed ``s``
    and ``passRatio(s)`` the share of passing tests that executed ``s``.
    Statements executed by no test at all get the sentinel value -1.
    """

    def __init__(self, directory):
        """Create an analyzer for the coverage CSV files found in ``directory``."""
        self.__M = []  # coverage matrix (a DataFrame once loaded)
        self.__MT = []  # transposed matrix plus 'Suspiciousness' and 'Rank' columns
        self.__F = []  # per-test outcome flags taken from the 'r' column
        self.__C = []  # most recently inspected statement column
        self.__numOrigTests = 0
        self.__numStmts = 0
        self.__totalOrigFail = 0
        self.__totalOrigPass = 0
        self.__passOnStmt = []  # per statement: passing tests that executed it
        self.__failOnStmt = []  # per statement: failing tests that executed it
        self.__passRatio = []
        self.__failRatio = []
        self.__suspiciousness = []
        self.__directory = directory  # folder holding the coverage matrix files
        self.__ranking = []
        self.__fileName = []
        self.__rankData = []
        self.__results = pd.DataFrame()

    def main(self, name):
        """Analyze every file in the directory and write ``<name>.csv``.

        Files are processed in sorted name order. For each regular file the
        suspiciousness and rank of every statement are computed, and the two
        result columns are labelled with the source file name
        (``Suspiciousness_<file>``, ``Rank_<file>``) so that any number of
        files can be combined without column-name collisions. The combined
        table keeps the statements common to all files and is rebuilt on every
        call.

        Parameters
        ----------
        name : str
            Base name (without extension) of the CSV file written to the
            current working directory.
        """
        # os.listdir returns entries in arbitrary order; sort for reproducible output.
        entries = sorted(os.listdir(self.__directory))
        per_file_results = []

        for i, entry in enumerate(entries):
            start = time.time()
            self.__suspiciousness = []
            self.__fileName.insert(i, entry)
            print("\n ========================= {0} The Result ========================= \n".format(entry))
            path = os.path.join(self.__directory, entry)
            if os.path.isfile(path):
                self.__M = pd.read_csv(path, sep=',', index_col=0)
                # record the matrix dimensions
                self.TarantulaSuspiciousnessCalculation(self.__M)

                # count passing and failing tests from the outcome column
                self.setF(self.__M['r'])

                # main computation
                self.mainCompute()

                scores = self.__MT.loc[:, ['Suspiciousness', 'Rank']]
                per_file_results.append(scores.add_suffix("_{0}".format(entry)))

            end = time.time()
            print("The time comsume {0} s".format(round((end-start), 4)))

        if per_file_results:
            # Align on statement label; 'inner' keeps only statements present in every file.
            self.__results = pd.concat(per_file_results, axis=1, join="inner")
        else:
            self.__results = pd.DataFrame()
        self.__results.to_csv("{0}.csv".format(name))

    def TarantulaSuspiciousnessCalculation(self, M):
        """Store coverage matrix ``M`` and record its dimensions.

        The number of tests is the row count; the number of statements is the
        column count minus one (the outcome column).
        """
        self.__M = M
        self.__numOrigTests = self.__M.shape[0]
        self.__numStmts = self.__M.shape[1] - 1

    def setF(self, F):
        """Store the per-test outcome flags and recount passing/failing tests.

        ``F`` may be a pandas Series (as passed by ``main``) or any iterable.
        """
        self.__F = F.tolist() if hasattr(F, "tolist") else list(F)
        self.calculateOrigFailAndPass()

    def mainCompute(self):
        """Run the full pipeline: counts, ratios, suspiciousness, ranking."""
        self.calculatePassOnStmtAndFailOnStmt()
        self.calculatePassRatioAndFailRatio()
        self.calculateSuspiciousness()
        self.calculateRanking()

    def calculatePassOnStmtAndFailOnStmt(self):
        """Count, per statement, the passing and failing tests that executed it.

        The lists are rebuilt from scratch so that results from a previously
        analyzed matrix cannot leak into the current one.
        """
        outcomes = [bool(flag) for flag in self.__F]
        pass_counts = []
        fail_counts = []
        for i in range(self.__numStmts):
            self.__C = self.__M.iloc[:, i]
            executed = self.__C.tolist()
            # A test contributes only if it executed the statement (flag == 1).
            pass_counts.append(sum(1 for hit, ok in zip(executed, outcomes) if hit == 1 and ok))
            fail_counts.append(sum(1 for hit, ok in zip(executed, outcomes) if hit == 1 and not ok))
        self.__passOnStmt = pass_counts
        self.__failOnStmt = fail_counts

    def calculatePassRatioAndFailRatio(self):
        """Turn the per-statement counts into ratios of the pass/fail totals.

        A ratio is 0 when its own total is 0, which avoids dividing by zero
        when a matrix contains no passing tests or no failing tests.
        """
        total_pass = self.__totalOrigPass
        total_fail = self.__totalOrigFail
        self.__passRatio = [
            (self.__passOnStmt[i] / total_pass) if total_pass != 0 else 0
            for i in range(self.__numStmts)
        ]
        self.__failRatio = [
            (self.__failOnStmt[i] / total_fail) if total_fail != 0 else 0
            for i in range(self.__numStmts)
        ]

    def calculateSuspiciousness(self):
        """Compute the suspiciousness of every statement, rounded to 2 decimals.

        A statement executed by no test (both ratios are 0) gets -1. Otherwise
        the score is failRatio / (failRatio + passRatio), so a statement
        executed only by failing tests scores 1.0 and one executed only by
        passing tests scores 0.0.
        """
        scores = []
        for i in range(self.__numStmts):
            fail_ratio = self.__failRatio[i]
            pass_ratio = self.__passRatio[i]
            if fail_ratio == 0 and pass_ratio == 0:
                scores.append(-1)
            else:
                scores.append(round(fail_ratio / (fail_ratio + pass_ratio), 2))
        self.__suspiciousness = scores

    def calculateOrigFailAndPass(self):
        """Count passing (truthy flag) and failing (falsy flag) tests.

        Totals are reset first so repeated calls do not accumulate.
        """
        flags = self.__F[:self.__numOrigTests]
        self.__totalOrigPass = sum(1 for flag in flags if flag)
        self.__totalOrigFail = len(flags) - self.__totalOrigPass

    def calculateRanking(self):
        """Build the per-statement table with 'Suspiciousness' and 'Rank'.

        Rank 1 is the most suspicious statement; ties share the lowest rank
        (``method="min"``). The outcome row ``r`` is given NaN and dropped, and
        the table is sorted by rank.
        """
        labels = list(self.__M.columns)
        scores = list(self.__suspiciousness)[:self.__numStmts]
        # Pad so the non-statement column(s) line up with the labels and get NaN.
        scores.extend([np.nan] * (len(labels) - len(scores)))

        self.__suspiciousness = pd.Series(scores, index=labels)
        self.__ranking = self.__suspiciousness.rank(ascending=False, method="min")

        table = self.__M.T.copy()
        table["Suspiciousness"] = self.__suspiciousness
        table['Rank'] = self.__ranking
        self.__MT = table.sort_values("Rank").drop('r')

    def getExamScore(self, s):
        """Return the EXAM score (in percent) of statement ``s``.

        Computed as ``round((rank - 1) / numStmts, 2) * 100``, i.e. the share
        of statements ranked ahead of ``s``. Requires a completed ranking.
        """
        rank = self.__MT['Rank'].loc[s]

        examScore = round((rank - 1) / self.__numStmts, 2) * 100

        return examScore

In [29]:
if __name__ == "__main__":

    # Folders holding the coverage-matrix CSV files, one per experiment.
    # NOTE(review): these are absolute, machine-specific paths - the notebook
    # only runs where this exact directory layout exists.
    rootDirSort100 = r"C:\Users\Chicago Lam\jupyter\sort_csv\sort_100"
    rootDirSort300 = r"C:\Users\Chicago Lam\jupyter\sort_csv\sort_300"
    rootDirSortMultiple = r"C:\Users\Chicago Lam\jupyter\sort_csv\sort_100_multiple_bugs"
    rootDirEncrypt100 = r"C:\Users\Chicago Lam\jupyter\encrypt_csv\encrypt_100"
    rootDirEncrypt300 = r"C:\Users\Chicago Lam\jupyter\encrypt_csv\encrypt_300"
    rootDirEncryptMultiple = r"C:\Users\Chicago Lam\jupyter\encrypt_csv\encrypt_100_multiple_bugs"

    # One analyzer per experiment folder (construction only stores the path).
    sort100 = Tarantula(rootDirSort100)
    sort300 = Tarantula(rootDirSort300)
    sortMultiple = Tarantula(rootDirSortMultiple)
    encrypt100 = Tarantula(rootDirEncrypt100)
    encrypt300 = Tarantula(rootDirEncrypt300)
    encryptMultiple = Tarantula(rootDirEncryptMultiple)

    # Run the experiments in their original order; each writes "<label>.csv".
    experiments = (
        (sort100, "sort100"),
        (sort300, "sort300"),
        (sortMultiple, "sortMultiple"),
        (encrypt100, "encrypt100"),
        (encrypt300, "encrypt300"),
        (encryptMultiple, "encryptMultiple"),
    )
    for analyzer, label in experiments:
        analyzer.main(label)
    

SyntaxError: EOL while scanning string literal (<ipython-input-29-98b0c8423ab6>, line 27)