#In [4]:
#made by: Eloise Bisbee 8/13/19 eloise.bisbee@tufts.edu
#this program takes in files numbered 1-num_files with a .txt extension as
#well as ground_truth_ratings.csv and outputs clean_comedy_data_ts.csv.

#The numbered files should contain praat output data formatted as follows:
#first line: full file path of which folder data was extracted from
#then a series of couplings:
#pause_#.mp3
#time pitch intensity
#time pitch intensity
#etc. for entire length of pause_#.mp3 clip

#ground_truth_ratings should contain the ground truth ratings for each comedy
#performance formatted such that the first couple columns are:
#PerformanceId, Performance, JokeId, Joke, HumanScore, HumanScorePostJokeOnly

#the output, clean_comedy_data_ts.csv, will contain columns:
#'PerformanceId', 'JokeId', 'TimeStamp', 'Pitch', 'Intensity', 'HumanScore', 'HumanScorePostJokeOnly'
import re
import csv

#reading in data from praat files and storing in dictionary called all_data
#all_data maps file number -> {"performance": name taken from the folder
#path on the file's first line, "data": {pause index: [[values], ...]}}
num_files = 24
all_data = {}
#matches an unsigned decimal number; a leading minus sign is not captured
#NOTE(review): a negative reading would lose its sign - confirm the praat
#output never contains one
number_pattern = re.compile(r'\d*\.?\d+')
for j in range(1, num_files + 1):
    filename = str(j) + ".txt"
    #'with' closes the file even if reading raises
    with open(filename, "r") as praat:
        raw_praat_data = praat.readlines()
    to_add = {}
    #the performance name is the 6th "/"-separated piece of the first line
    to_add["performance"] = raw_praat_data[0].split("/")[5]
    to_add["data"] = {}
    #samples seen before the first header line land in this list, which is
    #never stored
    temp = []
    for line in raw_praat_data[1:]:
        if not line.strip():
            #a blank line carries no sample; storing it would produce a
            #one-element row that breaks the column lookups done later
            continue
        if line[0] == 'p':
            #header line (pause_#.mp3): start a fresh sample list and file
            #it under the first number found in the line
            temp = []
            index = int(re.findall(r'\d+', line)[0])
            to_add['data'][index] = temp
        else:
            dat = line.split(" ")
            for k, token in enumerate(dat):
                found = number_pattern.search(token)
                #a token without any digits is stored as 0
                dat[k] = float(found.group(0)) if found else 0
            temp.append(dat)
    all_data[j] = to_add

#reading in data from csv file and storing it
#csv_data maps Performance (column 1) -> {'PerformanceId', 'Performance',
#'Jokes': [[JokeId, HumanScore, HumanScorePostJokeOnly], ...]}
csv_data = {}
#newline='' leaves line-ending handling to the csv module, so quoted fields
#that contain line breaks are read back intact
with open('ground_truth_ratings.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    #the first row holds the column names ([] if the file is empty)
    data_cats = next(csv_reader, [])
    curr = -1
    temp = {}
    for row in csv_reader:
        if not row:
            #blank lines are returned as empty rows; nothing to record
            continue
        performance_id = int(row[0])
        if curr != performance_id:
            #a change of PerformanceId starts a new entry; an entry already
            #stored under the same Performance name is replaced
            curr = performance_id
            temp = {
                'PerformanceId': performance_id,
                'Performance': row[1],
                'Jokes': [],
            }
            csv_data[row[1]] = temp
        #JokeId is converted to int; both scores stay as the strings read
        temp['Jokes'].append([int(row[2]), row[4], row[5]])
        
#two-way lookup between integer codes and short names
#(presumably PerformanceId <-> performance name - confirm; it is not
#referenced by the rest of the visible code)
performance_key = {
    19: "avo",
    16: "col",
    5: "fla",
    6: "gig7",
    22: "gig12",
    21: "has",
    23: "hid",
    2: "pas",
    1: "qua",
    20: "shi",
    18: "ski",
    7: "sur",
}
#add the reverse direction (name -> code) after the code -> name entries
performance_key.update({name: code for code, name in performance_key.items()})

#now output csv file with all appropriate data
#one output row is written per sample of every pause clip, tagged with the
#ids and scores of the joke at the same position in the ground truth list
#newline='' keeps the csv module's own line endings from being translated a
#second time, which would leave a blank line after every row on Windows
with open('clean_comedy_data_ts.csv', mode='w', newline='') as csv_file:
    fieldnames = ['PerformanceId', 'JokeId', 'TimeStamp', 'Pitch', 'Intensity', 'HumanScore', 'HumanScorePostJokeOnly']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for elem in all_data:
        #named 'performance' rather than 'id' so the builtin id() is not
        #shadowed at module level
        performance = all_data[elem]['performance']
        pauses = all_data[elem]['data']
        ratings = csv_data[performance]
        jokes = ratings['Jokes']
        if len(pauses) != len(jokes):
            #the raw praat data and the ground truth for the same
            #performance disagree on the number of entries, so something
            #went wrong upstream; report it and skip this file
            print("uhoh: file {} ({}) has {} pause clips but {} rated jokes".format(
                elem, performance, len(pauses), len(jokes)))
            continue
        for i, joke in enumerate(jokes):
            #pauses is keyed by the number parsed from each header line;
            #this lookup assumes those numbers run 0..len-1 (KeyError if not)
            for sample in pauses[i]:
                writer.writerow({
                    'PerformanceId': ratings['PerformanceId'],
                    'JokeId': joke[0],
                    'TimeStamp': sample[0],
                    'Pitch': sample[1],
                    'Intensity': sample[2],
                    'HumanScore': joke[1],
                    'HumanScorePostJokeOnly': joke[2],
                })


