In [2]:
#made by: Eloise Bisbee 8/13/19 eloise.bisbee@tufts.edu
#this program takes in files numbered 1 through num_files, each with a .txt
#extension, as well as ground_truth_ratings.csv, and outputs clean_comedy_data.csv.

#The numbered files should contain praat output data formatted as follows:
#first line: full file path of which folder data was extracted from
#then a series of couplings:
#pause_#.mp3
#intensity
#pitch

#ground_truth_ratings should contain the ground truth ratings for each comedy
#performance formatted such that the first couple columns are:
#PerformanceId, Performance, JokeId, Joke, HumanScore, HumanScorePostJokeOnly

#the output, clean_comedy_data.csv, will contain columns:
#'PerformanceId', 'JokeId', 'Pitch','Intensity', 'HumanScore', 'HumanScorePostJokeOnly'

import re
import csv

#reading in data from praat files and storing in dictionary called all_data
#all_data maps file number -> {"performance": name taken from the path line,
#"data": {pause number: [first feature value, second feature value]}}
num_files = 24
#number of feature lines that follow each pause line
features_per_pause = 2
#first unsigned number on a line; a leading minus sign or exponent is ignored
number_pattern = re.compile(r'\d*\.?\d+')
#first run of digits on a pause line, used as the pause number
pause_number_pattern = re.compile(r'\d+')
all_data = {}
for fi in range(1, num_files + 1):
    filename = str(fi) + ".txt"
    #the with block closes the file even if the parsing below raises
    with open(filename, "r") as praat:
        raw_praat_data = praat.readlines()
    praat_data = {}
    to_add = {}
    #the performance name is the 6th "/"-separated piece of the path line
    to_add["performance"] = raw_praat_data[0].split("/")[5]
    to_add["data"] = praat_data
    #line 0 is the folder path, never a pause line, so scanning starts at 1;
    #this keeps a path that happens to begin with 'p' from being misread
    for i in range(1, len(raw_praat_data)):
        if raw_praat_data[i].startswith('p'):
            row_data = []
            for j in range(features_per_pause):
                raw = number_pattern.findall(raw_praat_data[i + j + 1])
                if raw:
                    row_data.append(float(raw[0]))
                else:
                    #no number on the feature line: record 0 as a placeholder
                    row_data.append(0)
            index = int(pause_number_pattern.findall(raw_praat_data[i])[0])
            praat_data[index] = row_data
    all_data[fi] = to_add

#print(all_data)

#reading in data from csv file and storing it in csv_data
#csv_data maps Performance -> {'PerformanceId', 'Performance', 'Jokes'} where
#'Jokes' is a list of [JokeId, HumanScore, HumanScorePostJokeOnly]
csv_data = {}
#newline='' lets the csv module handle line endings itself, so quoted fields
#containing line breaks are read correctly
with open('ground_truth_ratings.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    #the first row holds the column names
    data_cats = next(csv_reader, [])
    for row in csv_reader:
        #blank lines come back as empty rows; there is nothing to record
        if not row:
            continue
        performance = row[1]
        #look the performance up instead of comparing with the previous row,
        #so rows of one performance that are not adjacent are collected in
        #the same entry rather than replacing it, and no sentinel name is
        #needed for "no performance seen yet"
        temp = csv_data.get(performance)
        if temp is None:
            temp = {}
            csv_data[performance] = temp
            temp['PerformanceId'] = int(row[0])
            temp['Performance'] = performance
            temp['Jokes'] = []
        #scores are kept as the strings read from the file
        temp['Jokes'].append([int(row[2]), row[4], row[5]])

#not important anymore
#performance_key = {
 #   "avo": 19, "col": 16, "fla": 5, "gig7": 6, "gig12": 22,
 #   "has": 21, "hid": 23, "pas": 2, "qua": 1, "shi": 20,
 #   "ski": 18,"sur": 7
#}
#print(all_data)
#now output csv file with all appropriate data
#newline='' stops the csv module's own line endings from being translated
#again, which would put a blank line after every row on Windows
with open('clean_comedy_data.csv', mode='w', newline='') as csv_file:
    fieldnames = ['PerformanceId', 'JokeId', 'Pitch', 'Intensity',
                  'HumanScore', 'HumanScorePostJokeOnly']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for elem in all_data:
        #named performance rather than id so the builtin id() is not shadowed
        performance = all_data[elem]['performance']
        pauses = all_data[elem]['data']
        print(pauses)
        ratings = csv_data[performance]
        jokes = ratings['Jokes']
        if len(pauses) == len(jokes):
            #the i-th joke of the performance is paired with pause number i
            for i, joke in enumerate(jokes):
                #each pause holds its values in file order: intensity, pitch
                features = pauses[i]
                writer.writerow({
                    'PerformanceId': ratings['PerformanceId'],
                    'JokeId': joke[0],
                    'Pitch': features[1],
                    'Intensity': features[0],
                    'HumanScore': joke[1],
                    'HumanScorePostJokeOnly': joke[2],
                })
        else:
            #this else case is for if the raw praat data lengths and ground
            #truth lengths which should correspond to the same performance
            #are not the same length. This means something went wrong
            print("uhoh")


{0: [71.36835476740207, 0], 1: [76.23359642086136, 233.20463580365225], 10: [71.52639893001587, 0], 11: [79.93529023189294, 235.6042445661262], 12: [71.44383165326407, 0], 13: [78.61854166286629, 157.41839119532375], 14: [79.1428514574443, 221.75972887108693], 15: [75.52608397407701, 225.7887006460963], 16: [72.72988066838782, 193.45287067459486], 2: [81.60643136748646, 248.37154112686315], 3: [81.53938101421622, 232.0575901318505], 4: [72.90457429878248, 148.81163541510057], 5: [72.80929348526358, 250.15751750809915], 6: [79.949359769092, 221.17971071848976], 7: [83.24895174415263, 247.41289644513154], 8: [72.61396486873676, 217.616918961424], 9: [72.17015036432367, 214.2593759223603]}
{0: [73.87416482488139, 173.59941414648668], 1: [78.58010299806463, 87.08728930894533], 10: [68.0309531405429, 0], 11: [70.87649120932207, 144.1785469710506], 12: [70.03723302831814, 181.76642968958245], 13: [77.5001798696288, 208.15453011352042], 14: [68.34413943970858, 0], 15: [71.05049346668928, 210.