In [1]:
from pathlib import Path
from os import listdir
from os.path import isfile, join
import json

#global variables: data folders, given relative to the notebook's working directory
r_tc_path = Path('..', 'Data', 'Raw_Transcripts')        #raw transcript text files (r_tc_NNN.txt)
p_tc_path = Path('..', 'Data', 'Processed_Transcripts')  #word-count json files (p_tc_NNN.json)

In [3]:
#Functions that turn a raw transcript into a json formatted word-count dictionary

#returns the contents of the raw transcript file
def get_r_file(transcript_number):
    """Read one raw transcript and return its text.

    Parameters
    ----------
    transcript_number : int
        Number of the transcript. It is zero-padded to three digits to build
        the file name (7 -> 'r_tc_007.txt', 112 -> 'r_tc_112.txt').

    Returns
    -------
    str
        The full contents of the matching file inside ``r_tc_path``.

    Raises
    ------
    FileNotFoundError
        If there is no raw transcript file for that number.
    """
    #':03d' left-pads with zeros up to three digits (replaces the manual if/elif padding)
    name = f'r_tc_{transcript_number:03d}.txt'
    file_path = r_tc_path / name
    #very important to use utf-8 or else the accents and ~ become lost
    return file_path.read_text(encoding='utf-8')

#getting rid of time stamps
def time_stamp_removal(content):
    """Drop timestamp and bracketed lines from a transcript and return the rest.

    Parameters
    ----------
    content : str
        Whole text of a transcript file.

    Returns
    -------
    list of str
        The kept lines, in their original order, with leading and trailing
        whitespace removed.

    Notes
    -----
    A line is discarded when it
    * is empty or whitespace-only (previously an empty line raised IndexError),
    * contains ':' anywhere (treated as a timestamp, so spoken text that
      happens to contain a colon is dropped as well), or
    * starts with '[' or ends with ']' (presumably caption annotations such
      as '[Music]' -- confirm against the raw files).
    """
    kept_lines = [] #not called 'list' so the builtin stays usable
    for raw_line in content.splitlines():
        #strip first so surrounding whitespace cannot hide a '[' / ']' and does not interfere with word splitting later
        line = raw_line.strip()
        if not line:
            continue
        if ':' in line:
            continue
        if line[0] == '[' or line[-1] == ']':
            continue
        kept_lines.append(line)
    return kept_lines
            
#turning the transcript into a dictionary with a count of every word occurrence
def transcript_to_dictionary(content):
    """Count how many times each word appears in the given lines.

    Parameters
    ----------
    content : list of str
        Lines of text; each one is split on whitespace.

    Returns
    -------
    dict
        Maps every lower-cased word to its number of occurrences. Keys appear
        in the order the words were first seen.
    """
    counts = {}
    for text in content:
        #split() without arguments cuts on any run of whitespace
        for token in text.split():
            key = token.lower() #'Hola' and 'hola' are counted as the same word
            counts[key] = counts.get(key, 0) + 1
    return counts

#saves words dictionary into json file
def dictionary_to_json(words, transcript_number):
    """Save a word-count dictionary as the json file of its transcript.

    Parameters
    ----------
    words : dict
        Dictionary to serialise.
    transcript_number : int
        Number of the transcript the dictionary was made from. It is
        zero-padded to three digits to build the file name
        (7 -> 'p_tc_007.json', 112 -> 'p_tc_112.json').

    Returns
    -------
    None
        The file is written inside ``p_tc_path``; an existing file with the
        same name is overwritten.
    """
    #':03d' left-pads with zeros up to three digits (replaces the manual if/elif padding)
    name = f'p_tc_{transcript_number:03d}.json'
    file_name = p_tc_path / name
    #explicit utf-8: ensure_ascii=False writes accents and ~ as real characters, so the
    #platform default encoding must not be relied on
    with open(file_name, "w", encoding='utf-8') as outfile:
        json.dump(words, outfile, ensure_ascii=False) #ensure_ascii=False ensures us that accents and ~ are maintained

In [7]:
#code to process all transcripts: read -> drop timestamps -> count words -> save json
for i in range(112):
    transcript_number = i + 1 #files are numbered 001..112
    contents = time_stamp_removal(get_r_file(transcript_number))
    words = transcript_to_dictionary(contents)
    dictionary_to_json(words, transcript_number)

In [49]:
#DO NOT RUN AGAIN. It will not erase existing files; the cell is kept here for documentation purposes only

#make all the (empty) files to paste data from YouTube into

#there are 112 videos, we will create a file for each
for i in range(1,113):
    #':03d' left-pads with zeros up to three digits: 7 -> 'r_tc_007.txt'
    name = f'r_tc_{i:03d}.txt'
    file_path = r_tc_path / name
    if file_path.exists():
        print(f"The file {name} already exists")
    else:
        #touch() creates the empty file without leaving an open file handle behind
        file_path.touch()
        print(f"The file {name} was created")

The file r_tc_001.txt already exists
The file r_tc_002.txt already exists
The file r_tc_003.txt already exists
The file r_tc_004.txt already exists
The file r_tc_005.txt already exists
The file r_tc_006.txt already exists
The file r_tc_007.txt already exists
The file r_tc_008.txt already exists
The file r_tc_009.txt already exists
The file r_tc_010.txt already exists
The file r_tc_011.txt already exists
The file r_tc_012.txt already exists
The file r_tc_013.txt already exists
The file r_tc_014.txt already exists
The file r_tc_015.txt already exists
The file r_tc_016.txt already exists
The file r_tc_017.txt already exists
The file r_tc_018.txt already exists
The file r_tc_019.txt already exists
The file r_tc_020.txt already exists
The file r_tc_021.txt already exists
The file r_tc_022.txt already exists
The file r_tc_023.txt already exists
The file r_tc_024.txt already exists
The file r_tc_025.txt already exists
The file r_tc_026.txt already exists
The file r_tc_027.txt already exists
T