In [1]:
from pathlib import Path
from os import listdir
from os.path import isfile, join
import json
import statistics
from scipy.stats import entropy
import numpy as np
import pandas as pd


# Global variables
# Directory that get_data() reads the per-transcript JSON files from.
# The path is relative, so it is resolved against the notebook's working directory.
p_tc_path = Path('./../Data/Processed_Transcripts')

In [3]:
def get_data(transcript_number, directory=None):
    """Load the data dictionary stored for one transcript.

    The file is named ``p_tc_NNN.json``, where ``NNN`` is the transcript
    number zero-padded to at least three digits (1 -> ``p_tc_001.json``,
    42 -> ``p_tc_042.json``, 112 -> ``p_tc_112.json``).

    Args:
        transcript_number: Integer number of the transcript to load.
        directory: Folder that holds the JSON files. When omitted, the
            module-level ``p_tc_path`` is used.

    Returns:
        Whatever ``json.load`` parses from the file.

    Raises:
        FileNotFoundError: If no file exists for ``transcript_number``.
    """
    if directory is None:
        directory = p_tc_path
    file_name = Path(directory) / f'p_tc_{transcript_number:03d}.json'
    # Read explicitly as UTF-8 so accents and "ñ" survive on platforms whose
    # default text encoding is something else.
    with open(file_name, 'r', encoding='utf-8') as file:
        return json.load(file)

def calc_total(data):
    """Return the total number of words said: all per-word counts in ``data`` added up."""
    total = 0
    for count in data.values():
        total += count
    return total

def calc_filler(data, filler_words=('ah', 'eh', 'hm', 'mm', 'm', 'aa')):
    """Return how many 'filler' words were said in the video.

    Args:
        data: Mapping of word -> number of times it was said.
        filler_words: Iterable of the words that count as filler. Defaults to
            the list chosen for this project; pass another collection of
            words to count a different set.

    Returns:
        The sum of the counts of every key of ``data`` that is a filler word
        (0 when none is present).
    """
    # A set gives exact, constant-time membership tests however long the list grows.
    fillers = frozenset(filler_words)
    return sum(count for word, count in data.items() if word in fillers)

def calc_unique(data):
    """Return how many distinct words were said in the video (one per key of ``data``)."""
    distinct_words = data.keys()
    return len(distinct_words)

def calc_average(data):
    """Return the mean number of times each distinct word was used in the video.

    Raises ZeroDivisionError when ``data`` is empty.
    """
    counts = data.values()
    occurrences = sum(counts)
    distinct = len(counts)
    return occurrences / distinct

def calc_median(data):
    """Return the median of the per-word usage counts in ``data``."""
    counts = data.values()
    middle = statistics.median(counts)
    return middle

def calc_entropy(data):
    """Return the entropy, in bits (base 2), of the word distribution in ``data``."""
    # An array lets every count be turned into a probability in one vectorised step
    counts = np.array(list(data.values()))
    total = counts.sum()
    return entropy(counts / total, base=2)

def calc_ttr(data):
    """Return the type-token ratio of ``data``: distinct words divided by total words.

    Raises ZeroDivisionError when ``data`` is empty.
    """
    counts = data.values()
    types = len(counts)
    tokens = sum(counts)
    return types / tokens

In [9]:
#Load every associated json file, calculate measurements, save to pandas dataframe, and export to csv

# Number of processed transcripts to load (files p_tc_001.json ... p_tc_112.json)
NUM_TRANSCRIPTS = 112

# One dictionary of features per video; converted to a dataframe in one go afterwards
data_list = []

for video_number in range(1, NUM_TRANSCRIPTS + 1):
    word_dict = get_data(video_number)  # Load word frequency dictionary

    #Calculate results and append as a dictionary
    data_list.append({
        "Video_Number": video_number,
        "Total_Words": calc_total(word_dict),
        "Filler_Words": calc_filler(word_dict),
        "Unique_Words": calc_unique(word_dict),
        "Avg_Word_Freq": calc_average(word_dict),
        "Med_word_Freq": calc_median(word_dict),
        "Entropy": calc_entropy(word_dict),
        "TTR": calc_ttr(word_dict)
    })

#Convert to a dataframe
df = pd.DataFrame(data_list)

#Show first 5 rows to make sure things look fine
print(df.head())

#Save to csv file, in the folder that contains the processed transcripts folder.
#The path is derived from p_tc_path so input and output use the same directory
#spelling ("Data"); a separately typed "./../data" only resolves to the same
#folder on case-insensitive file systems.
df.to_csv(p_tc_path.parent / "transcript_features.csv", index=False)

#Transcript Analysis done! Now we just have features, and we need to join and clean to the hand gathered data

   Video_Number  Total_Words  Filler_Words  Unique_Words  Avg_Word_Freq  \
0             1         1077           118           296       3.638514   
1             2          790            76           273       2.893773   
2             3          527            48           209       2.521531   
3             4          933           111           306       3.049020   
4             5          615            56           214       2.873832   

   Med_word_Freq   Entropy       TTR  
0            1.0  7.017286  0.274838  
1            1.0  6.966307  0.345570  
2            1.0  6.710372  0.396584  
3            1.0  6.954690  0.327974  
4            1.0  6.628696  0.347967  


In [47]:
# Sanity-check every measurement on a single transcript.
# `t` was never defined in the notebook (it only existed in the live kernel).
# The recorded output of this cell matches row 0 (video 1) of the feature table,
# so load transcript 1 explicitly to make the cell run on a fresh kernel.
t = get_data(1)

print(calc_total(t))
print(calc_filler(t))
print(calc_unique(t))
print(calc_average(t))
print(calc_median(t))
print(calc_entropy(t))
print(calc_ttr(t))  # was calc_trr, a name that is not defined anywhere in the notebook

1077
118
296
3.6385135135135136
1.0
7.017285734407039
0.2748375116063138


In [82]:
#I am going to explore some different packages to see if a package can give me some easy calculations that look correct with a few different tests

# pyspellchecker checks whether or not a word was spelled correctly, and it has functionality in Spanish
# Idea- if a word is spelled incorrectly, it is mispronounced or not a word! 
#Let's see if it works

from spellchecker import SpellChecker

# Module-level checker shared by count_non_words(), which reads it as a global
spell = SpellChecker(language='es')  # Spanish spell checker

def count_non_words(word_list):
    """Print the words the global ``spell`` checker reports as unknown, then the
    ones it reports as known, and return how many were unknown."""
    unknown_words = spell.unknown(word_list)
    known_words = spell.known(word_list)
    print(unknown_words)
    print(known_words)
    return len(unknown_words)

transcribed_words = [*t.keys()]
non_word_count = count_non_words(transcribed_words)

print(f"Non-Spanish Words: {non_word_count}/{len(transcribed_words)}")
print(f"Error Rate: {non_word_count / len(transcribed_words) * 100:.2f}%")

# First set printed: words reported as misspelled. Many of them are in fact spelled correctly.
# Second set printed: words reported as correctly spelled. It contains "ah", which I'm pretty sure is not a word.

# Conclusion: this package does not work for my purposes.

{'voy', 'tuve', 'spanish', 'terminé', 'noah', 'galicia', 'viviendo', 'qué', 'carapate', 'dedicarme', 'palabras', 'tenía', 'pasando', 'necesita', 'mirando', 'has', 'leyendo', 'viendo', 'recordando', 'covid', 'c', 'hm', 'semanas', 'matemáticas', 'youtube', 'meses', 'estudiando', 'sé', 'trabajadores', 'todos', 'vídeo', 'podcast', 'aprendiendo', 'tiene', 'estamos', 'nios', 'prácticas', 'estaba', 'mm', 'idiomas', 'todas', 'terminado', 'loo', 'libros', 'cco', 'quiero', 'usando', 'restaurantes', 'fue', 'trabajando', 'adquiriendo', 'chao', 'vídeos', 'horas', 'iguales', 'empez', 'hace', 'eres', 'dónde', 'minutos', 'estudios', 'dreaming', 'frases', 'puede', 'informática', 'emocionado', 'sabía', 'llegué', 'tr', 'quiere', 'quisiera', 'cómo', 'haciendo', 'ar', 'periodo', 'contando', 'cosas', 'días', 'problemas', 'siento', 'quería', 'empezando', 'video', 'categorías', 'intercambiar', 'puedo', 'axiliar', 'tiendas', 'entiendo', 'use', 'gradué', 'primeras', 'muchas', 'm', 'entendiendo', 'necesito', 'hi

In [21]:
# Second attempt: use a Spanish dictionary txt file as the "truth" for whether a word is a word

with open("spanish_words.txt", "r", encoding="utf-8") as file:
    # One stripped entry per line of the file
    spanish_words = set(map(str.strip, file))

words = []
not_words = []

# Sort every transcribed word into the list it belongs to, keeping transcript order
for word in t.keys():
    (words if word in spanish_words else not_words).append(word)

print(not_words)
print(words)

# This doesn't work either: "estoy" is a word, but the file does not seem to list conjugations or plural adjectives :(
# So the plan is a manual examination of which words are filler words, and to use those for the measurement

['es', 'noah', 'esta', 'estoy', 'haciendo', 'eso', 'está', 'terminado', 'cosas', 'hace', 'hm', '5', 'meses', 'aprendiendo', 'quería', 'empezando', 'a', 'semanas', 'empez', 'estaba', 'periodo', 'necesito', 'nios', 'palabras', 'quiero', 'mis', 'frases', 'hablando', 'dónde', 'tengo', 'mm', '600', 'horas', 'categorías', 'carapate', 'empecé', 'estudiando', 'unos', 'tiene', 'prácticas', 'problemas', 'historias', 'loo', '224', '28', '2024', '7', 'estamos', 'viviendo', 'galicia', 'ella', 'axiliar', 'trabajando', 'gradué', 'tuve', 'terminé', 'sabía', 'fue', 'covid', 'podré', 'dedicarme', 'estudios', 'matemáticas', 'todos', 'todas', 'iguales', 'esty', 'usando', 'dreaming', 'spanish', 'mirando', 'viendo', 'vídeos', '92', '300', '17', 'podcast', '36', 'leyendo', 'libros', '14', 'llegué', 'pude', 'trabajadores', 'tiendas', 'restaurantes', 'preguntándome', 'traté', 'personas', 'idiomas', 'quisiera', 'necesita', 'adquiriendo', 'eres', 'has', 'siento', '55', '160', 'primeras', '120', 'contando', 'enti

In [57]:
#Idea- search from smallest to largest words. Identify the buffer words (which are between 1-2 letters)
#I'm going to join every dictionary, sort the words, and also do some overall measurements just to have

#Join all video dictionaries (transcripts 1 through 112) into one word -> total count dictionary
super_dict = {}

for transcript_number in range(1, 113):
    for w, n in get_data(transcript_number).items():
        super_dict[w] = super_dict.get(w, 0) + n

#Basic statistics of the whole spoken dataset
print(calc_total(super_dict))
print(calc_filler(super_dict))
print(calc_unique(super_dict))
print(calc_average(super_dict))
print(calc_median(super_dict))
print(calc_entropy(super_dict))
print(calc_ttr(super_dict))  # was calc_trr, a name that is not defined anywhere in the notebook

#only print keys with less than 3 characters
word_list = [w for w in super_dict if len(w) < 3]

print(word_list)

#List of filler words I'll count: ah, eh, hm, mm, m, aa

#Note: There are words that don't make sense here, like oj, fr, ch, etc. I've noticed that sometimes a word like "nos" is written as "no s".
#Some of these letters can be from situations like that, mispronunciations, me saying letters, etc.
#There is a pretty big asterisk in general that comes from the fact that the transcripts are auto generated, my accent isn't what it was prepared for,
#   and the recording situation was not the highest quality (not talking too loud many times because it is late and people in the house are asleep)
#With high powered NLP, this could be resolved after a lot of work, research, and thought. But that isn't the purpose of this project! The purpose
#   is to showcase my analysis abilities, not my NLP skills

#How many words are filler?
#calc_filler already counts exactly this list, so reuse it instead of repeating the loop
num = calc_filler(super_dict)

print("Number of Filler Words:", num)

101856
7969
6064
16.796833773087073
1.0
8.153054312296534
0.05953502984605718
['mi', 'es', 'y', 'de', 'ah', 'la', 'eh', 'hm', '5', 'o', 'yo', 'a', 'en', 'un', 'no', 'ar', 'mm', 'm', 'el', 'sí', '28', '7', 'me', '92', '17', '36', '14', 'si', 'tú', 'lo', '55', 'e', 'he', 'tr', '95', 'c', '11', '4', '6', '8', 'sé', 'ha', '29', 'sp', '90', '2', 'mí', 'se', 'in', '10', 'le', 'él', '20', 'ir', '70', '30', 'h', 'ía', 'ac', '1', 'su', 'b', 'v', 'al', '3', 'va', '31', 'pu', 'ho', 'ti', 'a3', 'h3', '16', 'd', 'f', 'g', 's', '15', 'é', '60', 'os', '25', 'oy', 'vi', '13', 'tu', 'bp', 'g3', 'n', 'l', '40', '50', '24', 'te', '80', 'so', '9', 'nu', '74', 'u', 'ya', '44', '45', 'io', 'bz', '18', 've', '12', 'oí', 'i', 'di', 'gg', 'có', 'uf', 'oj', 'ej', '2s', '19', 'c2', 'f3', '21', 'bo', 'oo', 'pé', '22', 'oh', '23', '33', 'uh', 'gr', '26', 'má', 'pr', '27', 'mo', 'yi', 't', 'qu', 'da', 'hd', 'ay', 'fr', 'on', 'aí', 'st', 'ó', '$', 'lr', 'í', 'x', 'xx', 'ni', 'cl', 'do', 'p', '35', 'r', '34', '42', '