In [9]:
import glob
import pickle
import operator
from collections import Counter
import os

#tokenization
def tokenize(inp):
    """Split raw text into lowercase word tokens with punctuation removed.

    Each character in ``punc_marks`` is deleted outright (not replaced by a
    space, so ``"don't"`` becomes ``"dont"``). The text is then lowercased
    and split on runs of whitespace, and the literal token ``"eof"`` is
    dropped.

    Args:
        inp: The text to tokenize.

    Returns:
        A list of token strings in their original order; an empty list when
        ``inp`` contains no tokens.
    """
    #remove punctuation marks
    punc_marks = [".", "?", "!", ",", "'", ":", ";", "-", "[", "]", "{", "}", "—", "…", "(", ")"]

    # A single translate() pass deletes every punctuation mark at once
    # instead of rescanning the string once per mark.
    delete_punct = str.maketrans("", "", "".join(punc_marks))
    cleaned = inp.translate(delete_punct).lower()

    # split() with no argument separates on any run of whitespace, so stray
    # carriage returns, form feeds, etc. can no longer stick to a token and
    # make "word\r" count separately from "word". It also never yields
    # empty strings, so no blank-token filtering is needed.
    return [word for word in cleaned.split() if word != "eof"]

#lemmatization 
def lemmatize(tokens, lemma_dict):
    """Replace each token with its lemma, keeping unknown tokens as they are.

    Args:
        tokens: Iterable of word tokens.
        lemma_dict: Lookup from a word form to its lemma.

    Returns:
        A new list, in the same order as ``tokens``, where every token found
        in ``lemma_dict`` is swapped for its mapped value.
    """
    return [
        lemma_dict[token] if token in lemma_dict else token
        for token in tokens
    ]

#calculating the frequency 
def freq_simple(tokens):
    """Count how often each token occurs.

    Args:
        tokens: Iterable of hashable tokens.

    Returns:
        A ``Counter`` mapping each distinct token to its number of
        occurrences.
    """
    tally = Counter()
    tally.update(tokens)
    return tally

def process_transcript_files(dir_name, lemma_dict):
    """Accumulate lemma frequencies over every ``*.txt`` file in a directory.

    Each file is read line by line; every line is tokenized, lemmatized with
    ``lemma_dict``, and added to a running total. Bytes that cannot be
    decoded are skipped (``errors="ignore"``).

    Args:
        dir_name: Directory searched (non-recursively) for ``*.txt`` files.
        lemma_dict: Lookup from a word form to its lemma, passed through to
            ``lemmatize``.

    Returns:
        A ``Counter`` of lemma frequencies across all files. It is empty
        when no matching files exist. If a file raises while being
        processed, the error is printed, counts gathered before the failure
        are kept, and processing moves on to the next file.
    """
    totals = Counter()
    transcript_paths = glob.glob(os.path.join(dir_name, "*.txt"))

    # Nothing to read: report it and hand back the empty counter.
    if len(transcript_paths) == 0:
        print(f"There are no transcript files found in directory: {dir_name}")
        return totals

    for path in transcript_paths:
        try:
            with open(path, "r", errors="ignore") as handle:
                # Iterating the handle yields one line at a time until EOF.
                for raw_line in handle:
                    totals.update(lemmatize(tokenize(raw_line), lemma_dict))
        except Exception as err:
            # Best-effort: a failing file is reported, not fatal.
            print(f"There is an error processing file {path}: {err}")

    return totals

def freq_writer(freq_list, filename):
    """Write a tab-separated word/frequency table to ``filename``.

    The file starts with the header ``word<TAB>frequency``. Each entry is
    then written on its own line, preceded by a newline, so the file does
    not end with a trailing newline. An existing file is overwritten.

    Args:
        freq_list: Iterable of ``(word, frequency)`` pairs, written in the
            order given.
        filename: Path of the output file.
    """
    with open(filename, "w") as out_handle:
        out_handle.write("word\tfrequency")
        # The generator is consumed lazily, so rows are emitted one by one
        # in the same order as a plain loop would write them.
        out_handle.writelines(
            f"\n{entry}\t{count}" for entry, count in freq_list
        )

# Load the lemma lookup used by process_transcript_files. The `with` block
# closes the pickle file as soon as it has been read; the previous bare
# open() left the handle open.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only
# load pickle files that come from a trusted source.
with open("ant_lemmas.pickle", "rb") as lemma_file:
    lemma_dict = pickle.load(lemma_file)

# Directory searched for the *.txt transcript files.
corp_files = "/Users/jinbaldick/Documents/PhD_Jin_Course information/Fall 2024/LC 596/Lab stuff/Assignment/NICT_JLE"  

#calculating word frequencies 
corpus_freq_dict = process_transcript_files(corp_files, lemma_dict)

if corpus_freq_dict:
    # Order the (word, count) pairs from most to least frequent before
    # writing them out.
    sorted_corpus_freq = sorted(corpus_freq_dict.items(), key=operator.itemgetter(1), reverse=True)
    freq_writer(sorted_corpus_freq, "final_corpus_freq_Jin.txt")

    ###print the top 10 most frequent words just to double check my code!(optional)
    for word, freq in sorted_corpus_freq[:10]:
        print(f"{word}\t{freq}")
else:
    # An empty counter means no files were found or nothing was counted.
    print("There is no frequency data.")

i	88016
and	43130
be	42236
the	38247
to	34169
a	23724
it	17521
in	16166
you	14828
yes	14092
