### Download Corpus

In [None]:
from convokit import Corpus, download, get_subreddit_info

In [None]:
# Display names of the 50 U.S. states, five per row; order is kept as originally listed.
uppercase_states = [
    'Alabama', 'Alaska', 'Arkansas', 'Arizona', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
    'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
    'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

In [None]:
# Lowercased copies of the state names, in the same order as uppercase_states.
lowercase_states = list(map(str.lower, uppercase_states))

In [None]:
# Maps each lowercase state name to the Corpus loaded for its subreddit.
corpus = {}

for state in lowercase_states:
    # The subreddit spelling is not known up front, so try the name as-is
    # ("new york"), with spaces removed ("newyork") and CamelCased ("NewYork").
    concatenated_state = state.replace(" ", "")
    camel_case_state = ''.join(word.capitalize() for word in state.split())
    # dict.fromkeys drops repeated spellings (single-word states) while keeping
    # first-seen order, so the same name is never requested twice.
    state_variants = list(dict.fromkeys([state, concatenated_state, camel_case_state]))
    loaded = False
    last_error = None

    print(state_variants)

    for variant in state_variants:
        corpus_name = f'subreddit-{variant}'
        try:
            # Store the loaded corpus; the later cells iterate over `corpus`
            # and would silently do nothing if it stayed empty.
            corpus[state] = Corpus(filename=download(corpus_name))
        except Exception as e:
            # Best effort: remember why this spelling failed and try the next one.
            last_error = e
            continue
        print(f"VARIANT USED: '{variant}'")
        loaded = True
        break

    if not loaded:
        print(
            f"Failed to load corpus for {state} in all variants: "
            f"last tried '{variant}' (last error: {last_error!r})"
        )


In [None]:
import os

# Base directory handed to Corpus.dump for every state.
save_directory = '/data/corpus'

os.makedirs(save_directory, exist_ok=True)

# Dump each loaded corpus under its state name; dump() receives the name and
# the base path separately, so no per-state path needs to be built here.
for state, corpus_obj in corpus.items():
    corpus_obj.dump(name=state, base_path=save_directory)

### Train

In [None]:
from convokit import Corpus
import os

# Read the corpora back from the directory the download step dumped them to.
# NOTE(review): this previously pointed at '/models', which nothing in this
# file writes corpora to — confirm '/data/corpus' is the intended location.
save_directory = '/data/corpus'

os.makedirs(save_directory, exist_ok=True)

# Maps each dumped corpus directory name to its reloaded Corpus.
corpus_load = {}

for state_dir in os.listdir(save_directory):
    state_path = os.path.join(save_directory, state_dir)

    # Skip stray files so only directories are handed to Corpus.
    if not os.path.isdir(state_path):
        continue

    corpus_load[state_dir] = Corpus(filename=state_path)


In [None]:
import random
import spacy
from gensim.models import Word2Vec
import multiprocessing
import os
import spacy

# Shared spaCy pipeline, loaded once; tokenize_and_remove_stopwords calls it on every utterance.
nlp = spacy.load("en_core_web_sm")

In [None]:
def all_utterances(corpus):
    """Return the ``text`` of every utterance yielded by ``corpus.iter_utterances()``, in order."""
    texts = []
    for utterance in corpus.iter_utterances():
        texts.append(utterance.text)
    return texts

def tokenize_and_remove_stopwords(text):
    """Tokenize ``text`` with the module-level ``nlp`` pipeline and return the kept tokens, lowercased.

    Returns an empty list when the stripped text is empty or is exactly
    "[deleted]" or "[removed]". Otherwise a token is kept only if it is
    alphabetic and is not a stop word, punctuation, or URL-like.
    """
    stripped = text.strip()
    if not stripped or stripped in ("[deleted]", "[removed]"):
        return []

    kept = []
    for tok in nlp(text):
        # Reject anything failing one of the four filters, checked in the original order.
        if not tok.is_alpha or tok.is_stop or tok.is_punct or tok.like_url:
            continue
        kept.append(tok.text.lower())
    return kept

In [None]:
# Directory holding one "<state>_tokenized.txt" file per state:
# one utterance per line, tokens separated by single spaces.
tokenized_dir = '/data/tokenized_corpus'
os.makedirs(tokenized_dir, exist_ok=True)

for state, state_corpus in corpus.items():

    tokenized_path = os.path.join(tokenized_dir, f"{state}_tokenized.txt")

    # An existing file marks the state as done, so reruns can resume.
    if os.path.exists(tokenized_path):
        print(f"Tokenized corpus for {state} already exists. Skipping...")
        continue

    print(f"Processing corpus for {state}...")

    # Write to a temporary file and rename it only once complete; otherwise an
    # interrupted run would leave a truncated file that the check above would
    # treat as finished on the next run.
    partial_path = tokenized_path + ".partial"
    with open(partial_path, 'w') as f:
        # Stream utterance by utterance instead of holding every token list in memory.
        for utt in all_utterances(state_corpus):
            tokens = tokenize_and_remove_stopwords(utt)
            if tokens:  # utterances with no surviving tokens are dropped
                f.write(" ".join(tokens) + "\n")
    os.replace(partial_path, tokenized_path)


In [None]:
import os
import multiprocessing
from gensim.models import Word2Vec

# Directory that receives one "<state>.kv" vectors file per state.
# Note: this is a relative path, so it resolves against the current working directory.
word2vec_dir = 'models'
os.makedirs(word2vec_dir, exist_ok=True)

def read_tokenized_data(tokenized_dir, state):
    """Read ``<tokenized_dir>/<state>_tokenized.txt`` and return its lines as token lists.

    Each line is split on whitespace; a blank line yields an empty list.
    """
    source_file = os.path.join(tokenized_dir, f"{state}_tokenized.txt")
    sentences = []
    with open(source_file, 'r') as handle:
        for raw_line in handle:
            sentences.append(raw_line.split())
    return sentences


# Per-state results: state name -> {'state_name': ..., 'word2vec_model': ...}.
state_data = {}

# Hyperparameters shared by every per-state model.
w2v_params = dict(
    vector_size=100,
    window=8,
    min_count=5,
    workers=multiprocessing.cpu_count(),
)

for state in corpus:
    print(f"Training Word2Vec model for {state}...")

    sentences = read_tokenized_data(tokenized_dir, state)
    model = Word2Vec(sentences, **w2v_params)

    state_data[state] = dict(state_name=state, word2vec_model=model)

    # Only the word vectors (model.wv) are saved to disk, as "<state>.kv".
    model.wv.save(os.path.join(word2vec_dir, f"{state}.kv"))

# Summary of everything trained in this run.
for name, entry in state_data.items():
    print(f"State: {name}, Model Info: {entry['word2vec_model']}")