In [1]:
def load_data(file):
    """Read the JSON document stored at ``file`` and return the parsed object.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def write_data(file, data):
    """Serialize ``data`` as JSON into ``file``.

    The target is opened for writing (replacing any previous content) with
    UTF-8 encoding, and the JSON is pretty-printed with a 4-space indent.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)

In [2]:
def lemmatization(texts, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatize each text, keeping only tokens with an allowed part of speech.

    Args:
        texts: Iterable of raw text strings.
        allowed_postags: Coarse-grained POS tags (compared against
            ``token.pos_``) whose tokens are kept. The default keeps nouns,
            adjectives, verbs and adverbs. It is a tuple so the default
            cannot be mutated between calls.

    Returns:
        A list with one string per input text: the lemmas of the kept
        tokens joined by single spaces (an empty string when nothing is kept).
    """
    # Parser and NER are disabled because only POS tags and lemmas are read below.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    text_out = []
    # nlp.pipe processes the texts as a stream, yielding one Doc per text in order.
    for doc in nlp.pipe(texts):
        lemmas = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        text_out.append(" ".join(lemmas))
    return text_out

In [3]:
def preprocess_article(input_text):
    """Drop all-digit tokens and stop words from ``input_text``.

    The text is split into word tokens with a regular expression, tokens made
    up only of digits are discarded, and every remaining token that appears in
    ``stopwords.txt`` (one stop word per line, read from the current working
    directory) is removed. Matching against the stop list is exact, so it is
    case-sensitive.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Word-level tokens (runs of word characters), minus the purely numeric ones.
    words = [tok for tok in re.findall(r'\b\w+\b', input_text) if not tok.isdigit()]

    # Stop list: one entry per line, with surrounding whitespace removed.
    with open('stopwords.txt') as stop_file:
        stop_words = {entry.strip() for entry in stop_file}

    kept = [tok for tok in words if tok not in stop_words]
    return " ".join(kept)

In [4]:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess``.

    Each text is passed to ``gensim.utils.simple_preprocess`` with
    ``deacc=True``; the per-text token lists are returned in input order.
    """
    tokenized = []
    for document in texts:
        tokenized.append(gensim.utils.simple_preprocess(document, deacc=True))
    return tokenized

In [17]:
def compute_coherence_values(dictionary, corpus, tokenizedData, limit, start=2, step=3,
                             random_state=None, passes=10):
    """Train one LDA model per candidate topic count and score each with c_v coherence.

    Args:
        dictionary: gensim dictionary passed to the models as ``id2word`` and
            to the coherence model as ``dictionary``.
        corpus: Corpus the LDA models are trained on.
        tokenizedData: Tokenized texts handed to ``CoherenceModel`` as ``texts``.
        limit: Exclusive upper bound of the topic counts to try.
        start: First topic count to try.
        step: Increment between successive topic counts.
        random_state: Forwarded to ``LdaModel``. Pass a fixed value to make
            runs repeatable; the default ``None`` keeps the previous
            (unseeded) behaviour.
        passes: Number of training passes forwarded to ``LdaModel``.

    Returns:
        ``(model_list, coherence_values)``: two parallel lists holding the
        trained model and its coherence score for every value in
        ``range(start, limit, step)``. Both are empty when that range is empty.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            num_topics=num_topics,
            id2word=dictionary,
            passes=passes,
            random_state=random_state,
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, dictionary=dictionary, texts=tokenizedData, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [18]:
#this is the function that creates an LDA model from our dataframe

def create_lda_model(dataframe, topic_limit, topic_start, topic_step):
    """Build LDA models over a range of topic counts and return the most coherent one.

    The texts in ``dataframe['Text']`` are lemmatized, stripped of stop words
    and tokenized; a dictionary and bag-of-words corpus are built from them;
    one model is trained per topic count in
    ``range(topic_start, topic_limit, topic_step)``; the coherence scores are
    plotted; and the model with the highest score is selected.

    Args:
        dataframe: DataFrame with a ``'Text'`` column holding the documents.
        topic_limit: Exclusive upper bound of the topic counts to try.
        topic_start: First topic count to try.
        topic_step: Increment between successive topic counts.

    Returns:
        ``(LDA_model, corpus)``: the model with the highest coherence score
        and the bag-of-words corpus it was trained on.

    Raises:
        ValueError: If the topic range is empty, since there would be no
            model to select.
    """
    # Fail fast, before any expensive preprocessing, when there is nothing to train.
    topic_counts = range(topic_start, topic_limit, topic_step)
    if len(topic_counts) == 0:
        raise ValueError(
            f"Empty topic range: start={topic_start}, limit={topic_limit}, step={topic_step}"
        )

    #Pre-Process text grabbing
    # Read from the `dataframe` argument, not from a notebook-global `df`.
    rawData = dataframe['Text'].tolist()

    #Lemmatize the texts
    lemmatizedData = lemmatization(rawData)

    #Removing Stop Words
    filteredData = [preprocess_article(x) for x in lemmatizedData]

    #Tokenize the text
    tokenizedData = gen_words(filteredData)

    #Create text dictionary
    id2word = corpora.Dictionary(tokenizedData)
    # NOTE(review): gensim documents no_below as an absolute document count (int),
    # so 0.1 presumably filters nothing; confirm whether a fraction was intended.
    id2word.filter_extremes(no_below=0.1, no_above=0.9)

    #Create corpus
    corpus = [id2word.doc2bow(text) for text in tokenizedData]

    #Topic modeling using input values
    model_list, coherence_values = compute_coherence_values(
        dictionary=id2word,
        corpus=corpus,
        tokenizedData=tokenizedData,
        limit=topic_limit,
        start=topic_start,
        step=topic_step,
    )

    #Coherence score visualization
    # Label the line itself so the legend shows the full name rather than
    # treating a bare string as a sequence of one-character labels.
    plt.plot(topic_counts, coherence_values, label="coherence_values")
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(loc='best')
    plt.show()

    #find the model with max coherence
    max_coherence_index = coherence_values.index(max(coherence_values))
    LDA_model = model_list[max_coherence_index]

    return LDA_model, corpus #this is our LDA model object that we will work with