In [None]:
from langdetect import detect
from langdetect import LangDetectException

# Function to detect English documents
def is_english(text):
    """Return True when langdetect classifies `text` as English.

    Args:
        text: the document to classify. Anything that is not a non-blank
            string (e.g. None, or NaN from a missing DataFrame cell) is
            reported as not English instead of raising.

    Returns:
        bool: True only if `detect(text)` returns 'en'.
    """
    # `detect` only accepts strings; a non-string value would raise a
    # TypeError that the LangDetectException handler below does not catch.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except LangDetectException:
        # Handle the exception for too short texts or other issues
        return False
    # NOTE(review): langdetect is documented as non-deterministic unless
    # DetectorFactory.seed is set — confirm whether a seed is wanted here.

# Keep only the rows whose text is detected as English, keyed by evidence id.
english_evidences = {
    record['evidence_id']: record['evidence_text']
    for _, record in evidence_df.iterrows()
    if is_english(record['evidence_text'])
}

# Write the filtered id -> text mapping to disk as JSON.
import json
with open("english_evidence.json" , "w") as out_file:
    json.dump(english_evidences, out_file)

In [None]:
# Keep only the English evidence texts shorter than 100 whitespace-separated words.
# Reads `english_evidences`, the dict built by the language-filter cell; the name
# `english_evidence` (no trailing "s") is never assigned in the visible notebook,
# so a fresh Restart & Run All would stop here with a NameError.
short_evidence = {
    evd_id: evd_text
    for evd_id, evd_text in english_evidences.items()
    if len(evd_text.split()) < 100
}

In [None]:
# De-duplicate evidence by text: mapping text -> id collapses repeated texts,
# and the id seen last for a given text is the one that survives.
# NOTE(review): `evidence_climate_2` is not defined in the visible cells — confirm its origin.
inverted_dict = {evd_text: evd_id for evd_id, evd_text in evidence_climate_2.items()}

# Flip back to id -> text, now with a single entry per distinct text.
unique_evidence = dict((evd_id, evd_text) for evd_text, evd_id in inverted_dict.items())

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch the 'punkt' resource, then build the tokenizer object shared by later cells.
nltk.download('punkt')
word_tokenizer = nltk.tokenize.WordPunctTokenizer()

In [None]:
# Fetch the WordNet data and create the shared lemmatizer instance.
nltk.download('wordnet')
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize(word):
    """Lemmatize `word` as a verb first, falling back to the noun lemma.

    The noun lookup runs only when the verb lookup returns the word unchanged.
    """
    verb_lemma = lemmatizer.lemmatize(word, 'v')
    if verb_lemma != word:
        return verb_lemma
    return lemmatizer.lemmatize(word, 'n')

In [None]:
custom_punctuation = "!?,.;:()"
english_pattern = re.compile(r'[a-zA-Z]')

# The stop-word test needs the actual word list. `stopwords` itself is the NLTK
# corpus reader, and `stopwords and custom_punctuation` is a boolean expression
# that yields one of its operands rather than a combined collection, so the
# previous membership test never removed a stop word.
nltk.download('stopwords')
english_stopwords = frozenset(stopwords.words('english'))
punctuation_marks = frozenset(custom_punctuation)

def preprocessing(text):
    """Normalise a raw text into a space-joined string of content-word lemmas.

    Steps: lower-case, tokenize with `word_tokenizer`, drop English stop words
    and the marks in `custom_punctuation`, lemmatize with `lemmatize`, and keep
    only tokens that contain at least one ASCII letter.

    Args:
        text: the raw document string.

    Returns:
        str: the surviving lemmas joined by single spaces ("" if none survive).
    """
    pros_tokens = []
    for token in word_tokenizer.tokenize(text.lower()):
        # Check the surface form first: inflected stop words are listed as-is,
        # and skipping them here avoids a lemmatizer call.
        if token in english_stopwords or token in punctuation_marks:
            continue
        word = lemmatize(token)
        # The lemma can itself be a stop word; the letter test drops tokens
        # made only of digits or punctuation.
        if word not in english_stopwords and english_pattern.search(word):
            pros_tokens.append(word)
    return " ".join(pros_tokens)

In [None]:
# Run every entry of `unique_evidence` through `preprocessing`, keeping its id as the key.
prosed_evidences = {
    evd_id: preprocessing(evd_text)
    for evd_id, evd_text in unique_evidence.items()
}

## Latent Dirichlet Allocation

In [None]:
from gensim import corpora, models

# Build the gensim vocabulary from the preprocessed documents.
# NOTE(review): `prepro_evd_texts` is not defined in the visible cells; gensim expects
# an iterable of token lists here — confirm it is built that way upstream.
dictionary = corpora.Dictionary(documents=prepro_evd_texts)

# Encode each document as its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, prepro_evd_texts))

In [None]:
# Train LDA model
# A fixed random_state makes the topic numbering repeatable between runs; later
# cells refer to topics by number, which is only meaningful if retraining
# reproduces the same topics.
lda_model = models.LdaModel(
    corpus,
    num_topics=30,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
# Persist the trained model so it can be reloaded without retraining.
lda_model.save('lda_model.model')

In [None]:
# Reload the model from 'lda_model.model' (the path the training cell saves to),
# so the cells below can run without retraining.
lda_model = models.LdaModel.load('lda_model.model')

In [None]:
# Print the 20 words gensim reports for each topic of the model.
all_topics = lda_model.show_topics(num_topics=lda_model.num_topics, num_words=20, formatted=False)
for topic_id, weighted_words in all_topics:
    top_words = [term for term, _weight in weighted_words]
    print(f"Topic #{topic_id}:", top_words)

In [None]:
from collections import defaultdict

# Average each topic's share over all documents in `corpus`
# (`lda_model` is the trained LDA model, `corpus` the bag-of-words corpus).
topic_usage = defaultdict(float)
document_count = 0

# minimum_probability=0 keeps low-weight topics in the per-document result
# so they still contribute to the running sums.
for document_count, bow_doc in enumerate(corpus, start=1):
    doc_topics = lda_model.get_document_topics(bow_doc, minimum_probability=0)
    for topic_num, prob in doc_topics:
        topic_usage[topic_num] += prob

# Turn the accumulated sums into per-document averages.
for topic_num, total in list(topic_usage.items()):
    topic_usage[topic_num] = total / document_count

# Most-used topics first.
sorted_topics = sorted(topic_usage.items(), key=lambda item: item[1], reverse=True)

# Report the ranking.
for topic_num, avg_prob in sorted_topics:
    print(f"Topic #{topic_num} average proportion: {avg_prob}")

In [None]:
prob = 0.3  # NOTE(review): not used by the filter below, which compares against 0.03
# Keep the topics whose average share across the corpus exceeds 0.03.
relevant_topic = [topic_num for topic_num, avg_prob in sorted_topics if avg_prob > 0.03]
relevant_topic

In [None]:
# Evidence ids in the key order of `prepro_evidence`. `is_climate` indexes this list by
# document position, so the order presumably matches `corpus` — TODO confirm.
# NOTE(review): `prepro_evidence` is not defined in the visible cells.
evidence_ids = list(prepro_evidence.keys())

In [None]:
def is_climate(lda_model,corpus,topic_ids, evidence_ids, prepro_evd_texts, min_prob=None):
    """Select the documents whose topic mixture includes a wanted topic.

    Args:
        lda_model: model exposing ``get_document_topics(bow, minimum_probability=...)``.
        corpus: iterable of bag-of-words documents, in the same order as
            ``evidence_ids`` and ``prepro_evd_texts``.
        topic_ids: topic numbers that count as relevant.
        evidence_ids: id of each document, indexed by its position in ``corpus``.
        prepro_evd_texts: content of each document, indexed by its position in ``corpus``.
        min_prob: smallest topic weight that counts as a match. ``None`` (the
            default) leaves the threshold to the model, which is the behaviour
            callers had before this parameter existed.

    Returns:
        dict: evidence id -> document content for every matching document.
    """
    # Built once: membership is tested for every topic of every document.
    wanted_topics = set(topic_ids)
    filtered_texts = {}

    for doc_id, doc_bow in enumerate(corpus):
        doc_topics = lda_model.get_document_topics(doc_bow, minimum_probability=min_prob)
        # Each entry is (topic_number, weight); only the number matters here.
        if any(topic[0] in wanted_topics for topic in doc_topics):
            filtered_texts[evidence_ids[doc_id]] = prepro_evd_texts[doc_id]

    return filtered_texts

In [None]:
# Topic ids excluded by hand; every other id in 0..29 is kept as relevant.
unused_topic = [1, 4, 5, 10, 13, 16, 29]
all_topic = [*range(30)]
relevant_topic = list(filter(lambda topic_id: topic_id not in unused_topic, all_topic))

In [None]:
# Keep the documents that `is_climate` matches against the relevant topic ids.
climate_texts = is_climate(
    lda_model=lda_model,
    corpus=corpus,
    topic_ids=relevant_topic,
    evidence_ids=evidence_ids,
    prepro_evd_texts=prepro_evd_texts,
)

In [None]:
# Join each value of `climate_texts` with single spaces
# (presumably a list of tokens per document — confirm against the caller).
climate_evd = {evd_id: " ".join(tokens) for evd_id, tokens in climate_texts.items()}

In [None]:
prob = 0.3  # NOTE(review): unused here; the cut-off applied below is 0.03
# Rebuild `relevant_topic` from the average topic shares (topics above 0.03).
# NOTE(review): this reassigns `relevant_topic`, replacing any value set earlier in the notebook.
relevant_topic = [topic_id for topic_id, share in sorted_topics if share > 0.03]
relevant_topic