In [16]:
import csv
import re

import numpy as np
import pandas as pd

In [17]:
import nltk
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif, chi2
from sklearn.decomposition import LatentDirichletAllocation

In [18]:
DATA_PATH = "bible_and_quran.tsv"   # tab-separated input, read below as two columns: corpus label, verse text
STOPWORDS_PATH = "stopwords.txt"    # stop-word list, read one word per line by load_stopwords
N_TOPICS = 20 # number of LDA topics (passed to the model as n_components)

In [19]:
def load_stopwords(path: str) -> set:
    """Read a stop-word list from a text file, one word per line.

    Each line is stripped of surrounding whitespace and lower-cased; lines
    that are empty after stripping are skipped.

    Args:
        path: Location of the stop-word file.

    Returns:
        The set of distinct, lower-cased stop words.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark. With plain "utf-8" the mark stays glued to the first word
    # (str.strip() does not remove it), so that word would never match a token.
    with open(path, encoding="utf-8-sig") as f:
        candidates = (line.strip().lower() for line in f)
        return {word for word in candidates if word}


# Load the stop-word set once at notebook level; `preprocess` looks every
# token up in this set.
stop_words = load_stopwords(STOPWORDS_PATH)
print(f"Loaded {len(stop_words)} custom stopwords from {STOPWORDS_PATH}")


Loaded 570 custom stopwords from stopwords.txt


In [20]:
# Load the verse table: first column is the corpus label, second the verse text.
#
# The verse text contains literal double quotes (e.g. `Then God said, "Let
# there be light"`). With pandas' default quote handling, a field that *starts*
# with `"` is parsed as a quoted field that runs -- across tabs and newlines --
# until the next closing quote, which can silently merge several verses into
# one row. QUOTE_NONE treats every quote character as ordinary text so that one
# input line always yields exactly one row.
# NOTE(review): this assumes the file does not rely on CSV-style quoting to
# escape tabs or newlines inside a field -- confirm against the data source.
df = pd.read_csv(
    DATA_PATH,
    sep="\t",
    header=None,
    names=["corpus", "text"],
    quoting=csv.QUOTE_NONE,
)
print("Loaded data shape:", df.shape)
print("Example rows:")
print(df.head(), "\n")

Loaded data shape: (30367, 2)
Example rows:
  corpus                                               text
0     OT  In the beginning God created the heavens and t...
1     OT  The earth was without form, and void; and dark...
2     OT  Then God said, "Let there be light"; and there...
3     OT  And God saw the light, that it was good; and G...
4     OT  God called the light Day, and the darkness He ... 



In [21]:
# A single stemmer instance shared by every call to `preprocess`.
stemmer = PorterStemmer()


def preprocess(text: str) -> str:
    """Turn one verse into a space-separated string of stemmed tokens.

    The input is coerced to ``str`` and lower-cased, digits and every
    character other than a-z or whitespace are replaced by a space, and the
    result is split on whitespace. Tokens that are in ``stop_words`` or are a
    single character long are discarded (checked on the unstemmed form); the
    remaining tokens are Porter-stemmed and joined with single spaces.
    """
    lowered = str(text).lower()
    letters_only = re.sub(r"[^a-z\s]", " ", re.sub(r"\d+", " ", lowered))

    # Filtering happens before stemming, so stop words are matched as written.
    kept = (
        word
        for word in letters_only.split()
        if word not in stop_words and len(word) > 1
    )
    return " ".join(stemmer.stem(word) for word in kept)


# Store the normalised form of every verse in a new `clean` column.
df["clean"] = df["text"].map(preprocess)

print("After preprocessing, example cleaned verses:")
preview = df[["corpus", "text", "clean"]].head()
print(preview, "\n")

After preprocessing, example cleaned verses:
  corpus                                               text  \
0     OT  In the beginning God created the heavens and t...   
1     OT  The earth was without form, and void; and dark...   
2     OT  Then God said, "Let there be light"; and there...   
3     OT  And God saw the light, that it was good; and G...   
4     OT  God called the light Day, and the darkness He ...   

                                               clean  
0                       begin god creat heaven earth  
1  earth form void dark face deep spirit god hove...  
2                                    god light light  
3                god light good god divid light dark  
4   god call light day dark call night even morn day   



In [22]:
# Bag-of-words counts over the cleaned text: one row per entry of
# df["clean"], one column per vocabulary token.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["clean"])

# Column index -> token, kept as an array so it can be indexed with id arrays.
feature_names = np.array(vectorizer.get_feature_names_out())

matrix_shape = X.shape
print("Document–term matrix shape:", matrix_shape)
print("Number of unique tokens:", len(feature_names), "\n")


Document–term matrix shape: (30367, 8765)
Number of unique tokens: 8765 



In [23]:
def _ranked(scores):
    """Pair every vocabulary token with its score, highest score first."""
    table = pd.DataFrame({"token": feature_names, "score": scores})
    return table.sort_values("score", ascending=False)


corpus_labels = sorted(df["corpus"].unique())
print("Detected corpus labels:", corpus_labels, "\n")

for corpus_label in corpus_labels:
    print(f"=== Computing MI and χ² for corpus: {corpus_label} ===")

    # One-vs-rest target: 1 for rows of this corpus, 0 for all other rows.
    y_binary = (df["corpus"] == corpus_label).to_numpy().astype(int)

    # Mutual information, with the token counts treated as discrete features.
    mi_scores = mutual_info_classif(
        X, y_binary, discrete_features=True, random_state=0
    )
    # chi2 returns (statistics, p-values); only the statistics are kept.
    chi2_scores = chi2(X, y_binary)[0]

    mi_df = _ranked(mi_scores)
    chi2_df = _ranked(chi2_scores)

    # Output files are named after the lower-cased label, spaces -> underscores,
    # and hold two columns: token,score.
    safe_label = corpus_label.lower().replace(" ", "_")
    mi_filename = f"mi_{safe_label}.csv"
    chi2_filename = f"chi2_{safe_label}.csv"
    for table, filename in ((mi_df, mi_filename), (chi2_df, chi2_filename)):
        table.to_csv(filename, index=False)

    print(f"Saved MI scores to   {mi_filename}")
    print(f"Saved χ² scores to   {chi2_filename}\n")

print("MI/χ² computation finished.\n")


Detected corpus labels: ['NT', 'OT', 'Quran'] 

=== Computing MI and χ² for corpus: NT ===
Saved MI scores to   mi_nt.csv
Saved χ² scores to   chi2_nt.csv

=== Computing MI and χ² for corpus: OT ===
Saved MI scores to   mi_ot.csv
Saved χ² scores to   chi2_ot.csv

=== Computing MI and χ² for corpus: Quran ===
Saved MI scores to   mi_quran.csv
Saved χ² scores to   chi2_quran.csv

MI/χ² computation finished.



In [24]:
print(f"Fitting LDA with {N_TOPICS} topics...")

# Batch learning with a fixed seed, so repeated runs give the same topics.
lda_settings = {
    "n_components": N_TOPICS,
    "random_state": 0,
    "learning_method": "batch",
}
lda = LatentDirichletAllocation(**lda_settings)

# One row of topic scores per row of X; averaged per corpus further down.
doc_topic_probs = lda.fit_transform(X)
print("LDA fitted.\n")

Fitting LDA with 20 topics...
LDA fitted.



In [25]:
# corpus label -> mean topic-score vector, and corpus label -> index of the
# topic with the highest mean score.
topic_avgs = {}
top_topic_index = {}

for corpus_label in corpus_labels:
    # Select the rows of this corpus and average their topic scores.
    mask = df["corpus"].eq(corpus_label).to_numpy()
    avg = doc_topic_probs[mask].mean(axis=0)
    best_topic = int(avg.argmax())

    topic_avgs[corpus_label] = avg
    top_topic_index[corpus_label] = best_topic

    print(f"Corpus: {corpus_label}")
    print("  First few average topic scores:", np.round(avg[:5], 4))
    print(f"  Top topic index for this corpus: {best_topic}\n")

Corpus: NT
  First few average topic scores: [0.0266 0.0483 0.0281 0.0309 0.0449]
  Top topic index for this corpus: 5

Corpus: OT
  First few average topic scores: [0.0496 0.0392 0.0837 0.0561 0.0464]
  Top topic index for this corpus: 19

Corpus: Quran
  First few average topic scores: [0.0216 0.0163 0.0207 0.0164 0.0695]
  Top topic index for this corpus: 8



In [26]:
def top_tokens_for_topic(topic_idx: int, n: int = 10, model=None, vocab=None,
                         normalize: bool = False):
    """Return the ``n`` highest-weighted tokens of one LDA topic.

    Args:
        topic_idx: Row of ``components_`` to inspect.
        n: Number of tokens to return.
        model: Fitted object exposing a ``components_`` array. Defaults to the
            notebook-level ``lda``, so existing calls behave as before while
            the function can also be used (and tested) without that global.
        vocab: Token for each column of ``components_``. Defaults to the
            notebook-level ``feature_names``.
        normalize: If true, divide the topic row by its sum before ranking, so
            the returned weights are within-topic proportions rather than raw
            ``components_`` values. The ranking itself is unaffected.

    Returns:
        A ``(tokens, weights)`` pair of arrays, heaviest token first.
    """
    if model is None:
        model = lda
    if vocab is None:
        vocab = feature_names

    topic_vector = np.asarray(model.components_[topic_idx], dtype=float)
    if normalize:
        topic_vector = topic_vector / topic_vector.sum()

    # argsort is ascending, so reverse it before taking the first n ids.
    top_ids = topic_vector.argsort()[::-1][:n]
    return np.asarray(vocab)[top_ids], topic_vector[top_ids]


for corpus_label in corpus_labels:
    # Dominant topic of this corpus and its ten heaviest tokens.
    t_idx = top_topic_index[corpus_label]
    tokens, weights = top_tokens_for_topic(t_idx, n=10)
    out_df = pd.DataFrame({"token": tokens, "weight": weights})

    print(f"=== Corpus: {corpus_label} — Dominant Topic {t_idx} ===")
    for tok, w in out_df.itertuples(index=False, name=None):
        print(f"{tok:15s}  {w:.4f}")
    print()

    # Write the same table to CSV for the report; the file is named after the
    # lower-cased corpus label.
    safe_label = corpus_label.lower().replace(" ", "_")
    out_df.to_csv(f"lda_top_topic_tokens_{safe_label}.csv", index=False)

print("LDA topic analysis complete.")
print("Top-topic tokens per corpus saved as lda_top_topic_tokens_<corpus>.csv")

=== Corpus: NT — Dominant Topic 5 ===
jesu             881.0184
thing            811.7532
god              605.4264
christ           501.3980
spirit           299.6900
faith            265.0224
work             236.4003
man              187.6697
discipl          180.6544
receiv           171.1061

=== Corpus: OT — Dominant Topic 19 ===
lord             1799.4951
god              880.4480
word             807.6088
hear             604.6053
sin              576.0461
command          563.7937
nt               408.1103
heart            400.7994
law              394.7199
israel           388.7234

=== Corpus: Quran — Dominant Topic 8 ===
god              3084.6533
lord             915.4582
peopl            523.8709
fear             494.8636
merci            453.4935
messeng          444.2564
worship          398.6101
believ           397.0213
forgiv           248.3855
seek             243.0494

LDA topic analysis complete.
Top-topic tokens per corpus saved as lda_top_topic_tokens_<corpus>.c