# UAS Natural Language Processing
Evelyn

36230029

Alaniah Nisrina, B.Eng., M.Eng.

Sumber Dataset: Kaggle
https://www.kaggle.com/datasets/rtatman/blog-authorship-corpus

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from collections import Counter
import gdown

# Download the dataset CSV from Google Drive by file id and store it locally
# under the name that the next cell reads with pd.read_csv.
file_id = "1fXBwd58m-6UlIHRkggOqAPDmUOe3VjgF"
url = f"https://drive.google.com/uc?id={file_id}"
output = "hasil_download.csv"
gdown.download(url, output, quiet=False)
print("Download selesai! File tersimpan sebagai:", output)

In [None]:
df = pd.read_csv("hasil_download.csv")
df.head()

In [None]:
df.info()

# Jumlah baris dan kolom
print(f"Jumlah sample: {df.shape[0]}")
print(f"Jumlah fitur: {df.shape[1]}")

In [None]:
df.describe()

# Preprocessing Awal

In [None]:
print("\n=== Missing Value per Kolom ===")
print(df.isnull().sum())

In [None]:
# Drop the columns that are not used for topic modelling, then print the
# remaining columns and the new shape for verification.
df = df.drop(columns=['id', 'gender', 'age', 'topic', 'sign', 'date'])

print("Kolom setelah drop:", df.columns)
print("Shape baru:", df.shape)
df.head()

In [None]:
# Keep a random 75% sample of the rows (fixed seed for repeatability) and
# renumber the index from zero.
df = df.sample(frac=0.75, random_state=42).reset_index(drop=True)
print("Shape setelah mengambil 75% data:", df.shape)
df.head()

In [None]:
print("\n=== Jumlah Duplikasi pada Kolom text ===")
print(df.duplicated(subset=["text"]).sum())

df = df.drop_duplicates(subset=["text"])
print("Shape setelah drop duplicates:", df.shape)

In [None]:
# Strip leading/trailing whitespace and drop rows whose text is empty afterwards.
# NOTE(review): astype(str) presumably turns missing values into the literal
# string "nan", which would pass the length check below — confirm none remain.
df["text"] = df["text"].astype(str).str.strip()
df = df[df["text"].str.len() > 0]

In [None]:
# Compute the number of whitespace-separated words per text and keep only
# the rows with more than 5 words.
df["text_len"] = df["text"].apply(lambda x: len(x.split()))
df = df[df["text_len"] > 5]

In [None]:
df["text_len"] = df["text"].astype(str).apply(lambda x: len(x.split()))
df["text_len"].describe()

In [None]:
# Drop texts with 5 words or fewer (noise for topic modelling); only rows
# with strictly more than 5 words are kept.
df = df[df["text_len"] > 5]
print("Shape setelah buang teks pendek:", df.shape)

In [None]:
# TOP WORDS (RAW)

# Count whitespace-separated tokens document by document. Joining the whole
# corpus into one string and then one token list gives the same counts but
# needs two extra in-memory copies of the corpus; streaming into the Counter
# keeps memory proportional to the vocabulary.
raw_counter = Counter(
    token
    for document in df["text"].astype(str)
    for token in document.split()
)

# The 20 most frequent raw tokens as (token, count) pairs.
word_freq = raw_counter.most_common(20)

words = [w for w, _ in word_freq]
counts = [c for _, c in word_freq]

plt.figure(figsize=(10,5))
plt.bar(words, counts)
plt.xticks(rotation=45)
plt.title("Top 20 Kata Paling Sering Muncul (Raw Data)")
plt.show()

In [None]:
df["text"].sample(5, random_state=42)

# Preprocessing Utama

In [None]:
!pip install langdetect
import langdetect
import nltk
import spacy
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

nlp = spacy.load("en_core_web_sm")

In [None]:
!pip install pandarallel
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

def detect_lang_fast(text):
    """Return the language code detected for the beginning of *text*.

    Only the first 300 characters are passed to ``langdetect.detect`` to keep
    detection cheap on long documents.

    Args:
        text: the document to classify.

    Returns:
        The value returned by ``langdetect.detect``, or ``"unknown"`` when
        *text* is not a string or detection raises an error.
    """
    if not isinstance(text, str):
        return "unknown"
    try:
        return langdetect.detect(text[:300])
    except Exception:
        # Best effort: a row that cannot be classified is labelled "unknown"
        # and removed by the language filter. Catching Exception instead of
        # using a bare except lets KeyboardInterrupt/SystemExit stop the run.
        return "unknown"

df["lang"] = df["text"].parallel_apply(detect_lang_fast)
df = df[df["lang"] == "en"].reset_index(drop=True)

print("Sisa data setelah filter bahasa:", df.shape)

In [None]:
def clean_text(text):
    """Normalize a raw document for topic modelling.

    Steps, in order: lowercase, remove URLs, replace every character that is
    not ``a-z`` or whitespace with a space, collapse whitespace runs.

    Args:
        text: the raw document string.

    Returns:
        The cleaned string containing only lowercase letters separated by
        single spaces (possibly empty).
    """
    # 1. Lowercase (normalization)
    text = text.lower()

    # 2. Remove URLs (avoids noise). The dot after "www" is escaped so that
    # only a literal "www." prefix matches; unescaped it matched any
    # character, which deleted ordinary text such as "awww so".
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # 3. Remove non-alphabetic characters
    # (digits and symbols usually do not help)
    text = re.sub(r'[^a-z\s]', ' ', text)

    # 4. Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text
df["clean_text"] = df["text"].apply(clean_text)

In [None]:
def remove_stopwords(text):
    """Return *text* without the whitespace-separated tokens found in ``stop_words``.

    The surviving tokens keep their order and are joined with single spaces.
    """
    return " ".join(
        token for token in text.split() if token not in stop_words
    )

df["no_stopwords"] = df["clean_text"].apply(remove_stopwords)

In [None]:
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of *text* with ``lemmatizer``.

    Returns the lemmas in their original order, joined with single spaces.
    """
    return " ".join(map(lemmatizer.lemmatize, text.split()))

df["lemma_text"] = df["no_stopwords"].apply(lemmatize_text)

In [None]:
df[["text", "clean_text", "no_stopwords", "lemma_text"]].head()

In [None]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF document-term matrix from the lemmatized texts.
tfidf = TfidfVectorizer(
    max_features=5000,      # cap the number of features for efficiency
    ngram_range=(1,2),      # unigrams + bigrams
    min_df=5,               # a term must appear in at least 5 documents
    max_df=0.8              # drop terms that appear in more than 80% of documents
)

X_tfidf = tfidf.fit_transform(df["lemma_text"])
X_tfidf.shape

In [None]:
# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(max_features=5000)
X_count = count_vect.fit_transform(df["lemma_text"])
X_count.shape

In [None]:
df

In [None]:
!pip install wordcloud
from wordcloud import WordCloud

all_words = ' '.join(df['lemma_text'])

wordcloud = WordCloud(width=1200, height=600, background_color='white').generate(all_words)

plt.figure(figsize=(14,7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("WordCloud Setelah Preprocessing", fontsize=16)
plt.show()

In [None]:
words = all_words.split()
counter = Counter(words).most_common(20)

labels, counts = zip(*counter)

plt.figure(figsize=(10,5))
plt.bar(labels, counts)
plt.xticks(rotation=45)
plt.title("20 Kata Paling Sering Muncul Setelah Preprocessing")
plt.show()

In [None]:
from sklearn.decomposition import PCA

# Project a random sample of documents to 2D. Only the sample is converted
# to a dense array, which keeps PCA affordable.
n_docs = X_tfidf.shape[0]
# Never request more rows than exist: choice(..., replace=False) raises when
# the sample is larger than the population.
sample_size = min(3000, n_docs)
# Seeded generator so the same documents (and the same plot) come back on
# every run.
rng = np.random.default_rng(42)
idx = rng.choice(n_docs, size=sample_size, replace=False)

X_sample = X_tfidf[idx].toarray()

# random_state pins the result when a randomized SVD solver is selected.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_sample)

plt.figure(figsize=(8,6))
plt.scatter(X_pca[:,0], X_pca[:,1], s=8, alpha=0.6)
plt.title(f"PCA 2D Proyeksi TF-IDF (Sample {sample_size} Data)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

# Modelling

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the lemmatized texts with a fixed seed.
# NOTE(review): train_text/test_text are not referenced again in the visible
# notebook — the models below are fitted on the full X_tfidf / X_count
# matrices. Confirm whether the split is meant to be used.
train_text, test_text = train_test_split(
    df["lemma_text"],
    test_size=0.2,
    random_state=42
)

print("Train size:", len(train_text))
print("Test size :", len(test_text))

# LATENT SEMANTIC ANALYSIS (LSA)

**Karakteristik Model**

Model berbasis aljabar linear, bukan probabilistik.

Menggunakan SVD untuk memecah matriks TF-IDF menjadi representasi konsep laten.

Cocok untuk teks panjang dan corpus besar.

Hasilnya berupa topik tunggal per komponen.

In [None]:
from sklearn.decomposition import TruncatedSVD

# Number of latent components (topics) extracted by LSA.
n_topics = 10

# LSA = truncated SVD of the TF-IDF matrix.
lsa_model = TruncatedSVD(
    n_components=n_topics,
    n_iter=10,
    random_state=42
)

# NOTE(review): despite its name, lsa_train is computed from the full
# X_tfidf matrix, not from a training split.
lsa_train = lsa_model.fit_transform(X_tfidf)

In [None]:
terms = tfidf.get_feature_names_out()

def get_lsa_topics(model, terms, n=15):
    """Return the *n* highest-weighted terms of every component of *model*.

    Args:
        model: fitted object exposing ``components_`` (one weight row per
            topic).
        terms: sequence mapping a feature index to its term.
        n: number of terms to keep per topic.

    Returns:
        One list per component, with terms ordered from the highest to the
        lowest weight.
    """
    topics = []
    for comp in model.components_:
        # argsort is ascending: reverse first, then slice, so the strongest
        # term comes first and n=0 yields an empty list (a trailing [-0:]
        # slice would return every term).
        top_indices = comp.argsort()[::-1][:n]
        topics.append([terms[i] for i in top_indices])
    return topics

lsa_topics = get_lsa_topics(lsa_model, terms)

In [None]:
# Candidate numbers of LSA components to compare.
topic_range = [3,5,7,10,12,15]

# For each candidate, fit a truncated SVD on the TF-IDF matrix and record the
# total explained variance ratio of its components.
explained_var = [
    TruncatedSVD(n_components=k, random_state=42)
    .fit(X_tfidf)
    .explained_variance_ratio_
    .sum()
    for k in topic_range
]

In [None]:
plt.plot(topic_range, explained_var, marker='o')
plt.title("LSA Explained Variance vs Jumlah Topik")
plt.xlabel("Jumlah Topik")
plt.ylabel("Explained Variance Ratio")
plt.grid(True)
plt.show()

# LATENT DIRICHLET ALLOCATION (LDA)

**Karakteristik Model**

Model probabilistic generative:
“dokumen = campuran topik”, “topik = campuran kata”.

Cocok untuk interpretasi topik yang jelas.

Sangat bergantung pada n_components serta prior alpha (doc_topic_prior) dan beta (topic_word_prior).

In [None]:
# from sklearn.decomposition import MiniBatchDictionaryLearning
# from sklearn.decomposition import LatentDirichletAllocation
# from sklearn.decomposition import TruncatedSVD
# from sklearn.feature_extraction.text import CountVectorizer

# count_vect = CountVectorizer(
#     max_df=0.95,
#     min_df=5,
#     stop_words="english"
# )

# X_count = count_vect.fit_transform(df["lemma_text"])
# terms_count = count_vect.get_feature_names_out()

# # Removed TruncatedSVD here as LDA expects non-negative input, typically counts.
# # svd = TruncatedSVD(n_components=300, random_state=42)
# # X_reduced = svd.fit_transform(X_count)

# topic_range = [3, 5, 7, 10, 12, 15]
# scores = {}

# for t in topic_range:
#     lda_fast = LatentDirichletAllocation(
#         n_components=t,
#         learning_method='online',
#         max_iter=5,
#         batch_size=512,
#         random_state=42,
#         n_jobs=1
#     )
#     # Fit LDA directly on X_count (non-negative counts) instead of X_reduced
#     lda_fast.fit(X_count)
#     scores[t] = lda_fast.bound_

# print("Score setiap jumlah topik:")
# print(scores)

# best_k = max(scores, key=scores.get)
# print("Topik terbaik:", best_k)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

count_vect = CountVectorizer(
    max_df=0.95,
    min_df=5,
    stop_words="english"
)

X_count = count_vect.fit_transform(df["lemma_text"])
terms_count = count_vect.get_feature_names_out()

# Score per candidate number of topics. Presumably these are the values of
# LatentDirichletAllocation.bound_ recorded from the commented-out
# model-selection loop, hard-coded to avoid refitting.
scores = {
    3: 4165.399211143927,
    5: 3983.28602610316,
    7: 3995.575750398486,
    10: 3930.827149296313,
    12: 3890.7390214349352,
    15: 3905.7690457214608
}

print("Score setiap jumlah topik:")
print(scores)

# scikit-learn documents bound_ as the final perplexity on the training set,
# and a lower perplexity is better. Selecting the maximum would pick the
# worst candidate, so the minimum is used.
best_k = min(scores, key=scores.get)
print("\nTopik terbaik:", best_k)

In [None]:
lda_fast = LatentDirichletAllocation(
    n_components=best_k,
    learning_method='online',
    max_iter=10,
    batch_size=512,
    random_state=42,
    n_jobs=1
)
lda_fast.fit(X_count)

In [None]:
def get_lda_topics(model, terms, n=15):
    """Return the *n* highest-weighted terms of every topic of *model*.

    Args:
        model: fitted object exposing ``components_`` (one weight row per
            topic).
        terms: sequence mapping a feature index to its term.
        n: number of terms to keep per topic.

    Returns:
        One list per topic, with terms ordered from the highest to the lowest
        weight.
    """
    topics = []
    for comp in model.components_:
        # argsort is ascending: reverse first, then slice, so the strongest
        # term comes first and n=0 yields an empty list (a trailing [-0:]
        # slice would return every term).
        top_idx = comp.argsort()[::-1][:n]
        topics.append([terms[i] for i in top_idx])
    return topics

lda_topics_fast = get_lda_topics(lda_fast, terms_count)

print("\n=== TOPIK DARI LDA FAST ===")
for i, topic_words in enumerate(lda_topics_fast):
    print(f"Topic {i+1}: {topic_words}")

In [None]:
# Plot each recorded score against its own key so x and y always line up;
# topic_range is defined in the LSA cell and need not match the keys of scores.
plt.plot(list(scores), list(scores.values()), marker="o")
plt.title("LDA Score (Bound) vs Jumlah Topik")
plt.xlabel("Jumlah Topik")
# scikit-learn's bound_ is a perplexity, so lower is better; the previous
# label stated the opposite.
plt.ylabel("Bound Score / Perplexity (semakin rendah semakin baik)")
plt.grid(True)
plt.show()

In [None]:
# Perbandingan LDA dan LSA
def plot_wordcloud(topic_words, title):
    """Draw the words of one topic as a white-background word cloud.

    Args:
        topic_words: iterable of words, joined with spaces before rendering.
        title: text shown above the figure.
    """
    joined_words = " ".join(topic_words)
    cloud = WordCloud(
        background_color="white",
        width=1000,
        height=500,
    ).generate(joined_words)

    plt.figure(figsize=(5,3))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(title)
    plt.show()

print("=== WordCloud LSA Topics ===")
for i, topic_words in enumerate(lsa_topics):
    plot_wordcloud(topic_words, f"LSA Topic {i+1}")

print("=== WordCloud LDA Topics ===")
for i, topic_words in enumerate(lda_topics_fast):
    plot_wordcloud(topic_words, f"LDA Topic {i+1}")

# Evaluasi Model & Result Analysis

In [None]:
!pip install gensim

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora

cleaned_tokens = [doc.split() for doc in df["lemma_text"]]
id2word = corpora.Dictionary(cleaned_tokens)

def get_lsa_topics(lsa_model, vectorizer, topn=10):
    """Return the *topn* highest-weighted terms of every LSA component.

    Args:
        lsa_model: fitted object exposing ``components_``.
        vectorizer: object whose ``get_feature_names_out()`` maps feature
            indices to terms.
        topn: number of terms to keep per component.

    Returns:
        One list per component, ordered from the highest to the lowest weight.
    """
    vocabulary = vectorizer.get_feature_names_out()
    return [
        # Reverse the ascending argsort, then keep the leading topn indices.
        [vocabulary[i] for i in component.argsort()[::-1][:topn]]
        for component in lsa_model.components_
    ]

lsa_topics = get_lsa_topics(lsa_model, tfidf)

coherence_lsa = CoherenceModel(
    topics=lsa_topics,
    texts=cleaned_tokens,
    dictionary=id2word,
    coherence='c_v'
).get_coherence()

print("LSA Coherence Score:", coherence_lsa)

# --- Variance Ratio (LSA metric) ---
variance_ratio = lsa_model.explained_variance_ratio_.sum()
print("LSA Explained Variance Ratio:", variance_ratio)

In [None]:
from gensim import matutils

corpus = matutils.Sparse2Corpus(X_count, documents_columns=False)

# Coherence
coherence_lda = CoherenceModel(
    topics=lda_topics_fast,
    texts=cleaned_tokens,
    dictionary=id2word,
    coherence='c_v'
).get_coherence()

print("LDA Coherence (C_v):", coherence_lda)

# Perplexity
perplexity = lda_fast.perplexity(X_count)
print("LDA Perplexity:", perplexity)

In [None]:
def plot_wordcloud(words, title):
    """Render *words* as an 800x400 word cloud in a 10x5 figure titled *title*."""
    cloud = WordCloud(width=800, height=400)
    cloud.generate(" ".join(words))

    plt.figure(figsize=(10,5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(title)
    plt.show()

# --- LSA Wordcloud ---
for i, topic_words in enumerate(lsa_topics):
    plot_wordcloud(topic_words, f"LSA Topic {i}")

# --- LDA Wordcloud ---
# The `get_lda_topics` function (defined in cell qpzDZCeWwm73) is used for sklearn's LDA model.
# It returns a list of lists, where each inner list contains the top words for a topic.
lda_topics_for_plot = get_lda_topics(lda_fast, terms_count)
for i, topic_words in enumerate(lda_topics_for_plot):
    plot_wordcloud(topic_words, f"LDA Topic {i}")

In [None]:
for i, topic in enumerate(lsa_topics):
    print(f"LSA Topic {i}: {topic}")

In [None]:
for i, topic_words in enumerate(lda_topics_fast):
    print(f"\nLDA Topic {i}:")
    print(topic_words)