In [None]:
# Import necessary libraries
import pandas as pd
import spacy
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA, NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import gensim
import gensim.corpora as corpora
from gensim.models import LdaModel, CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from wordcloud import WordCloud

# Load the raw complaints export into a DataFrame.
# NOTE(review): the r-prefix combined with doubled backslashes leaves literal
# "\\" pairs in the path; this presumably still resolves on Windows - confirm.
file_path = r"C:\\projects\\rows.csv"
# low_memory=False parses the file in one pass rather than in chunks.
df = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

# Print the schema summary first, then render the leading rows.
print("\n🔹 Dataset Overview:")
df.info()

print("\n🔹 Sample Data:")
display(df.head(n=5))

# Handle missing values: rows without a complaint narrative are dropped,
# while the listed optional columns get an explicit placeholder instead.
df = df.dropna(subset=["Consumer complaint narrative"])
placeholder_by_column = {
    "Company public response": "No response",
    "Tags": "No tags",
    "Consumer consent provided?": "Unknown",
}
df = df.fillna(placeholder_by_column)

# Small English spaCy pipeline, used by the text preprocessing step.
nlp = spacy.load("en_core_web_sm")

# Preprocessing function
def preprocess_text(text):
    """Normalise one complaint narrative into a space-joined lemma string.

    Null input (``None`` / NaN) yields an empty string. Otherwise the text is
    lower-cased, run through the ``nlp`` pipeline, and only the lemmas of
    alphabetic, non-stop-word tokens are kept.
    """
    if pd.isna(text):
        return ""
    return " ".join(
        tok.lemma_
        for tok in nlp(text.lower())
        if tok.is_alpha and not tok.is_stop
    )

# Sample and preprocess
# Cap the sample at the number of available rows: DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without
# replacement), which would happen if fewer than 5000 narratives remain.
sample_size = min(5000, len(df))
df_sample = df.sample(sample_size, random_state=42).copy()
df_sample["cleaned_text"] = df_sample["Consumer complaint narrative"].apply(preprocess_text)

# TF-IDF Vectorization
# Vocabulary is limited to the 5000 most frequent terms; the dropna() is a
# safeguard so null cleaned texts never reach the vectorizer.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_sample["cleaned_text"].dropna())

# spaCy Embeddings
# A second, medium-sized English pipeline is loaded for document vectors.
nlp_md = spacy.load("en_core_web_md")


def get_spacy_embedding(text):
    """Return the spaCy document vector for ``text``.

    The text is passed through the ``nlp_md`` pipeline and the resulting
    ``Doc.vector`` is returned as-is.
    """
    return nlp_md(text).vector

# Keep only rows that have a cleaned text, then embed each one.
has_cleaned_text = df_sample["cleaned_text"].notna()
df_sample = df_sample.loc[has_cleaned_text].copy()
df_sample["spacy_embedding"] = df_sample["cleaned_text"].apply(get_spacy_embedding)
# Stack the per-row vectors into one 2-D array (one row per document).
spacy_embeddings = np.vstack(df_sample["spacy_embedding"].tolist())

# Dimensionality Reduction (PCA)
# random_state is pinned like every other estimator in this notebook: for
# inputs of this size PCA may pick the randomized SVD solver, which would
# otherwise make the 2-D projections (and the plots) differ between runs.
# PCA is given a dense array, so the sparse TF-IDF matrix is densified first.
pca_tfidf = PCA(n_components=2, random_state=42)
tfidf_2d = pca_tfidf.fit_transform(tfidf_matrix.toarray())

pca_spacy = PCA(n_components=2, random_state=42)
spacy_2d = pca_spacy.fit_transform(spacy_embeddings)

# Visualizations
def _scatter_2d(points, title, **scatter_kwargs):
    """Show a scatter plot of the first two columns of ``points``.

    ``title`` is used as the figure title; any extra keyword arguments are
    forwarded to ``plt.scatter`` (e.g. ``color``).
    """
    plt.figure(figsize=(8, 6))
    plt.scatter(points[:, 0], points[:, 1], alpha=0.5, **scatter_kwargs)
    plt.title(title)
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.show()


_scatter_2d(tfidf_2d, "TF-IDF Embeddings (PCA)")
_scatter_2d(spacy_2d, "spaCy Embeddings (PCA)", color='red')

# Cosine Similarity
# Pairwise similarity between all TF-IDF document vectors.
tfidf_cosine_sim = cosine_similarity(tfidf_matrix)
# Zero the diagonal so a document is never reported as its own best match.
np.fill_diagonal(tfidf_cosine_sim, 0)
flat_best = tfidf_cosine_sim.argmax()
most_similar_idx = np.unravel_index(flat_best, tfidf_cosine_sim.shape)
first_pos, second_pos = most_similar_idx
cleaned_texts = df_sample["cleaned_text"]

print(f"Most similar complaints (TF-IDF): {most_similar_idx}")
print(f"Complaint 1: {cleaned_texts.iloc[first_pos]}")
print(f"Complaint 2: {cleaned_texts.iloc[second_pos]}")

# Clustering with K-Means
# Fit on the spaCy document vectors and store each document's cluster label.
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(spacy_embeddings)
df_sample["cluster"] = kmeans.labels_

# Extract Top Words Per Cluster
clusters = df_sample["cluster"].values
feature_names = tfidf_vectorizer.get_feature_names_out()
top_n = 10

# One row per cluster: the mean TF-IDF weight of every vocabulary term over
# the documents assigned to that cluster.
cluster_tfidf = np.zeros((num_clusters, tfidf_matrix.shape[1]))
for cluster_id in range(num_clusters):
    member_rows = tfidf_matrix[clusters == cluster_id]
    cluster_tfidf[cluster_id] = np.asarray(member_rows.mean(axis=0)).ravel()

# Report the highest-weighted terms of each cluster, strongest first.
for cluster_id, mean_weights in enumerate(cluster_tfidf):
    ranked_terms = mean_weights.argsort()[::-1][:top_n]
    top_words = [feature_names[term_idx] for term_idx in ranked_terms]
    print(f"Cluster {cluster_id} top words: {', '.join(top_words)}")

# Optimal K for LDA
# Split each cleaned text on whitespace to get one token list per document,
# then build the gensim vocabulary and the bag-of-words corpus from them.
texts = df_sample["cleaned_text"].str.split()
dictionary = corpora.Dictionary(documents=texts)
corpus = list(map(dictionary.doc2bow, texts))

def compute_coherence_values(dictionary, corpus, texts, start, limit, step):
    """Train one LDA model per topic count and score each with c_v coherence.

    Topic counts are taken from ``range(start, limit, step)``. Every model is
    trained on ``corpus`` with ``dictionary`` as its id-to-word mapping, using
    a fixed seed (42) and 10 passes; its coherence is computed from ``texts``.

    Returns:
        A ``(model_list, coherence_values)`` pair of parallel lists, ordered
        by topic count. Both are empty when the range is empty.
    """
    fitted_models, scores = [], []
    for topic_count in range(start, limit, step):
        lda = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=topic_count,
            random_state=42,
            passes=10,
            per_word_topics=True,
        )
        scorer = CoherenceModel(
            model=lda, texts=texts, dictionary=dictionary, coherence='c_v'
        )
        fitted_models.append(lda)
        scores.append(scorer.get_coherence())
    return fitted_models, scores

# Train LDA for K = 2..10 and record the coherence of each model.
start, limit, step = 2, 11, 1
model_list, coherence_values = compute_coherence_values(
    dictionary=dictionary,
    corpus=corpus,
    texts=texts,
    start=start,
    limit=limit,
    step=step,
)

# Coherence as a function of the number of topics.
topic_counts = range(start, limit, step)
plt.figure(figsize=(8, 5))
plt.plot(topic_counts, coherence_values, marker='o')
plt.title("LDA Coherence Score vs. Number of Topics")
plt.xlabel("Num Topics (K)")
plt.ylabel("Coherence Score")
plt.grid(True)
plt.show()

# The best model is the one with the highest coherence; its position in the
# list is mapped back to the topic count it was trained with.
optimal_index = np.argmax(coherence_values)
best_lda_model = model_list[optimal_index]
optimal_k_lda = start + optimal_index * step
print(f"✅ Optimal K for LDA: {optimal_k_lda}")

# Optimal K for NMF
# Fit one NMF model per candidate K and record its reconstruction error.
nmf_errors = []
k_range = range(2, 11)
for k in k_range:
    nmf_model = NMF(n_components=k, random_state=42, max_iter=1000)
    W = nmf_model.fit_transform(tfidf_matrix)
    nmf_errors.append(nmf_model.reconstruction_err_)

plt.figure(figsize=(8, 5))
plt.plot(k_range, nmf_errors, marker='x', color='orange')
plt.xlabel("Num Topics (K)")
plt.ylabel("Reconstruction Error")
plt.title("NMF Error vs. Number of Topics")
plt.grid(True)
plt.show()

# Reconstruction error generally keeps falling as components are added, so
# taking its minimum would simply return the largest K tried. Pick the elbow
# instead: after scaling both axes to [0, 1], choose the point lying furthest
# below the straight line joining the first and last points of the curve.
k_values = np.asarray(k_range, dtype=float)
errors = np.asarray(nmf_errors, dtype=float)
error_drop = errors[0] - errors[-1]
if len(k_values) > 2 and error_drop > 0:
    k_scaled = (k_values - k_values[0]) / (k_values[-1] - k_values[0])
    err_scaled = (errors - errors[-1]) / error_drop
    # The chord runs from (0, 1) to (1, 0), i.e. x + y = 1.
    elbow_index = int(np.argmax(1.0 - k_scaled - err_scaled))
else:
    # Degenerate curve (too few points or no overall decrease): fall back to
    # the smallest observed error.
    elbow_index = int(np.argmin(errors))

optimal_k_nmf = k_range[elbow_index]
print(f"✅ Optimal K for NMF: {optimal_k_nmf}")

# Display Topics
def display_topics(model, feature_names, num_words=10):
    """Print the strongest terms of every topic in a fitted model.

    Args:
        model: object exposing ``components_``, one weight vector per topic.
        feature_names: sequence mapping a column index to its term.
        num_words: how many top-weighted terms to list per topic.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # Ascending argsort reversed -> indices from heaviest to lightest.
        strongest_first = weights.argsort()[::-1][:num_words]
        print(f"\nTopic {topic_number}:")
        print(", ".join(feature_names[term_idx] for term_idx in strongest_first))

# Top words of every topic in the selected LDA model.
print(f"\n🔹 Final LDA Topics (K={optimal_k_lda}):")
lda_topics = best_lda_model.show_topics(num_topics=optimal_k_lda, formatted=False)
for topic_number, (_topic_id, word_weights) in enumerate(lda_topics, start=1):
    print(f"\nTopic {topic_number}:")
    print(", ".join(word for word, _weight in word_weights))

# Refit NMF with the selected number of components and list its topics.
print(f"\n🔹 Final NMF Topics (K={optimal_k_nmf}):")
nmf_final = NMF(n_components=optimal_k_nmf, random_state=42, max_iter=1000).fit(tfidf_matrix)
display_topics(nmf_final, tfidf_vectorizer.get_feature_names_out())

# LDA Visualization
# Build the pyLDAvis data for the selected LDA model; the display call is kept
# as the final expression so the notebook renders its return value.
lda_vis = gensimvis.prepare(
    topic_model=best_lda_model,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.display(lda_vis)