Fetching and Cleaning

In [None]:
import sys, os, requests, pandas as pd
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from dataPipeline.services.nlp_engine import NLPEngine

# We fetch from the dedicated English endpoint
API_URL = "http://localhost:8000/api/retrieve/cmtsep/en"
# requests has no default timeout, so bound the wait; also fail fast on an HTTP
# error instead of trying to build a DataFrame from an error payload.
response = requests.get(API_URL, timeout=30)
response.raise_for_status()
df = pd.DataFrame(response.json())

# Bind `unprocessed` up front: later cells test `unprocessed.empty`
# unconditionally, so the name must exist even when the endpoint returns no rows.
unprocessed = pd.DataFrame()

if df.empty:
    print("[!] No English comments found.")
else:
    # Double-check filter: ensure we only have 'en' and unprocessed rows
    # (a row counts as unprocessed when `cleaned_text` is null or an empty string).
    unprocessed = df[
        (df['language'] == 'en') &
        (df['cleaned_text'].isna() | (df['cleaned_text'] == ""))
    ].copy()

    if not unprocessed.empty:
        # 1. Cleaning: keep the token lists for topic modeling and a joined
        #    string version for sentiment analysis / persistence.
        unprocessed['token_list'] = NLPEngine.clean_comments(unprocessed['comment'])
        unprocessed['cleaned_text'] = unprocessed['token_list'].apply(lambda x: " ".join(x))

        # 2. Sentiment (New Step)
        unprocessed['sentiment'] = NLPEngine.analyze_sentiment_lstm(unprocessed['cleaned_text'])

        # Drop comments that cleaning reduced to nothing; they carry no tokens to model.
        unprocessed = unprocessed[unprocessed['cleaned_text'] != ""].copy()
        print(f"[*] Prepared {len(unprocessed)} English comments for Topic Modeling.")
    else:
        print("[!] All English comments already processed.")

[*] Prepared 5 comments.


Topic Modeling Accuracy

In [None]:
if not unprocessed.empty:
    # Build the TF-IDF corpus from the cleaned token lists and let NLPEngine
    # pick the best-scoring topic model among the candidates it compares.
    tokens = unprocessed['token_list'].tolist()
    tfidf_corpus, dictionary = NLPEngine.get_tfidf_corpus(tokens)
    winner_name, best_model, all_scores = NLPEngine.compare_models(tfidf_corpus, dictionary, tokens)

    print(f"Accuracy Scores: {all_scores}")
    print(f"Winner: {winner_name}")

    # Tag comments with Topic ID and Confidence
    topics_data = []
    for bow in tfidf_corpus:
        # Highest-weighted (topic_id, weight) pair. `max` returns the first of any
        # tied entries, which matches the stable descending sort used before.
        top = max(best_model[bow], key=lambda x: x[1], default=None)
        if top is None:
            # The model produced no topics for this document; record "no topic"
            # instead of crashing on an empty list.
            topics_data.append((None, 0.0))
        else:
            topics_data.append((f"{winner_name}_Topic_{top[0]}", float(top[1])))

    unprocessed['dominant_topic'], unprocessed['topic_confidence'] = zip(*topics_data)

    # Extract Vector Weights
    taxonomy_vectors = NLPEngine.extract_taxonomy(best_model)

Accuracy Scores: {'LDA': np.float64(0.5238345532119901), 'LSA': np.float64(0.5151676576329302), 'NMF': np.float64(0.5182506214086058)}
Best Model: LDA


Extract vectorized topics and tag comments

In [8]:
# Guarded like the neighbouring cells: `tfidf_corpus` and `best_model` only exist
# when there were unprocessed comments to model.
if not unprocessed.empty:
    # 1. Tag each comment with its dominant topic from the winner
    # NOTE(review): the topic-modeling cell writes labels as
    # `{winner_name}_Topic_{id}`; this cell overwrites them with `Topic_{id}`.
    # Confirm which format downstream consumers expect.
    topics_data = []
    for bow in tfidf_corpus:
        # Highest-weighted (topic_id, weight) pair; first entry wins on ties,
        # same as a stable descending sort.
        top = max(best_model[bow], key=lambda x: x[1], default=None)
        if top is None:
            # No topics returned for this document; avoid indexing an empty list.
            topics_data.append((None, 0.0))
        else:
            topics_data.append((f"Topic_{top[0]}", float(top[1])))

    unprocessed['dominant_topic'], unprocessed['topic_confidence'] = zip(*topics_data)

    # 2. Extract the Vectorized Taxonomy (Word Weights)
    taxonomy_vectors = NLPEngine.extract_taxonomy(best_model)

    # --- VERIFICATION STEP: SHOW THE VECTORS ---
    print("\n--- VECTORIZED TOPIC TAXONOMY (Word Weights) ---")
    tax_df = pd.DataFrame(taxonomy_vectors, columns=['Topic', 'Word', 'Weight'])
    display(tax_df.head(15)) # This shows you the vectors in the notebook!


--- VECTORIZED TOPIC TAXONOMY (Word Weights) ---


Unnamed: 0,Topic,Word,Weight
0,Topic_0,way,0.066957
1,Topic_0,level,0.063868
2,Topic_0,teaching,0.063566
3,Topic_0,next,0.063544
4,Topic_0,supposed,0.059817
5,Topic_0,silly,0.059663
6,Topic_0,write,0.059598
7,Topic_0,ampersandinvertedquestionmarkcaretsevenfivetri...,0.059386
8,Topic_0,love,0.05923
9,Topic_0,teach,0.058566


Persist results to the database

In [None]:
import psycopg2
from psycopg2.extras import execute_values

def persist_results(df, taxonomy):
    """Write per-comment NLP results and the topic taxonomy to PostgreSQL.

    Args:
        df: DataFrame providing the columns `cleaned_text`, `dominant_topic`,
            `topic_confidence`, `sentiment` and `id`. Each row updates the
            `airflow.processed_comments` row whose `comment_id` equals `id`.
        taxonomy: Sequence of `(topic_id, word, weight)` rows that replaces the
            current contents of `airflow.topic_taxonomy`.

    Returns:
        None. Any exception is caught and printed rather than raised, so a DB
        failure does not abort the notebook run.
    """
    # Bind before `try` so the `finally` clause is safe when connect() itself fails;
    # otherwise an UnboundLocalError would mask the real connection error.
    conn = None
    try:
        # NOTE(review): connection settings and credentials are hard-coded here;
        # consider loading them from the environment or a secrets store.
        conn = psycopg2.connect(host="127.0.0.1", database="data_pipeline", user="admin", password="admin")

        # The cursor context manager closes the cursor on exit (it does not commit).
        with conn.cursor() as cur:
            # 1. Update Comments Table (Includes sentiment and cleaned_text)
            # Order: cleaned_text, dominant_topic, topic_confidence, sentiment, id
            # This order must match the val(t, tp, c, s, id) alias list below.
            comment_data = list(
                df[['cleaned_text', 'dominant_topic', 'topic_confidence', 'sentiment', 'id']]
                .itertuples(index=False, name=None)
            )

            update_sql = """
                UPDATE airflow.processed_comments
                SET cleaned_text = val.t,
                    dominant_topic = val.tp,
                    topic_confidence = val.c,
                    sentiment = val.s,
                    updated_at = CURRENT_TIMESTAMP
                FROM (VALUES %s) AS val(t, tp, c, s, id)
                WHERE comment_id = val.id;
            """
            execute_values(cur, update_sql, comment_data)

            # 2. Update Taxonomy (Word Weights): full replace inside the same transaction.
            # NOTE(review): the DELETE runs even when `taxonomy` is empty - confirm
            # that wiping the table in that case is intended.
            cur.execute("CREATE TABLE IF NOT EXISTS airflow.topic_taxonomy (topic_id TEXT, word TEXT, weight FLOAT)")
            cur.execute("DELETE FROM airflow.topic_taxonomy")
            execute_values(cur, "INSERT INTO airflow.topic_taxonomy (topic_id, word, weight) VALUES %s", taxonomy)

        conn.commit()
        print("[SUCCESS] DB updated with English Sentiment and Topic Taxonomy.")
    except Exception as e:
        # Best-effort persistence: report and continue. Nothing was committed, and
        # closing the connection below discards the pending transaction.
        print(f"[X] DB Error: {e}")
    finally:
        if conn is not None:
            conn.close()

# Write to the database only when this run produced rows to persist.
if not unprocessed.empty:
    persist_results(df=unprocessed, taxonomy=taxonomy_vectors)

[SUCCESS] Database updated with Cleaned Text and Topic Vectors.
