In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import json
import os
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Process textual data - split into sentences

In [2]:
# Load the raw documents and expose the 'sentence' column under the name 'text'.
df_docs = pd.read_csv('df_docs_eng.csv').rename(columns={'sentence': 'text'})
df_docs

Unnamed: 0,document_id,text
0,0,Thomas Ibarra was a senior at the prestigious ...
1,1,Timothy Wilson was a senior at a prestigious u...
2,2,Nancy Brown was a sophomore at Iowa State Univ...
3,3,Donna Reyes was a senior at her university in ...
4,4,Mary Gonzales was a freshman at Ontario Univer...
...,...,...
23231,23231,Caitlin Orozco was a junior at a prestigious u...
23232,23232,Aaron Robinson was a freshman at the Universit...
23233,23233,Donald Hawkins DDS was a sophomore at the pres...
23234,23234,David Shelton was a junior at the University o...


In [3]:
# Split every document into sentences: one output row per sentence.
# document_id is read from the document_id column (not the positional row
# label) so it always matches the key the later merges join on.
# Non-string text (e.g. NaN from an empty CSV cell) is skipped, because
# sent_tokenize only accepts strings.
rows = [
    {'document_id': document_id, 'sentence_id': sentence_id, 'sentence': sentence}
    for document_id, text in zip(df_docs['document_id'], df_docs['text'])
    if isinstance(text, str)
    for sentence_id, sentence in enumerate(sent_tokenize(text))
]

# Create final DataFrame (explicit columns keep the schema stable even when rows is empty)
df = pd.DataFrame(rows, columns=['document_id', 'sentence_id', 'sentence'])

# Preview the result
df.head()

   document_id  sentence_id                                           sentence
0            0            0  Thomas Ibarra was a senior at the prestigious ...
1            0            1  He was a diligent student, but his grades did ...
2            0            2  With a GPA of 2.83, Thomas was often overlooke...
3            0            3  But what they didn't know was that Thomas had ...
4            0            4  Born and raised in Massachusetts, Thomas had a...


Sentiment Analysis with XLM-RoBERTa (sentence level)

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

def SentimentAnalysis_LDA(df_docs, batch_size=32):
    """Score every sentence with a pretrained sentiment classifier.

    Despite the name, no LDA is involved here: the function runs the
    "cardiffnlp/twitter-xlm-roberta-base-sentiment" sequence classifier over
    the 'sentence' column and appends one score column per label.

    Parameters
    ----------
    df_docs : pandas.DataFrame
        Must contain a 'sentence' column of strings.
    batch_size : int, default 32
        Number of sentences sent through the model per forward pass.

    Returns
    -------
    pandas.DataFrame
        The input columns plus 'negative', 'neutral' and 'positive', one row
        per input row. The same frame is also written to
        'student_profiles_sentence_sentiment.csv'.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Sentiment model
    model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model=model,
        tokenizer=tokenizer,
        # A torch.device does not compare equal to the plain string "cuda",
        # so the device kind has to be read from .type.
        device=0 if device.type == "cuda" else -1,
        top_k=None,  # return the score of every label (replaces deprecated return_all_scores=True)
    )

    labels = ("negative", "neutral", "positive")
    sentences = df_docs['sentence'].tolist()

    # Skip inference entirely for an empty frame; otherwise let the pipeline batch the work.
    if sentences:
        results = sentiment_pipeline(
            sentences,
            batch_size=batch_size,
            truncation=True,
            padding=True,
            max_length=512,
        )
    else:
        results = []

    sentiment_scores = []
    for result in results:
        by_label = {entry['label'].lower(): entry['score'] for entry in result}
        sentiment_scores.append({label: by_label.get(label, 0.0) for label in labels})

    # Reuse the input index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would produce NaN-padded rows whenever df_docs is not
    # indexed 0..n-1 (e.g. after filtering).
    scores_df = pd.DataFrame(sentiment_scores, columns=list(labels), index=df_docs.index)
    df_lda = pd.concat([df_docs, scores_df], axis=1)
    df_lda.to_csv('student_profiles_sentence_sentiment.csv', index=False)
    return df_lda

In [5]:
# Score every sentence in df; the call also writes
# 'student_profiles_sentence_sentiment.csv' as a side effect.
df_lda = SentimentAnalysis_LDA(df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Device set to use cuda:0


In [6]:
# Preview the first rows of the sentence-level sentiment table.
df_lda.head()

Unnamed: 0,document_id,sentence_id,sentence,negative,neutral,positive
0,0,0,Thomas Ibarra was a senior at the prestigious ...,0.033691,0.764755,0.201554
1,0,1,"He was a diligent student, but his grades did ...",0.594215,0.310364,0.095421
2,0,2,"With a GPA of 2.83, Thomas was often overlooke...",0.840266,0.134425,0.025308
3,0,3,But what they didn't know was that Thomas had ...,0.383633,0.535986,0.080381
4,0,4,"Born and raised in Massachusetts, Thomas had a...",0.056633,0.327467,0.6159


Topic Modeling with LDA

In [7]:
import nltk
import string
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Fetch the NLTK data packages used below (omw-1.4 is needed by the lemmatizer)
for _package in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_package)

# Shared text-normalization helpers used by clean_token
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_token(text):
    """Convert raw text into a list of normalized word stems.

    The text is tokenized, lower-cased, stripped of stop words, punctuation
    and any token that is not purely alphabetic; each surviving token is
    lemmatized and then stemmed. Non-string input yields an empty list.
    """
    if not isinstance(text, str):
        return []

    normalized = []
    for raw in word_tokenize(text):
        lowered = raw.lower()
        # Same filters as before, applied in a single pass per token.
        if lowered in stop_words or raw in string.punctuation:
            continue
        if not lowered.isalpha():
            continue
        normalized.append(stemmer.stem(lemmatizer.lemmatize(lowered)))
    return normalized

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [8]:
def preprocess_data(original_df, required_columns):
    """Return a tokenized copy of ``original_df`` for LDA.

    Rows in which every one of ``required_columns`` is missing are dropped.
    For each required column ``c`` a new column ``c_tokens`` holds the output
    of ``clean_token`` for that cell. The input frame is not modified.
    """
    print('start preprocess_data for LDA')
    prepared = original_df.dropna(subset=required_columns, how='all').copy()
    for name in required_columns:
        token_column = f"{name}_tokens"
        prepared[token_column] = prepared[name].apply(clean_token)
    return prepared

In [9]:
!pip install gensim
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

def identify_document_topics_lda(tokenized_texts, num_topics=100, passes=10,
                                 random_state=42, alpha='auto'):
    """Fit a gensim LDA model and pick the dominant topic of every document.

    Parameters
    ----------
    tokenized_texts : iterable of list of str
        One token list per document.
    num_topics : int, default 100
        Number of topics to fit.
    passes : int, default 10
        Number of training passes, forwarded to ``LdaModel``.
    random_state : int, default 42
        Seed forwarded to ``LdaModel``.
    alpha : default 'auto'
        Forwarded to ``LdaModel`` as its ``alpha`` argument.

    Returns
    -------
    tuple
        ``(topic_numbers, topic_probabilities, lda_model, dictionary, corpus)``.
        The first two lists have one entry per document; a document for which
        the model reports no topic gets ``-1`` and ``0.0``.
    """
    # Materialize once: the texts are walked twice (dictionary, then corpus),
    # which would silently exhaust a generator.
    tokenized_texts = list(tokenized_texts)

    dictionary = Dictionary(tokenized_texts)
    corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

    print(f"🧠 Training LDA model with {num_topics} topics...")
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=random_state,
        passes=passes,
        alpha=alpha
    )

    # Assign dominant topic per document
    topic_numbers = []
    topic_probabilities = []

    for doc in corpus:
        doc_topics = lda_model.get_document_topics(doc)
        if doc_topics:
            topic_id, probability = max(doc_topics, key=lambda pair: pair[1])
        else:
            # No topic reported for this document
            topic_id, probability = -1, 0.0
        topic_numbers.append(topic_id)
        topic_probabilities.append(probability)

    print("✅ Topic assignment complete.")
    return topic_numbers, topic_probabilities, lda_model, dictionary, corpus



In [10]:
def save_lda_topic_summary(lda_model, corpus, dictionary, filename="Topics_by_Docs_Info_LDA.xlsx"):
    """Write a per-topic summary (document count and top words) to an Excel file.

    Parameters
    ----------
    lda_model
        Fitted model exposing ``num_topics``, ``get_document_topics`` and
        ``show_topic``.
    corpus : iterable
        Bag-of-words documents to count dominant topics over.
    dictionary
        Not used by this function; kept so existing callers keep working.
    filename : str
        Path of the Excel file to write.
    """
    num_topics = lda_model.num_topics

    # Count number of documents assigned to each topic
    topic_counts = [0] * num_topics
    for doc in corpus:
        topic_probs = lda_model.get_document_topics(doc)
        if not topic_probs:
            # No topic reported for this document: max() on an empty list
            # would raise ValueError, so it is simply not counted.
            continue
        dominant_topic = max(topic_probs, key=lambda pair: pair[1])[0]
        topic_counts[dominant_topic] += 1

    # Extract top words for each topic
    topic_word_info = [
        {
            "Topic Number": topic_idx,
            "Document Count": topic_counts[topic_idx],
            "Top Words": ", ".join(word for word, _ in lda_model.show_topic(topic_idx, topn=10)),
        }
        for topic_idx in range(num_topics)
    ]

    # Save as Excel
    df_summary = pd.DataFrame(topic_word_info)
    df_summary.to_excel(filename, index=False)
    print(f"Gensim LDA topic summary saved to {filename}")

In [11]:
import pandas as pd
from gensim.models.coherencemodel import CoherenceModel

def analyze_topics(df, num_topics=100):
    """Fit LDA on the 'sentence' column and attach the dominant topic per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Sentence table with a 'sentence' column. NOTE: as before, the
        'Topic_number' / 'Topic_probability' columns are added to this frame
        in place and an 'id' column, if present, is renamed to 'document_id'.
    num_topics : int, default 100
        Number of LDA topics, forwarded to ``identify_document_topics_lda``.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the 'sentence' column. Rows whose sentence is missing
        get topic ``-1`` with probability ``0.0``.

    Side effects: writes the topic summary (via ``save_lda_topic_summary``),
    'student_profiles_topic_evaluation_metrics_LDA.csv' and
    'student_profiles_Topics_LDA.csv'.
    """
    df_docs = preprocess_data(df, ['sentence'])
    print("start analyze_topics_in_data with LDA")
    tokenized_texts = df_docs['sentence_tokens'].tolist()
    topic_numbers, topic_probability, lda_model, dictionary, corpus = identify_document_topics_lda(
        tokenized_texts, num_topics=num_topics
    )

    if len(topic_numbers) == len(df):
        df['Topic_number'] = topic_numbers
        df['Topic_probability'] = topic_probability
    else:
        # preprocess_data dropped the rows whose sentence is missing, so the
        # topic lists are shorter than df. Fill those rows with the
        # "no topic" markers and write the results back positionally.
        kept = df['sentence'].notna().to_numpy()
        df['Topic_number'] = -1
        df['Topic_probability'] = 0.0
        df.loc[kept, 'Topic_number'] = topic_numbers
        df.loc[kept, 'Topic_probability'] = topic_probability

    df.rename(columns={'id': 'document_id'}, inplace=True)
    save_lda_topic_summary(lda_model, corpus, dictionary)

    # Evaluate coherence and diversity
    print("start Evaluate coherence and diversity with LDA")
    topic_words = [[word for word, _ in lda_model.show_topic(i)] for i in range(lda_model.num_topics)]
    coherence_model = CoherenceModel(topics=topic_words, texts=tokenized_texts, dictionary=dictionary,
                                     coherence='c_v')
    coherence_score = coherence_model.get_coherence()

    # Diversity = share of distinct words among all top-word slots.
    unique_words = set(word for topic in topic_words for word in topic)
    total_slots = len(topic_words) * len(topic_words[0]) if topic_words else 0
    topic_diversity = len(unique_words) / total_slots if total_slots else 0.0

    metrics_df = pd.DataFrame([{
        "coherence_score": coherence_score,
        "topic_diversity": topic_diversity
    }])
    metrics_df.to_csv("student_profiles_topic_evaluation_metrics_LDA.csv", index=False)

    df = df.drop(columns='sentence', errors='ignore')
    df.to_csv('student_profiles_Topics_LDA.csv', index=False)
    print("analyze_topics_in_data complete with LDA")
    return df

In [12]:
# Fit LDA on the sentence table. Besides returning the per-sentence topics,
# the call writes the topic summary, the evaluation metrics CSV and
# 'student_profiles_Topics_LDA.csv'.
df_topic = analyze_topics(df)

start preprocess_data for LDA
start analyze_topics_in_data with LDA
🧠 Training LDA model with 100 topics...
✅ Topic assignment complete.
Gensim LDA topic summary saved to Topics_by_Docs_Info_LDA.xlsx
start Evaluate coherence and diversity with LDA
analyze_topics_in_data complete with LDA


Aggregate Topics and Sentiment at the Document Level

In [13]:
import pandas as pd
# Reload the sentence-level outputs that analyze_topics and
# SentimentAnalysis_LDA wrote to disk.
df_topic = pd.read_csv('student_profiles_Topics_LDA.csv')
df_lda = pd.read_csv('student_profiles_sentence_sentiment.csv')

In [19]:
# For every document keep only the sentence row whose topic probability is
# highest, restricted to the key and topic columns.
best_row_labels = df_topic.groupby('document_id')['Topic_probability'].idxmax()
df_topic = df_topic.loc[best_row_labels, ['document_id', 'Topic_number', 'Topic_probability']]

In [20]:
# Per document, take the maximum score that any of its sentences reached in
# each sentiment category (the result is indexed by document_id).
df_lda = df_lda.groupby('document_id')[['negative', 'neutral', 'positive']].max()

In [25]:
# Join the document-level topics with the document-level sentiment scores
df_merged = df_topic.merge(df_lda, on='document_id', how='inner')
df_merged

Unnamed: 0,document_id,Topic_number,Topic_probability,negative,neutral,positive
0,0,16,0.377311,0.840266,0.764755,0.873165
1,1,71,0.313457,0.587833,0.756060,0.893440
2,2,93,0.328709,0.382960,0.830119,0.924460
3,3,16,0.377269,0.581773,0.863945,0.870270
4,4,84,0.210042,0.840654,0.810793,0.823824
...,...,...,...,...,...,...
23231,23231,28,0.286856,0.619641,0.792907,0.893017
23232,23232,28,0.303970,0.399469,0.791283,0.917694
23233,23233,89,0.327002,0.680071,0.818212,0.920701
23234,23234,16,0.307594,0.187745,0.744787,0.828960


In [26]:
sentiment_cols = ['negative', 'neutral', 'positive']

# Strongest of the three sentiment scores, and the label it belongs to
# (both computed before any rounding)
df_merged['Max_Sentiments_Score'] = df_merged[sentiment_cols].max(axis=1)
df_merged['Sentiment'] = df_merged[sentiment_cols].idxmax(axis=1)

# Round every numeric score column to three decimals for reporting
rounded_cols = ['Topic_probability', *sentiment_cols, 'Max_Sentiments_Score']
df_merged[rounded_cols] = df_merged[rounded_cols].round(3)
df_merged

Unnamed: 0,document_id,Topic_number,Topic_probability,negative,neutral,positive,Max_Sentiments_Score,Sentiment
0,0,16,0.377,0.840,0.765,0.873,0.873,positive
1,1,71,0.313,0.588,0.756,0.893,0.893,positive
2,2,93,0.329,0.383,0.830,0.924,0.924,positive
3,3,16,0.377,0.582,0.864,0.870,0.870,positive
4,4,84,0.210,0.841,0.811,0.824,0.841,negative
...,...,...,...,...,...,...,...,...
23231,23231,28,0.287,0.620,0.793,0.893,0.893,positive
23232,23232,28,0.304,0.399,0.791,0.918,0.918,positive
23233,23233,89,0.327,0.680,0.818,0.921,0.921,positive
23234,23234,16,0.308,0.188,0.745,0.829,0.829,positive


In [27]:
# Reload the raw documents, exposing the 'sentence' column as 'text'.
df_docs = pd.read_csv('df_docs_eng.csv').rename(columns={'sentence': 'text'})

In [30]:
# Attach the full document text to the document-level topic/sentiment table
df = df_merged.merge(df_docs[['document_id', 'text']], on='document_id', how='inner')

# Move 'text' so that it sits directly after the first column
ordered_cols = [name for name in df.columns if name != 'text']
ordered_cols.insert(1, 'text')
df = df[ordered_cols]

# quoting=1 is csv.QUOTE_ALL (every field quoted); utf-8-sig prepends a BOM
df.to_csv('student_profiles_documents_sentiment.csv', index=False, encoding="utf-8-sig", quoting=1)
df

Unnamed: 0,document_id,text,Topic_number,Topic_probability,negative,neutral,positive,Max_Sentiments_Score,Sentiment
0,0,Thomas Ibarra was a senior at the prestigious ...,16,0.377,0.840,0.765,0.873,0.873,positive
1,1,Timothy Wilson was a senior at a prestigious u...,71,0.313,0.588,0.756,0.893,0.893,positive
2,2,Nancy Brown was a sophomore at Iowa State Univ...,93,0.329,0.383,0.830,0.924,0.924,positive
3,3,Donna Reyes was a senior at her university in ...,16,0.377,0.582,0.864,0.870,0.870,positive
4,4,Mary Gonzales was a freshman at Ontario Univer...,84,0.210,0.841,0.811,0.824,0.841,negative
...,...,...,...,...,...,...,...,...,...
23231,23231,Caitlin Orozco was a junior at a prestigious u...,28,0.287,0.620,0.793,0.893,0.893,positive
23232,23232,Aaron Robinson was a freshman at the Universit...,28,0.304,0.399,0.791,0.918,0.918,positive
23233,23233,Donald Hawkins DDS was a sophomore at the pres...,89,0.327,0.680,0.818,0.921,0.921,positive
23234,23234,David Shelton was a junior at the University o...,16,0.308,0.188,0.745,0.829,0.829,positive
