# [Topic Modelling]

## 1. Read Data

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/Users/uriko/Desktop/UoL/Term 3/data/final.csv')
df.head()

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date,summary
0,Cleopatra,4,4,5286746,98.9,2022-06-01,Cleopatra VII Thea Philopator (Koinē Greek: Κλ...
1,Top Gun: Maverick,1024,433,4956212,76.4,2022-06-01,Top Gun: Maverick is a 2022 American action dr...
2,Roe v. Wade,305,86,4399634,78.5,2022-06-01,"Roe v. Wade, 410 U.S. 113 (1973), was a landma..."
3,Stranger Things (season 4),0,0,4320822,78.3,2022-06-01,The fourth season of the American science fict...
4,Vikram (2022 film),525,110,4135394,87.2,2022-06-01,Vikram is a 2022 Indian Tamil-language action ...


## 2. Data Preprocessing

### Generally,
* Text Cleaning: Remove any irrlevant characters, HTML tags, and punctuation.
* Tokenisation: Split the summaries into individual words.
* Stop Words Removal: Remove common stop words that do not contribute to topic differentiation.
* Stemming/Lemmatisation: Reduce words to their base or root from.

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

In [4]:
# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/uriko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/uriko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [6]:
# Text Preprocessing Function
def preprocess_text(text):
    # Tokenization and lowercasing
    tokens = text.lower().split()
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [7]:
# Apply preprocessing to summaries
df['processed_summary'] = df['summary'].apply(preprocess_text)

# Apply preprocessing to titles
df['processed_title'] = df['Page'].apply(preprocess_text)

In [8]:
df

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date,summary,processed_summary,processed_title
0,Cleopatra,4,4,5286746,98.9,2022-06-01,Cleopatra VII Thea Philopator (Koinē Greek: Κλ...,cleopatra vii thea philopator (koinē greek: κλ...,cleopatra
1,Top Gun: Maverick,1024,433,4956212,76.4,2022-06-01,Top Gun: Maverick is a 2022 American action dr...,top gun: maverick 2022 american action drama f...,top gun: maverick
2,Roe v. Wade,305,86,4399634,78.5,2022-06-01,"Roe v. Wade, 410 U.S. 113 (1973), was a landma...","roe v. wade, 410 u.s. 113 (1973), landmark dec...",roe v. wade
3,Stranger Things (season 4),0,0,4320822,78.3,2022-06-01,The fourth season of the American science fict...,fourth season american science fiction horror ...,stranger thing (season 4)
4,Vikram (2022 film),525,110,4135394,87.2,2022-06-01,Vikram is a 2022 Indian Tamil-language action ...,vikram 2022 indian tamil-language action thril...,vikram (2022 film)
...,...,...,...,...,...,...,...,...,...
5989,CEO,0,0,942147,1.7,2022-08-01,A chief executive officer (CEO) (chief executi...,chief executive officer (ceo) (chief executive...,ceo
5990,Chloe Lattanzi,28,21,935080,82.2,2022-08-01,"Chloe Rose Lattanzi (born January 17, 1986) is...","chloe rose lattanzi (born january 17, 1986) am...",chloe lattanzi
5991,Biagio da Cesena,9,8,933621,82.8,2022-08-01,"Biagio Martinelli (Cesena 1463 – Rome 1544), b...","biagio martinelli (cesena 1463 – rome 1544), b...",biagio da cesena
5992,Microsoft Windows,3,2,932748,7.1,2022-08-01,Microsoft Windows is a product line of proprie...,microsoft window product line proprietary grap...,microsoft window


## 3. Feature Extraction

### TF-IDF

In [9]:
# TF-IDF Vectorization for summaries
tfidf_vectorizer_summaries = TfidfVectorizer(max_features=1000)
tfidf_matrix_summaries = tfidf_vectorizer_summaries.fit_transform(df['processed_summary'])

In [10]:
# TF-IDF Vectorization for titles
tfidf_vectorizer_titles = TfidfVectorizer(max_features=1000)
tfidf_matrix_titles = tfidf_vectorizer_titles.fit_transform(df['processed_title'])

## 4. Topic Modelling Algorithms

### LDA

In [11]:
# LDA Topic Modeling for summaries
lda_model_summaries = LatentDirichletAllocation(n_components=10, random_state=42)
lda_matrix_summaries = lda_model_summaries.fit_transform(tfidf_matrix_summaries)

In [12]:
# LDA Topic Modeling for titles
lda_model_titles = LatentDirichletAllocation(n_components=10, random_state=42)
lda_matrix_titles = lda_model_titles.fit_transform(tfidf_matrix_titles)

In [13]:
# Display Topics Function
def display_topics(model, feature_names, no_top_words):
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        topics[topic_idx] = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
    return topics

In [14]:
# Get the top words for each topic for summaries
topics_summaries = display_topics(lda_model_summaries, tfidf_vectorizer_summaries.get_feature_names_out(), 10)
print("Topics from summaries:")
for topic, words in topics_summaries.items():
    print(f"Topic #{topic+1}: {' '.join(words)}")

Topics from summaries:
Topic #1: club wikipedia death league list article uefa ronaldo f5 player
Topic #2: xxx to may film brand mixed art union return state
Topic #3: film award role album actor series actress drama best born
Topic #4: film released directed grossing best million 2019 star screenplay review
Topic #5: google youtube user content service website facebook video internet billion
Topic #6: india state day russian war murder country death indian 000
Topic #7: team tournament world player cup championship football first event nba
Topic #8: series season television netflix marvel episode premiered created first drama
Topic #9: election president trump state party united elizabeth biden presidential war
Topic #10: covid cleopatra pandemic virus 19 bc case ptolemy caesar health


In [15]:
# Get the top words for each topic for titles
topics_titles = display_topics(lda_model_titles, tfidf_vectorizer_titles.get_feature_names_out(), 10)
print("\nTopics from titles:")
for topic, words in topics_titles.items():
    print(f"Topic #{topic+1}: {' '.join(words)}")


Topics from titles:
Topic #1: film musk elon sex black taylor andrew chatgpt oppenheimer swift
Topic #2: world 2020 google cup john fifa ufc coronavirus johnson translate
Topic #3: united state bible joe biden moon tv series ted edward
Topic #4: film 2022 list death 2021 marvel universe cinematic internet xxxtentacion
Topic #5: series tv league game premier last robert boy season 2019
Topic #6: cleopatra trump donald michael george tom f5 network gmail harris
Topic #7: india ronaldo cristiano facebook microsoft office presidential election covid 19
Topic #8: youtube ii xxxx elizabeth miniseries disaster football qanon west henry
Topic #9: film xxx 2023 adam ansel man series spider mandalorian beer
Topic #10: 2019 wikipedia film instagram super lionel messi david bowl death


## 5. Label Topic