# Topic modelling: process of analysing large texts into different topics

## For example, we will use **Large_language_model**, a passage from wikipedia

In [1]:
from gensim import corpora, models, similarities    # Used for topic modelling
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from operator import itemgetter
import re
import csv
import pprint


In [2]:
# To get the data we will work on
import wikipedia

passage = wikipedia.page('Large_language_model').content
print(passage)

A large language model (LLM) is a type of machine learning model designed for natural language processing tasks such as language generation. LLMs are language models with many parameters, and are trained with self-supervised learning on a vast amount of text.
The largest and most capable LLMs are generative pretrained transformers (GPTs). Modern models can be fine-tuned for specific tasks or guided by prompt engineering. These models acquire predictive power regarding syntax, semantics, and ontologies inherent in human language corpora, but they also inherit inaccuracies and biases present in the data they are trained in.


== History ==

Before 2017, there were a few language models that were large as compared to capacities then available. In the 1990s, the IBM alignment models pioneered statistical language modelling. A smoothed n-gram model in 2001 trained on 0.3 billion words achieved state-of-the-art perplexity at the time. In the 2000s, as Internet use became prevalent, some rese

In [3]:
# Sentence tokenization
sent_passage = sent_tokenize(passage)
sent_passage


['A large language model (LLM) is a type of machine learning model designed for natural language processing tasks such as language generation.',
 'LLMs are language models with many parameters, and are trained with self-supervised learning on a vast amount of text.',
 'The largest and most capable LLMs are generative pretrained transformers (GPTs).',
 'Modern models can be fine-tuned for specific tasks or guided by prompt engineering.',
 'These models acquire predictive power regarding syntax, semantics, and ontologies inherent in human language corpora, but they also inherit inaccuracies and biases present in the data they are trained in.',
 '== History ==\n\nBefore 2017, there were a few language models that were large as compared to capacities then available.',
 'In the 1990s, the IBM alignment models pioneered statistical language modelling.',
 'A smoothed n-gram model in 2001 trained on 0.3 billion words achieved state-of-the-art perplexity at the time.',
 'In the 2000s, as Intern

In [4]:
# Word tokenization
puntuations = ".,()==\n''``%@"
word_passage = []
for sent in sent_passage:
    word_passage.extend(word_tokenize(sent))

for word in word_passage:
    if word in puntuations:
        word_passage.remove(word)

print(word_passage)

['A', 'large', 'language', 'model', 'LLM', 'is', 'a', 'type', 'of', 'machine', 'learning', 'model', 'designed', 'for', 'natural', 'language', 'processing', 'tasks', 'such', 'as', 'language', 'generation', 'LLMs', 'are', 'language', 'models', 'with', 'many', 'parameters', 'and', 'are', 'trained', 'with', 'self-supervised', 'learning', 'on', 'a', 'vast', 'amount', 'of', 'text', 'The', 'largest', 'and', 'most', 'capable', 'LLMs', 'are', 'generative', 'pretrained', 'transformers', 'GPTs', 'Modern', 'models', 'can', 'be', 'fine-tuned', 'for', 'specific', 'tasks', 'or', 'guided', 'by', 'prompt', 'engineering', 'These', 'models', 'acquire', 'predictive', 'power', 'regarding', 'syntax', 'semantics', 'and', 'ontologies', 'inherent', 'in', 'human', 'language', 'corpora', 'but', 'they', 'also', 'inherit', 'inaccuracies', 'and', 'biases', 'present', 'in', 'the', 'data', 'they', 'are', 'trained', 'in', 'History', 'Before', '2017', 'there', 'were', 'a', 'few', 'language', 'models', 'that', 'were', '

In [5]:
# POS TAGGING
tagged = pos_tag(word_passage)
print(tagged)

[('A', 'DT'), ('large', 'JJ'), ('language', 'NN'), ('model', 'NN'), ('LLM', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('type', 'NN'), ('of', 'IN'), ('machine', 'NN'), ('learning', 'VBG'), ('model', 'NN'), ('designed', 'VBN'), ('for', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'VBG'), ('tasks', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('language', 'NN'), ('generation', 'NN'), ('LLMs', 'NNP'), ('are', 'VBP'), ('language', 'NN'), ('models', 'NNS'), ('with', 'IN'), ('many', 'JJ'), ('parameters', 'NNS'), ('and', 'CC'), ('are', 'VBP'), ('trained', 'VBN'), ('with', 'IN'), ('self-supervised', 'JJ'), ('learning', 'VBG'), ('on', 'IN'), ('a', 'DT'), ('vast', 'JJ'), ('amount', 'NN'), ('of', 'IN'), ('text', 'NN'), ('The', 'DT'), ('largest', 'JJS'), ('and', 'CC'), ('most', 'RBS'), ('capable', 'JJ'), ('LLMs', 'NNP'), ('are', 'VBP'), ('generative', 'JJ'), ('pretrained', 'JJ'), ('transformers', 'NNS'), ('GPTs', 'NNP'), ('Modern', 'NNP'), ('models', 'NNS'), ('can', 'MD'), ('be', 'VB'), ('fine-t

In [6]:
# LEMMATIZATION WITH POS TAGS   # necessary because there are different variables of parts of speech, so the machine needs to identify different types of words
from nltk.corpus import wordnet

def getpos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''


In [7]:
stop_words= stopwords.words('english')
wordnet_lem = WordNetLemmatizer()
lemma_pos = []
for word,tag in tagged:
    if word not in stop_words and word.isalpha():        # That is if word not in stop_words BUT is an alphabet ...
        p = getpos(tag)
        if p != '':                                     # We don't want to tag empty space or words that's not among the tag above
            lemma_pos.append(wordnet_lem.lemmatize(word, pos = p))

print(lemma_pos)
# The data has been lemmatized based on their respective pos

['large', 'language', 'model', 'LLM', 'type', 'machine', 'learn', 'model', 'design', 'natural', 'language', 'process', 'task', 'language', 'generation', 'LLMs', 'language', 'model', 'many', 'parameter', 'train', 'learn', 'vast', 'amount', 'text', 'large', 'capable', 'LLMs', 'generative', 'pretrained', 'transformer', 'GPTs', 'Modern', 'model', 'specific', 'task', 'guide', 'prompt', 'engineering', 'model', 'acquire', 'predictive', 'power', 'regard', 'syntax', 'semantics', 'ontology', 'inherent', 'human', 'language', 'corpus', 'also', 'inherit', 'inaccuracy', 'bias', 'present', 'data', 'train', 'History', 'language', 'model', 'large', 'compare', 'capacity', 'available', 'IBM', 'alignment', 'model', 'pioneer', 'statistical', 'language', 'model', 'smoothed', 'model', 'train', 'word', 'achieve', 'perplexity', 'time', 'Internet', 'use', 'become', 'prevalent', 'researcher', 'construct', 'language', 'datasets', 'web', 'corpus', 'train', 'statistical', 'language', 'model', 'language', 'processin

In [8]:
# To execute topic modelling
id2word = corpora.Dictionary([lemma_pos])                #To convert words to numbers (cos that's what the algorithm understands)
corpus = [id2word.doc2bow([text]) for text in lemma_pos]

id_model = models.LdaModel(corpus= corpus,    #'LdaModel is the algorithm used for modelling
                           id2word=id2word,  # Maps each word to an id
                           num_topics = 5,   # Number of topics....we can experiment on any number of topics
                           random_state = 100,
                           update_every= 1,
                           chunksize=100,
                           passes = 10,
                           alpha= 'symmetric',
                           per_word_topics=True)

print(id_model.print_topics())
# Topic modelling gives you several words for each topic and you can decide what will be the best name for the topic from the list
# The numbers are the percentage of probability for each word belonging to the topiv

[(0, '0.045*"LLM" + 0.034*"data" + 0.027*"example" + 0.027*"LLMs" + 0.021*"text" + 0.021*"token" + 0.018*"option" + 0.018*"output" + 0.016*"political" + 0.011*"use"'), (1, '0.116*"model" + 0.059*"language" + 0.041*"Language" + 0.025*"Large" + 0.019*"A" + 0.017*"energy" + 0.011*"tendency" + 0.010*"certain" + 0.009*"dataset" + 0.009*"specific"'), (2, '0.037*"large" + 0.026*"training" + 0.017*"also" + 0.017*"demand" + 0.014*"include" + 0.014*"task" + 0.013*"content" + 0.013*"train" + 0.011*"datasets" + 0.010*"Challenges"'), (3, '0.038*"bias" + 0.024*"benchmark" + 0.021*"Models" + 0.019*"answer" + 0.019*"generate" + 0.014*"AI" + 0.012*"capability" + 0.011*"evaluation" + 0.010*"response" + 0.010*"Index"'), (4, '0.017*"gender" + 0.015*"question" + 0.014*"electricity" + 0.013*"al" + 0.012*"base" + 0.012*"create" + 0.011*"result" + 0.011*"prompt" + 0.010*"different" + 0.010*"Processing"')]


- ## Topic modelling gives you several words for each topic and you can decide what will be the best name for the topic from the list
- ## The numbers are the percentage of probability for each word belonging to the topic

# TO VISUALISE THE TOPIC   (There are many ways but let's use "pyLDAvis"

In [9]:
#!pip install pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
visual = pyLDAvis.gensim.prepare(id_model, corpus, id2word)
visual