In [3]:
# its a type of unsupervised ml that identifies hidden topics in collection of text documents.
# by assuming each documents in mixture of topics,

In [4]:
# Latent Dirichet Allocation
# Non-Negative Matrix Factorization

In [22]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import spacy

nlp = spacy.load("en_core_web_sm")

doc=["Cricket is a bat-and-ball game played between two teams of eleven players. It originated in England in the 16th century and is popular in many former colonies. The game is played on a field with a 22-yard pitch and a wicket at each end. One team bats and tries to score runs, while the other team fields and tries to prevent them. The game is divided into overs, each consisting oballsf six.",
 "Football, also called association football or soccer, is a game involving two teams of 11 players who try to maneuver the ball into the other team’s goal without using their hands or arms. The team that scores more goals wins. A football game is played between two teams of 11 players each. The game is on a rectangular pitch or field and none of the players can use their hands to touch the ball except for one goalkeeper in each team. A game of football lasts 90 minutes, divided into two halves of 45 minutes each."
]

In [23]:
# Create DataFrame to hold the manual documents
pd.set_option("display.max_colwidth",500)
data=pd.DataFrame({"text":doc})

In [24]:
data

Unnamed: 0,text
0,"Cricket is a bat-and-ball game played between two teams of eleven players. It originated in England in the 16th century and is popular in many former colonies. The game is played on a field with a 22-yard pitch and a wicket at each end. One team bats and tries to score runs, while the other team fields and tries to prevent them. The game is divided into overs, each consisting oballsf six."
1,"Football, also called association football or soccer, is a game involving two teams of 11 players who try to maneuver the ball into the other team’s goal without using their hands or arms. The team that scores more goals wins. A football game is played between two teams of 11 players each. The game is on a rectangular pitch or field and none of the players can use their hands to touch the ball except for one goalkeeper in each team. A game of football lasts 90 minutes, divided into two halve..."


In [25]:
# Preprocessing:Tokenization,stopwords removal and Lemmatization using spaCy
def preprocess(text):
    doc=nlp(text)
    processed_tokens=[token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(processed_tokens)

In [26]:
data['processed_text'] = data["text"].apply(preprocess)
data

Unnamed: 0,text,processed_text
0,"Cricket is a bat-and-ball game played between two teams of eleven players. It originated in England in the 16th century and is popular in many former colonies. The game is played on a field with a 22-yard pitch and a wicket at each end. One team bats and tries to score runs, while the other team fields and tries to prevent them. The game is divided into overs, each consisting oballsf six.",cricket bat ball game play team player originate England century popular colony game play field yard pitch wicket end team bat try score run team field try prevent game divide over consist oballsf
1,"Football, also called association football or soccer, is a game involving two teams of 11 players who try to maneuver the ball into the other team’s goal without using their hands or arms. The team that scores more goals wins. A football game is played between two teams of 11 players each. The game is on a rectangular pitch or field and none of the players can use their hands to touch the ball except for one goalkeeper in each team. A game of football lasts 90 minutes, divided into two halve...",football call association football soccer game involve team player try maneuver ball team goal hand arm team score goal win football game play team player game rectangular pitch field player use hand touch ball goalkeeper team game football last minute divide half minute


Create TF-IDF Vectorizer and Fit The Model

In [32]:
# Create TF-IDF Vectorizer 
vectorizer = TfidfVectorizer()

# Transform the manual doc into TF-IDF vectors
x=vectorizer.fit_transform(data["processed_text"])

# Create a Latent Dirichlet allocation model
lda = LatentDirichletAllocation(n_components=3)

# Fit model to TF-IDF vectors
lda.fit(x)

# Print the topic
print(lda.components_)

[[0.44963716 0.44963716 0.61853184 0.66579836 0.44963716 0.49858257
  0.49858257 0.49858257 0.49858257 0.5343935  0.49858257 0.49858257
  0.6534248  0.8044746  1.02520649 0.56771716 0.44963716 0.44963716
  0.56771716 0.44963716 0.44963716 0.44963716 0.56771716 0.49858257
  0.49858257 0.49858257 0.5343935  0.6534248  0.70274201 0.49858257
  0.49858257 0.44963716 0.49858257 0.5343935  0.44963716 1.10943983
  0.44963716 0.6534248  0.44963716 0.49858257 0.44963716 0.49858257]
 [0.33436435 0.33436435 0.33447798 0.33442273 0.33436435 0.33436962
  0.33436962 0.33436962 0.33436962 0.33443788 0.33436962 0.33436962
  0.33444775 0.33449503 0.33441028 0.33450746 0.33436435 0.33436435
  0.33450746 0.33436435 0.33436435 0.33436435 0.33450746 0.33436962
  0.33436962 0.33436962 0.33443788 0.33444775 0.33448218 0.33436962
  0.33436962 0.33436435 0.33436962 0.33443788 0.33436435 0.3344029
  0.33436435 0.33444775 0.33436435 0.33436962 0.33436435 0.33436962]
 [0.33436479 0.33436479 0.33447838 0.33442299 0

In [33]:
# Print Topics and their associated words
for idx,topic in enumerate(lda.components_):
    print(f"Topic {idx+1}:")
    #   Get top 5 words with highest weights for this topic
    top_words_idx=topic.argsort()[-5:][::-1]
    top_words=[vectorizer.get_feature_names_out()[i] for i in top_words_idx]
    print(",".join(top_words))
    print()

Topic 1:
team,game,football,player,bat

Topic 2:
hand,minute,goal,football,player

Topic 3:
hand,minute,goal,football,player

