<a href="https://colab.research.google.com/github/TanyalaSrivatsava/NLP/blob/main/2403A51433_Lab_Assignment_06.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **LDA**

# **Task 1: Sample Data**

In [1]:
import pandas as pd

# Load the sample news headlines from the Excel workbook.
df = pd.read_excel("/content/Data.xlsx")
# Preview the first rows to confirm the 'News' column was read.
display(df.head())

Unnamed: 0,News
0,Virat scored century in match
1,BJP won in elections
2,Bumrah took 5 wickets in a match
3,Congress formed state government


In [2]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources the pipeline below relies on: the tokenizer
# models, the stop-word lists and the WordNet corpus.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

def nltk_preprocessing_pipeline(text):
    """Clean one raw document and return its lemmatized tokens as one string.

    The steps run in this order: remove URLs, HTML tags, @mentions and
    #hashtags; lowercase; remove emoji; drop every character that is not an
    ASCII letter, digit or whitespace; collapse whitespace; tokenize with
    NLTK; drop NLTK English stop words; lemmatize each remaining token.

    Args:
        text: Raw document text. A non-string value (``None`` or the float
            NaN that pandas uses for an empty cell) is treated as an empty
            document instead of raising ``TypeError`` from ``re.sub``.

    Returns:
        str: Cleaned tokens joined by single spaces; ``''`` for non-string
        input or when no token survives the cleaning.
    """
    # Series.apply hands missing cells through as None/NaN; return early so
    # a single empty row cannot abort the whole column.
    if not isinstance(text, str):
        return ''

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Strip web/social-media artefacts first, while the '<', '@' and '#'
    # markers are still present. 'https...' needs no alternative of its own
    # because it is already matched by 'http\S+'.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)

    text = text.lower()

    # The final range (U+24C2 to U+1F251) is very broad and also covers many
    # non-emoji characters; any other non-ASCII character is removed by the
    # substitution that follows anyway.
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F1E0-\U0001F1FF"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)

    # Keep only ASCII letters, digits and whitespace, then squeeze runs of
    # whitespace down to single spaces.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = word_tokenize(text)
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]
    return ' '.join(lemmas)

print("NLTK preprocessing pipeline function created successfully!")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...


NLTK preprocessing pipeline function created successfully!


[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
# Run every headline through the cleaning pipeline and keep the result in a
# new column next to the raw text.
df['clean_news'] = df['News'].apply(nltk_preprocessing_pipeline)
display(df['clean_news'].head())

Unnamed: 0,clean_news
0,virat scored century match
1,bjp election
2,bumrah took 5 wicket match
3,congress formed state government


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts: ignore terms that occur in more than 95% of the
# documents, keep terms seen in at least one document, and drop
# scikit-learn's built-in English stop words.
count_vectorizer = CountVectorizer(max_df=0.95, min_df=1, stop_words='english')
doc_term_matrix = count_vectorizer.fit_transform(df['clean_news'])

In [5]:
import pandas as pd

# Label the count matrix with its vocabulary so the per-term counts are
# readable as a table.
feature_names = count_vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(doc_term_matrix.toarray(), columns=feature_names)

# Show at most the first 10 documents.
bow_top_10 = bow_df.head(10)
display(bow_top_10)

Unnamed: 0,bjp,bumrah,century,congress,election,formed,government,match,scored,state,took,virat,wicket
0,0,0,1,0,0,0,0,1,1,0,0,1,0
1,1,0,0,0,1,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,1,0,0,1,0,1
3,0,0,0,1,0,1,1,0,0,1,0,0,0


In [6]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 2-topic LDA model on the count matrix; random_state pins the seed so
# reruns give the same topics.
topics = 2
LDA = LatentDirichletAllocation(n_components=topics, random_state=42)
LDA.fit(doc_term_matrix)

In [7]:
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, one row of term weights per
            topic.
        feature_names: Sequence mapping a column index of ``components_`` to
            its term.
        num_top_words: Number of terms to list for each topic.
    """
    for topic_no, weights in enumerate(model.components_):
        print(f"\nTopic {topic_no}")

        # argsort is ascending, so reverse it before keeping the first
        # num_top_words positions.
        ranked = weights.argsort()[::-1][:num_top_words]

        # Every term is followed by a space, then the line is terminated.
        print("".join(str(feature_names[term_id]) + " " for term_id in ranked))

In [8]:
# List the 10 strongest terms of each LDA topic.
num_top_words = 10
print(f"\nTop {num_top_words} words per topic:")
display_topics(LDA, count_vectorizer.get_feature_names_out(), num_top_words)


Top 10 words per topic:

Topic 0
formed government congress state election bjp match wicket bumrah took 

Topic 1
match virat century scored took bumrah wicket bjp election state 


In [9]:
# transform() gives one row of topic weights per document; argmax over the
# topic axis labels each document with its strongest topic.
document_topics = LDA.transform(doc_term_matrix)
df['topic'] = document_topics.argmax(axis=1)

print("\nDataFrame with assigned topics (first 5 rows):")
print(df[['clean_news', 'topic']].head())


DataFrame with assigned topics (first 5 rows):
                         clean_news  topic
0        virat scored century match      1
1                      bjp election      0
2        bumrah took 5 wicket match      1
3  congress formed state government      0


# **Task 2: Abstract Data**

In [10]:
import pandas as pd

# Read only the first 1000 rows of the arXiv CSV (presumably to keep the run
# fast). Note that this rebinds df, replacing the news frame used above.
df = pd.read_csv("/content/arxiv_data.csv", engine='python', nrows=1000)
display(df.head())

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


In [11]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Make sure the tokenizer models, stop-word lists and WordNet corpus are
# available before the pipeline below is used.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

def nltk_preprocessing_pipeline(text):
    """Clean one raw document and return its lemmatized tokens as one string.

    The steps run in this order: remove URLs, HTML tags, @mentions and
    #hashtags; lowercase; remove emoji; drop every character that is not an
    ASCII letter, digit or whitespace; collapse whitespace; tokenize with
    NLTK; drop NLTK English stop words; lemmatize each remaining token.

    Args:
        text: Raw document text. A non-string value (``None`` or the float
            NaN that pandas uses for an empty cell) is treated as an empty
            document instead of raising ``TypeError`` from ``re.sub``.

    Returns:
        str: Cleaned tokens joined by single spaces; ``''`` for non-string
        input or when no token survives the cleaning.
    """
    # Series.apply hands missing cells through as None/NaN; return early so
    # a single empty abstract cannot abort the whole column.
    if not isinstance(text, str):
        return ''

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Strip web/social-media artefacts first, while the '<', '@' and '#'
    # markers are still present. 'https...' needs no alternative of its own
    # because it is already matched by 'http\S+'.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)

    text = text.lower()

    # The final range (U+24C2 to U+1F251) is very broad and also covers many
    # non-emoji characters; any other non-ASCII character is removed by the
    # substitution that follows anyway.
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F1E0-\U0001F1FF"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)

    # Keep only ASCII letters, digits and whitespace, then squeeze runs of
    # whitespace down to single spaces.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = word_tokenize(text)
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]
    return ' '.join(lemmas)

print("NLTK preprocessing pipeline function created successfully!")

NLTK preprocessing pipeline function created successfully!


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# Clean every abstract with the NLTK pipeline and keep the result in a new
# column next to the raw 'summaries' text.
df['clean_summaries_pipeline'] = df['summaries'].apply(nltk_preprocessing_pipeline)
# Only the cleaned column is printed, so the heading describes just that
# column rather than announcing a comparison with another one.
print("\nCleaned summaries in clean_summaries_pipeline (first 5 rows):")
print(df[['clean_summaries_pipeline']].head())


Comparison of previous clean_summaries and new clean_summaries_pipeline (first 5 rows):
                            clean_summaries_pipeline
0  stereo matching one widely used technique infe...
1  recent advancement artificial intelligence ai ...
2  paper proposed novel mutual consistency networ...
3  consistency training proven advanced semisuper...
4  ensure safety automated driving correct percep...


In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for the abstracts: ignore terms found in more than 95%
# of the documents or in fewer than 2 documents, and drop scikit-learn's
# English stop words.
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
doc_term_matrix = count_vectorizer.fit_transform(df['clean_summaries_pipeline'])

In [14]:
import pandas as pd

# Label the count matrix with its vocabulary so the per-term counts are
# readable as a table.
feature_names = count_vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(doc_term_matrix.toarray(), columns=feature_names)
bow_top_10 = bow_df.head(10)
# Use the notebook's rich display, like the matching preview cells for the
# news data, instead of a plain-text print of the DataFrame.
display(bow_top_10)

   01  011  014  049  059  060  065  084  089  091  ...  xray  xrays  year  \
0   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
1   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
2   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
3   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
4   0    0    0    0    0    0    0    0    0    0  ...     0      0     1   
5   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
6   0    0    0    0    0    0    0    0    0    0  ...     0      0     1   
7   0    0    0    0    0    0    0    0    0    0  ...     1      0     0   
8   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
9   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   

   yes  yield  yielded  yielding  youtube  youtubevos  zurich  
0    0      0        0         0        0           0       0  
1    0      0

In [15]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 2-topic LDA model on the abstract counts; random_state pins the seed
# so reruns give the same topics.
num_topics = 2
LDA = LatentDirichletAllocation(n_components=num_topics, random_state=42)
LDA.fit(doc_term_matrix)

In [16]:
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, one row of term weights per
            topic.
        feature_names: Sequence mapping a column index of ``components_`` to
            its term.
        num_top_words: Number of terms to list for each topic.
    """
    for topic_no, weights in enumerate(model.components_):
        print(f"\nTopic {topic_no}:")
        # argsort is ascending, so reverse it before keeping the first
        # num_top_words positions.
        ranked = weights.argsort()[::-1][:num_top_words]
        # Every term is followed by a space, then the line is terminated.
        print("".join(str(feature_names[term_id]) + " " for term_id in ranked))

In [17]:
# List the 10 strongest terms of each LDA topic found in the abstracts.
num_top_words = 10
print(f"\nTop {num_top_words} words per topic:")
display_topics(LDA, count_vectorizer.get_feature_names_out(), num_top_words)


Top 10 words per topic:

Topic 0:
method network feature model proposed approach result algorithm based semantic 

Topic 1:
network method model learning data deep training medical task performance 


In [18]:
# Label each abstract with the topic that has the largest weight in its row
# of the transformed matrix.
document_topics = LDA.transform(doc_term_matrix)
df['topic'] = document_topics.argmax(axis=1)

print("\nDataFrame with assigned topics (first 5 rows):")
print(df[['clean_summaries_pipeline', 'topic']].head())


DataFrame with assigned topics (first 5 rows):
                            clean_summaries_pipeline  topic
0  stereo matching one widely used technique infe...      1
1  recent advancement artificial intelligence ai ...      1
2  paper proposed novel mutual consistency networ...      1
3  consistency training proven advanced semisuper...      1
4  ensure safety automated driving correct percep...      1


# **NMF**

# **Task 1: Sample Data Using BoW**

In [19]:
import pandas as pd

# Reload the sample news headlines; this rebinds df, so columns added to the
# previous frame are not carried over.
df = pd.read_excel("/content/Data.xlsx")
display(df.head())

Unnamed: 0,News
0,Virat scored century in match
1,BJP won in elections
2,Bumrah took 5 wickets in a match
3,Congress formed state government


In [20]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Make sure the tokenizer models, stop-word lists and WordNet corpus are
# available before the pipeline below is used.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

def nltk_preprocessing_pipeline(text):
    """Clean one raw document and return its lemmatized tokens as one string.

    The steps run in this order: remove URLs, HTML tags, @mentions and
    #hashtags; lowercase; remove emoji; drop every character that is not an
    ASCII letter, digit or whitespace; collapse whitespace; tokenize with
    NLTK; drop NLTK English stop words; lemmatize each remaining token.

    Args:
        text: Raw document text. A non-string value (``None`` or the float
            NaN that pandas uses for an empty cell) is treated as an empty
            document instead of raising ``TypeError`` from ``re.sub``.

    Returns:
        str: Cleaned tokens joined by single spaces; ``''`` for non-string
        input or when no token survives the cleaning.
    """
    # Series.apply hands missing cells through as None/NaN; return early so
    # a single empty row cannot abort the whole column.
    if not isinstance(text, str):
        return ''

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Strip web/social-media artefacts first, while the '<', '@' and '#'
    # markers are still present. 'https...' needs no alternative of its own
    # because it is already matched by 'http\S+'.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)

    text = text.lower()

    # The final range (U+24C2 to U+1F251) is very broad and also covers many
    # non-emoji characters; any other non-ASCII character is removed by the
    # substitution that follows anyway.
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F1E0-\U0001F1FF"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)

    # Keep only ASCII letters, digits and whitespace, then squeeze runs of
    # whitespace down to single spaces.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = word_tokenize(text)
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]
    return ' '.join(lemmas)

print("NLTK preprocessing pipeline function created successfully!")

NLTK preprocessing pipeline function created successfully!


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
# Run every headline through the cleaning pipeline and keep the result in a
# new column next to the raw text.
df['clean_news'] = df['News'].apply(nltk_preprocessing_pipeline)
display(df['clean_news'].head())

Unnamed: 0,clean_news
0,virat scored century match
1,bjp election
2,bumrah took 5 wicket match
3,congress formed state government


In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts: ignore terms that occur in more than 95% of the
# documents, keep terms seen in at least one document, and drop
# scikit-learn's built-in English stop words.
count_vectorizer = CountVectorizer(max_df=0.95, min_df=1, stop_words='english')
doc_term_matrix = count_vectorizer.fit_transform(df['clean_news'])

In [23]:
import pandas as pd

# Label the count matrix with its vocabulary so the per-term counts are
# readable as a table.
feature_names = count_vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(doc_term_matrix.toarray(), columns=feature_names)

# Show at most the first 10 documents.
bow_top_10 = bow_df.head(10)
display(bow_top_10)

Unnamed: 0,bjp,bumrah,century,congress,election,formed,government,match,scored,state,took,virat,wicket
0,0,0,1,0,0,0,0,1,1,0,0,1,0
1,1,0,0,0,1,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,1,0,0,1,0,1
3,0,0,0,1,0,1,1,0,0,1,0,0,0


In [24]:
from sklearn.decomposition import NMF

# Factorize the count matrix into 2 NMF components (topics); random_state
# pins the seed so reruns give the same factorization.
num_topics = 2
nmf_model = NMF(n_components=num_topics, random_state=42)
nmf_model.fit(doc_term_matrix)
print("NMF model initialized and fitted successfully.")

NMF model initialized and fitted successfully.


In [25]:
# Set the number of terms per topic here, as the other topic-listing cells
# do, so this cell does not depend on a value left behind by an earlier
# section. display_topics is the helper defined in an earlier cell.
num_top_words = 10
print(f"\nTop {num_top_words} words per topic:")
display_topics(nmf_model, count_vectorizer.get_feature_names_out(), num_top_words)


Top 10 words per topic:

Topic 0:
match virat took scored wicket bumrah century bjp election state 

Topic 1:
state formed congress government election bjp took virat wicket scored 


In [26]:
# Label each headline with the NMF component that has the largest weight in
# its row of the transformed matrix.
document_topics = nmf_model.transform(doc_term_matrix)
df['topic'] = document_topics.argmax(axis=1)

print("\nDataFrame with assigned topics (first 5 rows):")
print(df[['clean_news', 'topic']].head())


DataFrame with assigned topics (first 5 rows):
                         clean_news  topic
0        virat scored century match      0
1                      bjp election      1
2        bumrah took 5 wicket match      0
3  congress formed state government      1


# **Task 2: Abstract Data Using BoW**

In [27]:
import pandas as pd

# Read only the first 1000 rows of the arXiv CSV (presumably to keep the run
# fast). Note that this rebinds df, replacing the news frame used above.
df = pd.read_csv("/content/arxiv_data.csv", engine='python', nrows=1000)
display(df.head())

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


In [28]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Make sure the tokenizer models, stop-word lists and WordNet corpus are
# available before the pipeline below is used.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

def nltk_preprocessing_pipeline(text):
    """Clean one raw document and return its lemmatized tokens as one string.

    The steps run in this order: remove URLs, HTML tags, @mentions and
    #hashtags; lowercase; remove emoji; drop every character that is not an
    ASCII letter, digit or whitespace; collapse whitespace; tokenize with
    NLTK; drop NLTK English stop words; lemmatize each remaining token.

    Args:
        text: Raw document text. A non-string value (``None`` or the float
            NaN that pandas uses for an empty cell) is treated as an empty
            document instead of raising ``TypeError`` from ``re.sub``.

    Returns:
        str: Cleaned tokens joined by single spaces; ``''`` for non-string
        input or when no token survives the cleaning.
    """
    # Series.apply hands missing cells through as None/NaN; return early so
    # a single empty abstract cannot abort the whole column.
    if not isinstance(text, str):
        return ''

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Strip web/social-media artefacts first, while the '<', '@' and '#'
    # markers are still present. 'https...' needs no alternative of its own
    # because it is already matched by 'http\S+'.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)

    text = text.lower()

    # The final range (U+24C2 to U+1F251) is very broad and also covers many
    # non-emoji characters; any other non-ASCII character is removed by the
    # substitution that follows anyway.
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F1E0-\U0001F1FF"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)

    # Keep only ASCII letters, digits and whitespace, then squeeze runs of
    # whitespace down to single spaces.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = word_tokenize(text)
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]
    return ' '.join(lemmas)

print("NLTK preprocessing pipeline function created successfully!")

NLTK preprocessing pipeline function created successfully!


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
# Clean every abstract with the NLTK pipeline and keep the result in a new
# column next to the raw 'summaries' text.
df['clean_summaries_pipeline'] = df['summaries'].apply(nltk_preprocessing_pipeline)
# Only the cleaned column is printed, so the heading describes just that
# column rather than announcing a comparison with another one.
print("\nCleaned summaries in clean_summaries_pipeline (first 5 rows):")
print(df[['clean_summaries_pipeline']].head())


Comparison of previous clean_summaries and new clean_summaries_pipeline (first 5 rows):
                            clean_summaries_pipeline
0  stereo matching one widely used technique infe...
1  recent advancement artificial intelligence ai ...
2  paper proposed novel mutual consistency networ...
3  consistency training proven advanced semisuper...
4  ensure safety automated driving correct percep...


In [30]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for the abstracts: ignore terms found in more than 95%
# of the documents or in fewer than 2 documents, and drop scikit-learn's
# English stop words.
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
doc_term_matrix = count_vectorizer.fit_transform(df['clean_summaries_pipeline'])

In [31]:
import pandas as pd

# Label the count matrix with its vocabulary so the per-term counts are
# readable as a table.
feature_names = count_vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(doc_term_matrix.toarray(), columns=feature_names)
bow_top_10 = bow_df.head(10)
# Use the notebook's rich display, like the matching preview cells for the
# news data, instead of a plain-text print of the DataFrame.
display(bow_top_10)

   01  011  014  049  059  060  065  084  089  091  ...  xray  xrays  year  \
0   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
1   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
2   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
3   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
4   0    0    0    0    0    0    0    0    0    0  ...     0      0     1   
5   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
6   0    0    0    0    0    0    0    0    0    0  ...     0      0     1   
7   0    0    0    0    0    0    0    0    0    0  ...     1      0     0   
8   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
9   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   

   yes  yield  yielded  yielding  youtube  youtubevos  zurich  
0    0      0        0         0        0           0       0  
1    0      0

In [32]:
from sklearn.decomposition import NMF

# Factorize the abstract count matrix into 2 NMF components (topics);
# random_state pins the seed so reruns give the same factorization.
num_topics = 2
nmf_model = NMF(n_components=num_topics, random_state=42)
nmf_model.fit(doc_term_matrix)
print("NMF model initialized and fitted successfully.")

NMF model initialized and fitted successfully.


In [33]:
# Set the number of terms per topic here, as the other topic-listing cells
# do, so this cell does not depend on a value left behind by an earlier
# section. display_topics is the helper defined in an earlier cell.
num_top_words = 10
print(f"\nTop {num_top_words} words per topic:")
display_topics(nmf_model, count_vectorizer.get_feature_names_out(), num_top_words)


Top 10 words per topic:

Topic 0:
method model learning data training medical deep domain approach performance 

Topic 1:
network neural architecture feature task convolutional deep proposed propose performance 


In [34]:
# Label each abstract with the NMF component that has the largest weight in
# its row of the transformed matrix.
document_topics = nmf_model.transform(doc_term_matrix)
df['topic'] = document_topics.argmax(axis=1)

print("\nDataFrame with assigned topics (first 5 rows):")
print(df[['clean_summaries_pipeline', 'topic']].head())


DataFrame with assigned topics (first 5 rows):
                            clean_summaries_pipeline  topic
0  stereo matching one widely used technique infe...      1
1  recent advancement artificial intelligence ai ...      0
2  paper proposed novel mutual consistency networ...      0
3  consistency training proven advanced semisuper...      0
4  ensure safety automated driving correct percep...      0


# **Task 3: Sample Data Using TF-IDF**

In [35]:
import pandas as pd
import nltk
import re
import string
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
# Make sure the tokenizer models used by word_tokenize are available. As the
# last expression of the cell, its return value is shown as the cell output.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [36]:
import pandas as pd

# Reload the sample news headlines; this rebinds df, so columns added to the
# previous frame are not carried over.
df = pd.read_excel("/content/Data.xlsx")
display(df.head())

Unnamed: 0,News
0,Virat scored century in match
1,BJP won in elections
2,Bumrah took 5 wickets in a match
3,Congress formed state government


In [37]:
# Make sure the stop-word lists are available, then build the English
# stop-word set once at module level; preprocess_text reads it on every call.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lowercase, strip and tokenize one document, dropping stop words.

    Unlike ``nltk_preprocessing_pipeline`` this keeps letters only (digits
    are removed) and does not lemmatize, so plural forms are kept as-is.

    Args:
        text: Raw document text. A non-string value (``None`` or the float
            NaN that pandas uses for an empty cell) is treated as an empty
            document instead of raising ``AttributeError`` on ``.lower()``.

    Returns:
        str: Remaining tokens joined by single spaces; ``''`` for non-string
        input or when no token survives.
    """
    # Series.apply hands missing cells through as None/NaN; return early so
    # a single empty row cannot abort the whole column.
    if not isinstance(text, str):
        return ''

    lowered = text.lower()
    without_tags = re.sub(r'<.*?>', '', lowered)
    # Keep lowercase ASCII letters and whitespace only.
    letters_only = re.sub(r'[^a-z\s]', '', without_tags)
    # stop_words is the module-level set built in the cell above.
    kept = [token for token in word_tokenize(letters_only) if token not in stop_words]
    return ' '.join(kept)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
# Clean every headline with preprocess_text and show raw and cleaned text
# side by side.
df['clean_news'] = df['News'].apply(preprocess_text)
print(df.head())

                               News                        clean_news
0     Virat scored century in match        virat scored century match
1              BJP won in elections                     bjp elections
2  Bumrah took 5 wickets in a match         bumrah took wickets match
3  Congress formed state government  congress formed state government


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights: ignore terms that occur in more than 95% of the documents,
# keep terms seen in at least one document, and drop scikit-learn's English
# stop words.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=1, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_news'])

In [45]:
import pandas as pd

# Label the TF-IDF matrix with its vocabulary so the per-term weights are
# readable as a table.
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Show at most the first 10 documents.
tfidf_top_10 = tfidf_df.head(10)
display(tfidf_top_10)

Unnamed: 0,bjp,bumrah,century,congress,elections,formed,government,match,scored,state,took,virat,wickets
0,0.0,0.0,0.525473,0.0,0.0,0.0,0.0,0.414289,0.525473,0.0,0.0,0.525473,0.0
1,0.707107,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.525473,0.0,0.0,0.0,0.0,0.0,0.414289,0.0,0.0,0.525473,0.0,0.525473
3,0.0,0.0,0.0,0.5,0.0,0.5,0.5,0.0,0.0,0.5,0.0,0.0,0.0


In [46]:
from sklearn.decomposition import NMF

# Factorize the TF-IDF matrix into 2 NMF components (topics); random_state
# pins the seed so reruns give the same factorization.
num_topics = 2
nmf_model_tfidf = NMF(n_components=num_topics, random_state=42)
nmf_model_tfidf.fit(tfidf_matrix)
print("NMF model with TF-IDF initialized and fitted successfully.")

NMF model with TF-IDF initialized and fitted successfully.


In [47]:
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, one row of term weights per
            topic.
        feature_names: Sequence mapping a column index of ``components_`` to
            its term.
        num_top_words: Number of terms to list for each topic.
    """
    for topic_no, weights in enumerate(model.components_):
        print(f"\nTopic {topic_no}:")
        # argsort is ascending, so reverse it before keeping the first
        # num_top_words positions.
        ranked = weights.argsort()[::-1][:num_top_words]
        # Every term is followed by a space, then the line is terminated.
        print("".join(str(feature_names[term_id]) + " " for term_id in ranked))

In [48]:
# List the 10 strongest terms of each TF-IDF-based NMF topic.
num_top_words = 10
print(f"\nTop {num_top_words} words per topic:")
display_topics(nmf_model_tfidf, tfidf_vectorizer.get_feature_names_out(), num_top_words)


Top 10 words per topic:

Topic 0:
match bumrah wickets took virat scored century state government elections 

Topic 1:
elections bjp formed government state congress took virat wickets scored 


In [49]:
# Label each headline with the NMF component that has the largest weight in
# its row of the transformed TF-IDF matrix.
document_topics_tfidf = nmf_model_tfidf.transform(tfidf_matrix)
df['topic_tfidf'] = document_topics_tfidf.argmax(axis=1)

print("\nDataFrame with assigned topics (first 5 rows):")
print(df[['clean_news', 'topic_tfidf']].head())


DataFrame with assigned topics (first 5 rows):
                         clean_news  topic_tfidf
0        virat scored century match            0
1                     bjp elections            1
2         bumrah took wickets match            0
3  congress formed state government            1


# **Task 4: Abstract Data Using TF-IDF**

In [50]:
import pandas as pd

# Read only the first 1000 rows of the arXiv CSV (presumably to keep the run
# fast). Note that this rebinds df, replacing the news frame used above.
df = pd.read_csv("/content/arxiv_data.csv", engine='python', nrows=1000)
display(df.head())

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


In [51]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Make sure the tokenizer models, stop-word lists and WordNet corpus are
# available before the pipeline below is used.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

def nltk_preprocessing_pipeline(text):
    """Clean one raw document and return its lemmatized tokens as one string.

    The steps run in this order: remove URLs, HTML tags, @mentions and
    #hashtags; lowercase; remove emoji; drop every character that is not an
    ASCII letter, digit or whitespace; collapse whitespace; tokenize with
    NLTK; drop NLTK English stop words; lemmatize each remaining token.

    Args:
        text: Raw document text. A non-string value (``None`` or the float
            NaN that pandas uses for an empty cell) is treated as an empty
            document instead of raising ``TypeError`` from ``re.sub``.

    Returns:
        str: Cleaned tokens joined by single spaces; ``''`` for non-string
        input or when no token survives the cleaning.
    """
    # Series.apply hands missing cells through as None/NaN; return early so
    # a single empty abstract cannot abort the whole column.
    if not isinstance(text, str):
        return ''

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Strip web/social-media artefacts first, while the '<', '@' and '#'
    # markers are still present. 'https...' needs no alternative of its own
    # because it is already matched by 'http\S+'.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)

    text = text.lower()

    # The final range (U+24C2 to U+1F251) is very broad and also covers many
    # non-emoji characters; any other non-ASCII character is removed by the
    # substitution that follows anyway.
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F1E0-\U0001F1FF"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)

    # Keep only ASCII letters, digits and whitespace, then squeeze runs of
    # whitespace down to single spaces.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = word_tokenize(text)
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]
    return ' '.join(lemmas)

print("NLTK preprocessing pipeline function created successfully!")

NLTK preprocessing pipeline function created successfully!


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [52]:
# Clean every abstract with the NLTK pipeline and keep the result in a new
# column next to the raw 'summaries' text.
df['clean_summaries_pipeline'] = df['summaries'].apply(nltk_preprocessing_pipeline)
# Only the cleaned column is printed, so the heading describes just that
# column rather than announcing a comparison with another one.
print("\nCleaned summaries in clean_summaries_pipeline (first 5 rows):")
print(df[['clean_summaries_pipeline']].head())


Comparison of previous clean_summaries and new clean_summaries_pipeline (first 5 rows):
                            clean_summaries_pipeline
0  stereo matching one widely used technique infe...
1  recent advancement artificial intelligence ai ...
2  paper proposed novel mutual consistency networ...
3  consistency training proven advanced semisuper...
4  ensure safety automated driving correct percep...


In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights for the abstracts: ignore terms found in more than 95% of
# the documents or in fewer than 2 documents, and drop scikit-learn's English
# stop words.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_summaries_pipeline'])

In [54]:
import pandas as pd

# Label the TF-IDF matrix with its vocabulary so the per-term weights are
# readable as a table.
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Show at most the first 10 documents.
tfidf_top_10 = tfidf_df.head(10)
display(tfidf_top_10)

Unnamed: 0,01,011,014,049,059,060,065,084,089,091,...,xray,xrays,year,yes,yield,yielded,yielding,youtube,youtubevos,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.055707,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.073785,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.100085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
from sklearn.decomposition import NMF

# Factorize the abstract TF-IDF matrix into 2 NMF components (topics);
# random_state pins the seed so reruns give the same factorization.
num_topics = 2
nmf_model_tfidf = NMF(n_components=num_topics, random_state=42)
nmf_model_tfidf.fit(tfidf_matrix)
print("NMF model with TF-IDF initialized and fitted successfully.")

NMF model with TF-IDF initialized and fitted successfully.


In [56]:
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, one row of term weights per
            topic.
        feature_names: Sequence mapping a column index of ``components_`` to
            its term.
        num_top_words: Number of terms to list for each topic.
    """
    for topic_no, weights in enumerate(model.components_):
        print(f"\nTopic {topic_no}:")
        # argsort is ascending, so reverse it before keeping the first
        # num_top_words positions.
        ranked = weights.argsort()[::-1][:num_top_words]
        # Every term is followed by a space, then the line is terminated.
        print("".join(str(feature_names[term_id]) + " " for term_id in ranked))

In [57]:
# List the 10 strongest terms of each TF-IDF-based NMF topic found in the
# abstracts.
num_top_words = 10
print(f"\nTop {num_top_words} words per topic:")
display_topics(nmf_model_tfidf, tfidf_vectorizer.get_feature_names_out(), num_top_words)


Top 10 words per topic:

Topic 0:
network method feature architecture model object proposed result approach 3d 

Topic 1:
domain data learning annotation training model label medical method labeled 


In [58]:
# Label each abstract with the NMF component that has the largest weight in
# its row of the transformed TF-IDF matrix.
document_topics_tfidf = nmf_model_tfidf.transform(tfidf_matrix)
df['topic_tfidf'] = document_topics_tfidf.argmax(axis=1)

print("\nDataFrame with assigned topics (first 5 rows):")
print(df[['clean_summaries_pipeline', 'topic_tfidf']].head())


DataFrame with assigned topics (first 5 rows):
                            clean_summaries_pipeline  topic_tfidf
0  stereo matching one widely used technique infe...            0
1  recent advancement artificial intelligence ai ...            1
2  paper proposed novel mutual consistency networ...            1
3  consistency training proven advanced semisuper...            1
4  ensure safety automated driving correct percep...            0
