### Topic: Latent Semantic Analysis

Team members:
*   Yash Khandelwal (PID: A00000000)



In [None]:
# imports
import numpy as np
import pandas as pd
import pprint
pp = pprint.PrettyPrinter(indent=4)
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud

import warnings
warnings.filterwarnings('ignore')

### Load dataset and print basic stats

In [None]:
# Fetch the training split of the 20 Newsgroups corpus. The `remove` tuple asks
# scikit-learn to strip headers, footers and quoted replies from each post.
from sklearn.datasets import fetch_20newsgroups

remove = ('headers', 'footers', 'quotes')
dataset = fetch_20newsgroups(
    subset='train',
    shuffle=True,
    remove=remove,
)
dataset.keys()

In [None]:
# number of documents in the loaded data (only subset='train' was fetched)
print("Length of data:", len(dataset.data))

In [None]:
# pretty-print the list of 20 label names; the integer values in
# dataset.target are positions in this list
pp.pprint(dataset.target_names)

### Exploratory Data Analysis

In [None]:
# Collect the raw documents and their integer labels in one DataFrame
news_df = pd.DataFrame(
    {
        'News': dataset.data,
        'Target': dataset.target,
    }
)

# (rows, columns) of the resulting frame
news_df.shape

In [None]:
# preview the first rows of the raw documents and their integer labels
news_df.head()

In [None]:
# Translate every integer label into its human-readable newsgroup name
news_df['Target_name'] = news_df['Target'].map(
    lambda label: dataset.target_names[label]
)

In [None]:
# preview again, now including the Target_name column
news_df.head()

In [None]:
# Horizontal bar chart: number of documents per newsgroup label
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(y=news_df['Target_name'], palette='rocket', ax=ax)
ax.set_title('Distribution of Topics')
ax.set_ylabel('Topics')
ax.set_xlabel('Count of topics')

### Text Preprocessing

In [None]:
# NLTK resources used by the preprocessing step:
#   stopwords        -> stopwords.words('english')
#   punkt, punkt_tab -> word_tokenize
#   wordnet, omw-1.4 -> WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from the 'punkt_tab' resource
# instead of the pickled 'punkt' model; without it word_tokenize raises
# LookupError. On older releases the extra download is harmless.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# tokenize
# remove non alphabetic characters
# remove stopwords and lemmatize

In [None]:
import regex as re
from functools import lru_cache

# Runs of anything other than lowercase ASCII letters. Compiled once here
# rather than on every clean_text() call.
_NON_ALPHA_RE = re.compile(r'[^a-z]+')


@lru_cache(maxsize=1)
def _text_resources():
    """Return the English stop-word set and a lemmatizer, built on first use.

    Built lazily so that this cell can be defined before the NLTK corpora are
    available; cached so that the stop-word list is loaded only once instead
    of once per document.
    """
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def clean_text(sentence):
    """Normalise one document for vectorisation.

    Steps: lowercase, replace every run of non ``a-z`` characters with a
    single space, tokenize, drop English stop words and tokens shorter than
    three characters, lemmatize, and join the tokens back with spaces.

    Parameters
    ----------
    sentence : str
        Raw document text.

    Returns
    -------
    str
        Space-separated lemmatized tokens (empty string if nothing survives).
    """
    stop_words, lemmatizer = _text_resources()

    # remove non alphabetic sequences
    sentence = _NON_ALPHA_RE.sub(' ', sentence.lower()).strip()

    # tokenize, then drop stop words and very small words (length < 3)
    # in a single pass
    word_list = [
        word
        for word in word_tokenize(sentence)
        if word not in stop_words and len(word) > 2
    ]

    # lemmatize and turn the token list back into a sentence
    return ' '.join(lemmatizer.lemmatize(word) for word in word_list)


In [None]:
# Register tqdm's progress_apply on pandas objects, then overwrite the raw
# text with its cleaned form (values are coerced to str before cleaning).
tqdm.pandas()
news_df['News'] = news_df['News'].progress_apply(
    lambda doc: clean_text(str(doc))
)

In [None]:
# preview the cleaned text
news_df.head()

In [None]:
# plot a word cloud of the cleaned news text
# The whole corpus is joined into one string. str(news_df['News']) would only
# produce pandas' truncated repr (a few clipped rows plus the
# "Name: ..., Length: ..., dtype: ..." footer), not the documents themselves.
wordcloud = WordCloud(background_color='white',
                      max_words=200).generate(' '.join(news_df['News']))
fig = plt.figure(figsize=[16,16])
plt.title('WordCloud of News')
plt.axis('off')
plt.imshow(wordcloud)
plt.show()

In [None]:
# Turn the cleaned documents into a TF-IDF document-term matrix.
# The text was already tokenised and re-joined with single spaces, so a plain
# whitespace split is used as tokenizer. max_df / min_df discard terms that
# are too common (document frequency above 95%) or too rare (below 2 documents).
tfid_vec = TfidfVectorizer(
    min_df=2,
    max_df=0.95,
    tokenizer=lambda doc: str(doc).split(),
)
X = tfid_vec.fit_transform(news_df['News'])
X.shape

### Latent Semantic Analysis

In [None]:
# create svd instance
# random_state pins the randomized solver: without it every fit (and this
# notebook fits the model more than once) may produce different components,
# so printed topic words and per-document topic assignments need not agree.
svd_model = TruncatedSVD(n_components=20,
                         algorithm='randomized',
                         random_state=42)

# fit model to data
svd_model.fit(X)

In [None]:
# topic-word mapping matrix: one row per topic, one column per vocabulary term
svd_model.components_.shape

In [None]:
# document topic mapping matrix: one row per document, one column per topic
# Project with the SVD that is already fitted instead of fit_transform():
# a second fit re-estimates components_ and, with the randomized solver, may
# change them.
doc_topic = svd_model.transform(X)
doc_topic.shape

In [None]:
# vocabulary terms of the fitted vectorizer, in the column order of X
terms = tfid_vec.get_feature_names_out()

In [None]:
# function to map words to topics
def map_word2topic(components, terms, n_words=5):
    """Return the highest-weighted terms of every topic.

    Parameters
    ----------
    components : array-like of shape (n_topics, n_terms)
        Topic-term weight matrix, e.g. ``model.components_``.
    terms : sequence of str
        Vocabulary terms, aligned with the columns of ``components``.
    n_words : int, default 5
        Number of top terms to keep per topic.

    Returns
    -------
    pandas.Series
        Indexed by ``'topic <i>'``; each value is the list of the
        ``n_words`` terms with the largest weight, in descending order.
    """
    # NOTE: the weights are only used for ranking. They are not necessarily
    # probabilities P(w|t): for TruncatedSVD they are signed loadings.
    top_terms = {}
    for idx, component in enumerate(components):
        # map terms (words) to their weight in this topic
        weights = pd.Series(component, index=terms)
        # keep the terms with the largest weights
        top_terms['topic ' + str(idx)] = list(
            weights.sort_values(ascending=False).index[:n_words]
        )

    # Build the result in one go with an explicit dtype: a bare pd.Series()
    # relies on a deprecated default dtype, and the values here are lists.
    return pd.Series(top_terms, dtype=object)

In [None]:
# Top words of every LSA topic, taken from the fitted SVD components
word2topics = map_word2topic(svd_model.components_, terms)

# print topic results, one "<topic label> <tab> <word list>" line per topic
print('Topics\t\tWords')
for idx, item in word2topics.items():
    print(idx,'\t',item)

In [None]:
def get_top3_topics(x):
    """Return the three largest entries of a row as ``labels + values``.

    Parameters
    ----------
    x : pandas.Series
        Topic weights of one document, indexed by topic name.

    Returns
    -------
    list
        ``[name_1, name_2, name_3, value_1, value_2, value_3]`` ordered by
        descending value (shorter if ``x`` has fewer than three entries).
    """
    # sort once and reuse the result for both the labels and the values
    top3 = x.sort_values(ascending=False).head(3)
    return list(top3.index) + list(top3.values)

# map top3 topics to news document
def map_topicword2doc(model, X, refit=False):
    """Return the three strongest topics (and their weights) per document.

    Parameters
    ----------
    model : estimator
        Topic model exposing ``transform`` (and ``fit_transform``), e.g. a
        fitted TruncatedSVD or LatentDirichletAllocation.
    X : array-like or sparse matrix of shape (n_documents, n_terms)
        Document-term matrix.
    refit : bool, default False
        If False the already-fitted ``model`` is only applied to ``X``.
        Refitting here would re-estimate the topics, so topic numbers could
        stop matching topic words extracted earlier from
        ``model.components_``. Pass True to fit ``model`` on ``X`` first.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns ``topic_{1,2,3}_name`` (labels of
        the form ``'topic<i>'``) and ``topic_{1,2,3}_prob`` (float weights),
        ordered by descending weight. Ties go to the lower topic number.

    Raises
    ------
    ValueError
        If the model yields fewer than three topics.
    """
    # doc to topic mapping
    doc_topic = model.fit_transform(X) if refit else model.transform(X)
    doc_topic = np.asarray(doc_topic)

    # derive the topic count from the data instead of assuming 20
    n_topics = doc_topic.shape[1]
    if n_topics < 3:
        raise ValueError(
            'need at least 3 topics to pick the top 3, got %d' % n_topics
        )
    topics = np.array(['topic' + str(i) for i in range(n_topics)], dtype=object)

    # column positions of the 3 largest weights in every row, best first
    order = np.argsort(-doc_topic, axis=1, kind='stable')[:, :3]
    weights = np.take_along_axis(doc_topic, order, axis=1)

    # name columns first, then weight columns
    columns = {}
    for rank in range(3):
        columns['topic_' + str(rank + 1) + '_name'] = topics[order[:, rank]]
    for rank in range(3):
        columns['topic_' + str(rank + 1) + '_prob'] = weights[:, rank]

    return pd.DataFrame(columns)

In [None]:
# top-3 LSA topics per document, placed next to the news columns
# (axis=1 concatenation aligns the two frames on their row index)
top_topics = map_topicword2doc(svd_model, X)
news_topics = pd.concat([news_df, top_topics], axis=1)

In [None]:
# make sure the topic_*_prob columns are numeric: infer_objects() turns
# object-dtype columns that hold floats into float columns and leaves
# already-typed columns unchanged
news_topics = news_topics.infer_objects()

In [None]:
# preview two documents together with their top-3 topics
news_topics.head(2)

In [None]:
# Number of documents per strongest (rank 1) topic
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(y=news_topics['topic_1_name'], palette='rocket', ax=ax)
ax.set_title('Distribution of Topics 1')
ax.set_ylabel('Topics')
ax.set_xlabel('Count of topic 1')

In [None]:
# Number of documents per second-strongest (rank 2) topic
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(y=news_topics['topic_2_name'], palette='rocket', ax=ax)
ax.set_title('Distribution of Topics 2')
ax.set_ylabel('Topics')
ax.set_xlabel('Count of topic 2')

In [None]:
# Number of documents per third-strongest (rank 3) topic
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(y=news_topics['topic_3_name'], palette='rocket', ax=ax)
ax.set_title('Distribution of Topics 3')
ax.set_ylabel('Topics')
ax.set_xlabel('Count of topic 3')

In [None]:
# LSA document vectors used for the 2-D projection below.
# transform() reuses the fitted SVD; fit_transform() would refit it and, with
# the randomized solver, could change the topics analysed above.
X_topics = svd_model.transform(X)

In [None]:
!pip install umap-learn

In [None]:
import umap

# Reduce the LSA document vectors to two dimensions with UMAP and draw one
# point per document, coloured by its newsgroup label.
embedding = umap.UMAP(
    n_neighbors=150,
    min_dist=0.5,
    random_state=12,
).fit_transform(X_topics)

plt.figure(figsize=(12,6))
plt.scatter(
    embedding[:, 0],
    embedding[:, 1],
    c=dataset.target,
    s=10,  # marker size
    edgecolor='none',
)
plt.show()

### Latent Dirichlet Allocation (LDA)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
# 20 topics, the same number used for the LSA model.
# random_state makes the fit reproducible: without it every (re)fit can
# number and compose the topics differently, so topic words and per-document
# assignments obtained from separate fits would not correspond.
# NOTE(review): X is a TF-IDF matrix; LDA is usually fit on raw term counts
# (e.g. from CountVectorizer) - confirm TF-IDF input is intended.
lda_model = LatentDirichletAllocation(n_components=20,
                                     max_iter=20,
                                     random_state=42)

In [None]:
# fit the LDA model on the document-term matrix X
lda_model.fit(X)

In [None]:
# document-topic matrix from the fitted LDA model (transform does not refit)
doc_topic_lda = lda_model.transform(X)

In [None]:
# Top words of every LDA topic, taken from the fitted LDA components
word2topics_lda = map_word2topic(lda_model.components_, terms)

# print topic results, one "<topic label> <tab> <word list>" line per topic
print('Topics\t\tWords')
for idx, item in word2topics_lda.items():
    print(idx,'\t',item)

In [None]:
def get_top3_topics(x):
    """Return the three largest entries of a row as ``labels + values``.

    Parameters
    ----------
    x : pandas.Series
        Topic weights of one document, indexed by topic name.

    Returns
    -------
    list
        ``[name_1, name_2, name_3, value_1, value_2, value_3]`` ordered by
        descending value (shorter if ``x`` has fewer than three entries).
    """
    # sort once and reuse the result for both the labels and the values
    top3 = x.sort_values(ascending=False).head(3)
    return list(top3.index) + list(top3.values)

# map top3 topics to news document
def map_topicword2doc(model, X, refit=False):
    """Return the three strongest topics (and their weights) per document.

    Parameters
    ----------
    model : estimator
        Topic model exposing ``transform`` (and ``fit_transform``), e.g. a
        fitted TruncatedSVD or LatentDirichletAllocation.
    X : array-like or sparse matrix of shape (n_documents, n_terms)
        Document-term matrix.
    refit : bool, default False
        If False the already-fitted ``model`` is only applied to ``X``.
        Refitting here would re-estimate the topics, so topic numbers could
        stop matching topic words extracted earlier from
        ``model.components_``. Pass True to fit ``model`` on ``X`` first.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns ``topic_{1,2,3}_name`` (labels of
        the form ``'topic<i>'``) and ``topic_{1,2,3}_prob`` (float weights),
        ordered by descending weight. Ties go to the lower topic number.

    Raises
    ------
    ValueError
        If the model yields fewer than three topics.
    """
    # doc to topic mapping
    doc_topic = model.fit_transform(X) if refit else model.transform(X)
    doc_topic = np.asarray(doc_topic)

    # derive the topic count from the data instead of assuming 20
    n_topics = doc_topic.shape[1]
    if n_topics < 3:
        raise ValueError(
            'need at least 3 topics to pick the top 3, got %d' % n_topics
        )
    topics = np.array(['topic' + str(i) for i in range(n_topics)], dtype=object)

    # column positions of the 3 largest weights in every row, best first
    order = np.argsort(-doc_topic, axis=1, kind='stable')[:, :3]
    weights = np.take_along_axis(doc_topic, order, axis=1)

    # name columns first, then weight columns
    columns = {}
    for rank in range(3):
        columns['topic_' + str(rank + 1) + '_name'] = topics[order[:, rank]]
    for rank in range(3):
        columns['topic_' + str(rank + 1) + '_prob'] = weights[:, rank]

    return pd.DataFrame(columns)

In [None]:
# top-3 LDA topics per document, placed next to the news columns.
# This rebinds top_topics and news_topics, which held the LSA results before.
top_topics = map_topicword2doc(lda_model, X)
news_topics = pd.concat([news_df, top_topics], axis=1)

In [None]:
# Number of documents per strongest (rank 1) topic
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(y=news_topics['topic_1_name'], palette='rocket', ax=ax)
ax.set_title('Distribution of Topics 1')
ax.set_ylabel('Topics')
ax.set_xlabel('Count of topics')

In [None]:
# Number of documents per second-strongest (rank 2) topic
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(y=news_topics['topic_2_name'], palette='rocket', ax=ax)
ax.set_title('Distribution of Topics 2')
ax.set_ylabel('Topics')
ax.set_xlabel('Count of topics')

In [None]:
# Number of documents per third-strongest (rank 3) topic
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(y=news_topics['topic_3_name'], palette='rocket', ax=ax)
ax.set_title('Distribution of Topics 3')
ax.set_ylabel('Topics')
ax.set_xlabel('Count of topics')

In [None]:
import umap
# 2-D UMAP projection of the LDA document-topic matrix, one point per
# document, coloured by its newsgroup label.
# doc_topic_lda is used here: X_topics holds the LSA (SVD) projection, so
# embedding it again would only redraw the LSA figure.
embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(doc_topic_lda)
plt.figure(figsize=(12,6))
plt.scatter(embedding[:, 0], embedding[:, 1],
c = dataset.target,
s = 10, # size
edgecolor='none' )
plt.show()