# Working on Data I

### 2023-04-06

Today we focus on text analysis. We will work through the concepts of N-grams, TF-IDF, and Word2Vec, and then look at topic modeling and text classification.

In [None]:
# Reading the plain-text title files (one news title per line).
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
people_daily_path = '/workspaces/eastd143b_text_analysis/data/people_daily_titles.txt'
japan_times_path = '/workspaces/eastd143b_text_analysis/data/japan_times_titles.txt'

with open(people_daily_path, 'r', encoding='utf-8') as f1, \
        open(japan_times_path, 'r', encoding='utf-8') as f2:
    # Each list element is one line of the file, trailing newline included
    people_daily_titles = f1.readlines()
    japan_times_titles = f2.readlines()


## WordCloud

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
!pip install wordcloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# Generating bigrams for the news titles from the People's Daily dataset
people_daily_vectorizer = CountVectorizer(ngram_range=(2, 2))
people_daily_bigrams = people_daily_vectorizer.fit_transform(
    people_daily_titles)

# Total occurrences of every bigram across all titles (one value per matrix column)
people_daily_bigram_counts = people_daily_bigrams.sum(axis=0)

# Pair each bigram with its own column total.
# get_feature_names_out() lists the terms in column order; the keys of
# vocabulary_ are NOT guaranteed to be in column order, so zipping them with
# the column sums would attach counts to the wrong bigrams.
people_daily_bigram_dict = dict(zip(
    people_daily_vectorizer.get_feature_names_out(),
    people_daily_bigram_counts.tolist()[0]))

# Visualizing bigrams for the People's Daily dataset
people_daily_wordcloud = WordCloud(
    background_color='white').generate_from_frequencies(people_daily_bigram_dict)
plt.imshow(people_daily_wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


In [None]:
# Generating bigrams for the news titles from the Japan Times dataset
japan_times_vectorizer = CountVectorizer(ngram_range=(2, 2))
japan_times_bigrams = japan_times_vectorizer.fit_transform(japan_times_titles)

# Total occurrences of every bigram across all titles (one value per matrix column)
japan_times_bigram_counts = japan_times_bigrams.sum(axis=0)

# Pair each bigram with its own column total.
# get_feature_names_out() lists the terms in column order; the keys of
# vocabulary_ are NOT guaranteed to be in column order, so zipping them with
# the column sums would attach counts to the wrong bigrams.
japan_times_bigram_dict = dict(zip(
    japan_times_vectorizer.get_feature_names_out(),
    japan_times_bigram_counts.tolist()[0]))

# Visualizing bigrams for the Japan Times dataset
japan_times_wordcloud = WordCloud(
    background_color='white').generate_from_frequencies(japan_times_bigram_dict)
plt.imshow(japan_times_wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


In [None]:
# ---------------------------------------------------------------------------
# People's Daily
# ---------------------------------------------------------------------------
# TF: raw term counts per title, with English stop words removed
people_daily_vectorizer = CountVectorizer(stop_words='english')
people_daily_tf = people_daily_vectorizer.fit_transform(people_daily_titles)

# DF: number of titles in which each term appears at least once
people_daily_df = people_daily_tf.astype(bool).sum(axis=0)

# TF-IDF: computed with a separate vectorizer using its default settings
people_daily_tfidf_vectorizer = TfidfVectorizer()
people_daily_tfidf = people_daily_tfidf_vectorizer.fit_transform(people_daily_titles)

# ---------------------------------------------------------------------------
# Japan Times
# ---------------------------------------------------------------------------
# TF: raw term counts per title, with English stop words removed
japan_times_vectorizer = CountVectorizer(stop_words='english')
japan_times_tf = japan_times_vectorizer.fit_transform(japan_times_titles)

# DF: number of titles in which each term appears at least once
japan_times_df = japan_times_tf.astype(bool).sum(axis=0)

# TF-IDF: computed with a separate vectorizer using its default settings
japan_times_tfidf_vectorizer = TfidfVectorizer()
japan_times_tfidf = japan_times_tfidf_vectorizer.fit_transform(japan_times_titles)


## Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA
import numpy as np

In [None]:

# Fit ONE TfidfVectorizer on both corpora so that the vocabulary and the IDF
# weights are shared. Fitting on People's Daily alone would silently drop
# every term that occurs only in Japan Times titles and would derive the IDF
# weights from a single newspaper.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(list(people_daily_titles) + list(japan_times_titles))

# Project each corpus into the shared feature space.
# NOTE: this rebinds `japan_times_tfidf` to the shared-vocabulary matrix.
tfidf = tfidf_vectorizer.transform(people_daily_titles)
japan_times_tfidf = tfidf_vectorizer.transform(japan_times_titles)

# Concatenating the TF-IDF matrices of the two datasets row-wise:
# People's Daily rows come first, Japan Times rows follow.
tfidf_concat = np.concatenate(
    [tfidf.toarray(), japan_times_tfidf.toarray()], axis=0)

# Performing PCA on the concatenated TF-IDF matrix
pca = PCA(n_components=2)
tfidf_pca = pca.fit_transform(tfidf_concat)

# Row index at which the Japan Times documents start in tfidf_pca
n_people_daily = tfidf.shape[0]

# Creating a scatter plot of the PCA results, one colour per newspaper
plt.scatter(tfidf_pca[:n_people_daily, 0],
            tfidf_pca[:n_people_daily, 1], label='People Daily')
plt.scatter(tfidf_pca[n_people_daily:, 0],
            tfidf_pca[n_people_daily:, 1], label='Japan Times')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('PCA of TF-IDF scores for People Daily and Japan Times')
plt.legend()
plt.show()

## Topic Modeling

In [None]:
from sklearn.decomposition import LatentDirichletAllocation


In [None]:
# No earlier cell imports pandas, so import it here; otherwise `pd.DataFrame`
# raises NameError on a fresh kernel.
import pandas as pd

n_topics = 10      # number of LDA topics to extract
n_top_words = 10   # number of highest-weighted words to show per topic

# Creating a CountVectorizer object to tokenize the news titles
people_daily_vectorizer = CountVectorizer(stop_words='english')
people_daily_counts = people_daily_vectorizer.fit_transform(
    people_daily_titles)

# Creating an LDA object to extract the topics from the news titles
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
people_daily_lda = lda.fit_transform(people_daily_counts)

# Look the vocabulary up once instead of once per displayed word
feature_names = people_daily_vectorizer.get_feature_names_out()
# Never ask for more words than the vocabulary contains
n_top_words = min(n_top_words, len(feature_names))

# One column per topic, one row per word rank. The rows are labelled by rank
# (not by topic) because row k holds the k-th strongest word of every topic.
# argsort() is ascending, so the reversed tail slice yields the
# highest-weighted words first.
people_daily_topics = pd.DataFrame(
    {
        'Top Words for Topic {}'.format(i + 1):
            feature_names[topic.argsort()[:-(n_top_words + 1):-1]]
        for i, topic in enumerate(lda.components_)
    },
    index=['Word {}'.format(rank) for rank in range(1, n_top_words + 1)],
)

# Displaying the DataFrame
display(people_daily_topics)


In [None]:
!pip install gensim
!pip install nltk

In [None]:
import nltk

# word_tokenize needs the Punkt tokenizer data. Older NLTK releases load the
# 'punkt' package, while recent releases load 'punkt_tab' instead. NLTK is
# installed unpinned in this notebook, so fetch both to cover either case.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
import string
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    """Normalize a piece of text for bag-of-words processing.

    The text is lowercased, every character in ``string.punctuation`` is
    removed, the result is split with NLTK's ``word_tokenize``, and the
    tokens are joined back together with single spaces.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text as one space-separated string.
    """
    # Translation table that deletes every ASCII punctuation character
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # Lowercase first, then drop the punctuation
    cleaned = text.lower().translate(strip_punctuation)

    # Tokenize and re-join so tokens are separated by exactly one space
    tokens = word_tokenize(cleaned)
    return ' '.join(tokens)


In [None]:
import gensim
from gensim import corpora, models

# Preprocessing the data
# NOTE: this rebinds `people_daily_titles` to the cleaned titles, so any code
# that runs after this cell sees the preprocessed text, not the raw lines.
people_daily_titles = [preprocess_text(title) for title in people_daily_titles]

# Converting the data into a numerical format (document-term count matrix)
vectorizer = CountVectorizer(stop_words='english')
counts = vectorizer.fit_transform(people_daily_titles)

# Converting the counts into a Gensim-compatible format.
# The rows of `counts` are documents, hence documents_columns=False.
corpus = gensim.matutils.Sparse2Corpus(counts, documents_columns=False)
# Gensim wants id -> word; scikit-learn's vocabulary_ stores word -> id
id2word = {index: term for term, index in vectorizer.vocabulary_.items()}

# Performing LDA topic modeling.
# random_state is fixed so the topics are the same on every run, matching the
# seeded scikit-learn LDA used elsewhere in this notebook.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus, id2word=id2word, num_topics=4, passes=10, random_state=42)

# Printing the topics
for idx, topic in lda_model.print_topics():
    print('Topic:', idx)
    print('Words:', topic)
    print()
