In [1]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## Topic Modeling in NLP: TF-IDF - 2 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs

In [2]:
# Install gensim and a pinned SciPy into the kernel's environment.
# NOTE(review): scipy is pinned but gensim is not — presumably the pin exists for
# gensim/SciPy compatibility; confirm, and pin gensim as well for reproducibility.
%pip install gensim scipy==1.12

Note: you may need to restart the kernel to use updated packages.


In [3]:
# =================================================-
#### Slide 13/43: Load packages  ####

# Module in standard library
from itertools import islice
from pathlib import Path
from pprint import pprint

# 3rd party packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import gensim
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models

# Fetch the tokenizer models and the stop-word list used below.
# "punkt_tab" is the resource `word_tokenize` loads on recent NLTK releases;
# "punkt" is kept so the notebook still works on older ones.
nltk.download(["punkt", "punkt_tab", "stopwords"])

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# =================================================-
#### Slide 14/43: Directory settings  ####

# Set 'main_dir' to location of the project folder
# home_dir = Path(".").resolve()
# main_dir = home_dir.parent.parent
# print(main_dir)
# data_dir = str(main_dir) + "/data"
# print(data_dir)

In [5]:
# =================================================-
#### Slide 17/43: Load data  ####

# Read the NYT article data that the Document-Term Matrix will be built from.
df = pd.read_csv("data/NYT_article_data.csv")
# Preview the first five rows.
print(df.head(n=5))

                                             web_url  \
0  https://www.nytimes.com/reuters/2019/01/01/spo...   
1  https://www.nytimes.com/reuters/2019/01/01/wor...   
2  https://www.nytimes.com/aponline/2019/01/01/sp...   
3  https://www.nytimes.com/2019/01/09/arts/design...   
4  https://www.nytimes.com/aponline/2019/01/10/sp...   

                                            headline  \
0  Kyrgios, Murray Power Into Second Round in Bri...   
1  UK Police Treating Manchester Stabbing Attack ...   
2  Former NFL Player Wiley Talks Playoffs on Podc...   
3    After the Quake, Dana Schutz Gets Back to Work    
4  Ogunbowale Helps Irish Beat Cardinals in 1-2 S...   

                                             snippet  word_count  \
0  Nick Kyrgios started his Brisbane Open title d...         435   
1  British police confirmed on Tuesday they were ...          81   
2  Marcellus Wiley is still on the fence about le...         272   
3  Still reckoning with the fallout from her Emme...  

In [6]:
# =================================================-
#### Slide 18/43: Check for NAs  ####

# Flag the rows whose `snippet` is missing and report how many there are.
snippet_missing = df["snippet"].isnull()
print(snippet_missing.sum())
# Keep only rows that have a snippet and renumber the index from 0.
df = df.loc[~snippet_missing].reset_index(drop=True)
# Confirm that no missing snippets remain.
print(df["snippet"].isnull().sum())
# The `snippet` column is the text corpus used from here on.
df_text = df["snippet"]

0
0


In [7]:
# =================================================-
#### Slide 19/43: Tokenization: split each document into words  ####

# Tokenize each document into a large list of tokenized documents.
# Iterating the Series directly walks the rows in order and does not rely on
# the index labels being exactly 0..n-1.
df_tokenized = [word_tokenize(snippet) for snippet in df_text]

In [8]:
# =================================================-
#### Slide 21/43: Convert characters to lowercase  ####

# Start from the tokens of the first document and show them as-is.
document_words = df_tokenized[0]
print(document_words)
# 1. Lowercase every token.
document_words = list(map(str.lower, document_words))
print(document_words[:10])

['Nick', 'Kyrgios', 'started', 'his', 'Brisbane', 'Open', 'title', 'defense', 'with', 'a', 'battling', '7-6', '(', '5', ')', '5-7', '7-6', '(', '5', ')', 'victory', 'over', 'American', 'Ryan', 'Harrison', 'in', 'the', 'opening', 'round', 'on', 'Tuesday', '.']
['nick', 'kyrgios', 'started', 'his', 'brisbane', 'open', 'title', 'defense', 'with', 'a']


In [9]:
# =================================================-
#### Slide 22/43: Remove stop words  ####

# 2. Remove stop words.
# Get common English stop words. This stays a list so it can be sliced for the
# preview below and reused unchanged by later cells.
stop_words = stopwords.words("english")
print(stop_words[:10])
# Build a set once so each membership test is a hash lookup instead of a list scan.
stop_word_set = set(stop_words)
# Remove stop words.
document_words = [word for word in document_words if word not in stop_word_set]
print(document_words[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
['nick', 'kyrgios', 'started', 'brisbane', 'open', 'title', 'defense', 'battling', '7-6', '(']


In [10]:
# =================================================-
#### Slide 23/43: Remove non-alphabetical characters  ####

# 3. Keep only purely alphabetical tokens (drops punctuation, scores, numbers).
document_words = list(filter(str.isalpha, document_words))
print(document_words[:10])

['nick', 'kyrgios', 'started', 'brisbane', 'open', 'title', 'defense', 'battling', 'victory', 'american']


In [11]:
# =================================================-
#### Slide 26/43: Stem words  ####

# 4. Stem words.
# Create the stemmer once and reuse it, rather than building a new one per word.
stemmer = PorterStemmer()
document_words = [stemmer.stem(word) for word in document_words]
print(document_words[:10])

['nick', 'kyrgio', 'start', 'brisban', 'open', 'titl', 'defens', 'battl', 'victori', 'american']


In [12]:
# =================================================-
#### Slide 27/43: Clean the entire corpus  ####

def clean_document(tokens, stop_word_set, stemmer):
    """Return the cleaned, stemmed words of one tokenized document.

    The steps mirror the single-document walkthrough above, in the same order:
    1. convert to lowercase,
    2. remove stop words,
    3. remove punctuation and any non-alphabetical tokens,
    4. stem what remains.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.
    stop_word_set : set of str
        Lowercase stop words to discard.
    stemmer : object with a ``stem(word)`` method
        Stemmer applied to every surviving word.

    Returns
    -------
    list of str
        The processed words, in their original order.
    """
    lowered = (token.lower() for token in tokens)
    return [
        stemmer.stem(word)
        for word in lowered
        if word not in stop_word_set and word.isalpha()
    ]


# Build the lookup set and the stemmer once for the whole corpus instead of
# once per word.
stop_word_set = set(stop_words)
stemmer = PorterStemmer()

# Process words in all documents.
df_clean = [clean_document(tokens, stop_word_set, stemmer) for tokens in df_tokenized]

# Record the word count per clean document.
word_counts_per_document = [len(document) for document in df_clean]

In [13]:
# =================================================-
#### Slide 28/43: Clean the entire corpus (cont'd)  ####

# Minimum number of words a cleaned document must have to be kept.
MIN_WORDS_PER_DOCUMENT = 5

# Count words from the current `df_clean` rather than from the list recorded in
# the previous cell, so the counts always line up with the documents and
# re-running this cell is safe.
word_counts_array = np.array([len(document) for document in df_clean])

# Find indices of all documents with at least MIN_WORDS_PER_DOCUMENT words.
valid_documents = np.where(word_counts_array >= MIN_WORDS_PER_DOCUMENT)[0]

# Subset the documents (as an object array) to keep only the valid ones.
df_array = np.array(df_clean, dtype=object)[valid_documents]

# Convert the array back to a list.
df_clean = df_array.tolist()  # <- the processed documents

In [14]:
# =================================================-
#### Slide 30/43: Data for TF-IDF matrix   ####

# Show the first two processed documents.
print(df_clean[:2])

[['nick', 'kyrgio', 'start', 'brisban', 'open', 'titl', 'defens', 'battl', 'victori', 'american', 'ryan', 'harrison', 'open', 'round', 'tuesday'], ['british', 'polic', 'confirm', 'tuesday', 'treat', 'stab', 'attack', 'injur', 'three', 'peopl', 'manchest', 'victoria', 'train', 'station', 'terrorist', 'investig', 'search', 'address', 'cheetham', 'hill', 'area', 'citi']]


In [15]:
# =================================================-
#### Slide 32/43: Create a dictionary of counts   ####

# Set the seed.
np.random.seed(1)
# Build the dictionary (token <-> integer id mapping) from the cleaned documents.
dictionary = gensim.corpora.Dictionary(df_clean)

# Obtain the first 10 items of the dictionary
# (`islice` comes from the imports cell at the top of the notebook).
list(islice(dictionary.itervalues(), 10))

['american',
 'battl',
 'brisban',
 'defens',
 'harrison',
 'kyrgio',
 'nick',
 'open',
 'round',
 'ryan']

In [16]:
# =================================================-
#### Slide 33/43: Create a dictionary of counts (cont'd)  ####

# Thresholds passed to `filter_extremes` as `no_below` and `no_above`.
MIN_DOC_COUNT = 4
MAX_DOC_FRACTION = 0.5
dictionary.filter_extremes(no_below=MIN_DOC_COUNT, no_above=MAX_DOC_FRACTION)

# Number of words remaining in the dictionary after filtering.
len(dictionary)

201

In [17]:
# =================================================-
#### Slide 34/43: Document to bag-of-words  ####

# Convert every cleaned document into its bag-of-words form via the dictionary.
bow_corpus = list(map(dictionary.doc2bow, df_clean))

# Inspect the bag-of-words of the first document.
print(bow_corpus[0])

[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1)]


In [18]:
# =================================================-
#### Slide 35/43: Document to bag-of-words  ####

# Isolate the first document.
bow_doc_1 = bow_corpus[0]

# Each entry printed above is a (word id, count) pair; unpack it directly.
# Print out each actual word and how many times it appears.
for word_id, count in bow_doc_1:
    # Use "time" for a single occurrence and "times" otherwise.
    plural = "" if count == 1 else "s"
    print(
        f'Word {word_id} '
        f'("{dictionary[word_id]}") '
        f'appears {count} time{plural}.'
    )

Word 0 ("american") appears 1 time.
Word 1 ("defens") appears 1 time.
Word 2 ("open") appears 2 time.
Word 3 ("round") appears 1 time.
Word 4 ("start") appears 1 time.
Word 5 ("tuesday") appears 1 time.
Word 6 ("victori") appears 1 time.


In [19]:
# =================================================-
#### Slide 37/43: Transform counts with TfidfModel  ####

# Fit the TF-IDF transformation on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=bow_corpus)

# Wrap the whole corpus in the transformation.
corpus_tfidf = tfidf[bow_corpus]

# Pull the first transformed document and display its TF-IDF scores.
first_doc_scores = next(iter(corpus_tfidf))
first_doc_scores

[(0, 0.31942373876087665),
 (1, 0.3549009519669791),
 (2, 0.6118718565633235),
 (3, 0.3549009519669791),
 (4, 0.3059359282816618),
 (5, 0.22829905152454918),
 (6, 0.3549009519669791)]

In [20]:
# =================================================-
#### Slide 40/43: Exercise  ####


#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################