# Session 5

Natural Language Processing (NLP)

Files used:
* AutoAndElectronics.zip
* farm-ads.csv


---


Michael de la Maza

AI/ML

Hult International Business School

Adapted from "Data Mining for Business Analytics" by Shmueli

In [None]:
# Install the dmba package; later cells import it for get_data_file, load_data
# and classificationSummary
!pip install dmba

In [2]:
from zipfile import ZipFile
from sklearn.feature_extraction.text import CountVectorizer

#### Tokenization, Stemming, Stop Words

In [3]:
import pandas as pd

# Show, for every document, how often each term occurs
def printTermDocumentMatrix(vectorizer, matrix):
    """Print the term-document counts as a table.

    vectorizer -- fitted vectorizer; its get_feature_names_out() supplies
                  the column labels (one per term)
    matrix     -- count matrix exposing toarray(); one row per document
    """
    table = pd.DataFrame(
        columns=vectorizer.get_feature_names_out(),
        data=matrix.toarray(),
    )
    print(table)

In [None]:
# Example: create count vectors

text = ['the cat and the dog are playing',
        'the cat is playing',
        'hi! :)',
        'i like ham & Eggs',
        'the chicken plays with the eggs?']

# Extract tokens from text
# A token is a run of one or more letters (lower or upper case) and the
# characters ! : )
count_vect = CountVectorizer(token_pattern='[a-zA-Z!:)]+')
counts = count_vect.fit_transform(text)

# Print the count of every token in every sentence
printTermDocumentMatrix(count_vect, counts)

# Notice that 'eggs?' does not appear as a token: '?' is not part of the
# token pattern, so it is dropped (the same goes for '&')


In [5]:
# 5 minute exercise
#Try with different sentences. Do you get what you expect?



In [6]:
# 5 minute exercise
# Change CountVectorizer to include &. What does this do? Can you give an example?

In [None]:
# Drop common English words using scikit-learn's built-in stop-word list
count_vect = CountVectorizer(stop_words='english')
counts = count_vect.fit_transform(text)

printTermDocumentMatrix(count_vect, counts)

# Show the tokens that survive in each sentence
# (we will see a nicer way to do this later)
feature_names = count_vect.get_feature_names_out()
for i in range(counts.shape[0]):
    remaining = (feature_names[j] for j in counts[i].indices)
    print(" ".join(remaining))


In [None]:
# Stem words: reduce each word to its stem with the Porter stemmer

from nltk.stem import PorterStemmer
ps = PorterStemmer()

# Two single-word examples
for example in ('eggs', 'running'):
    print(example + ' => ', ps.stem(example))

# Stem the tokens of every sentence (counts / feature_names come from the
# stop-word cell above) and print each sentence's stems on one line
for i in range(counts.shape[0]):
    print(' '.join(ps.stem(feature_names[j]) for j in counts[i].indices))

In [9]:
# 5 minute exercise
# Use ps.stem to stem words of your choosing. Are any of the results unexpected?
# arguing
# arrangement & arrange
# people

#### Case study: Internet discussion posts

Classify posts as related to autos or electronics

In [None]:
import dmba

# Read every post in the archive (directory entries are skipped).
# label is 1 when the file path contains 'rec.autos', otherwise 0.
with ZipFile(dmba.get_data_file('AutoAndElectronics.zip')) as rawData:
    posts = [info for info in rawData.infolist() if not info.is_dir()]
    corpus = [rawData.read(info) for info in posts]
    label = [int('rec.autos' in info.filename) for info in posts]

In [None]:
# print first item (the raw contents of the first post, as read from the zip file)
corpus[0]

In [12]:
# 5 minute exercise
# Print first five items
# Indicate whether they are about 'autos' or 'electronics'

In [13]:
# Tokenize
# Same token pattern as in the toy example. The posts were read from the zip
# file as raw bytes, so encoding='latin1' tells the vectorizer how to decode them.

count_vect = CountVectorizer(token_pattern='[a-zA-Z!:)]+', encoding='latin1')
counts = count_vect.fit_transform(corpus)

In [14]:
# Print the distinct tokens found in the first post, separated by spaces
print(" ".join(count_vect.inverse_transform(counts[0])[0]))

path: cantaloupe srv cs cmu edu!das news harvard edu!ogicse!uwm edu!wupost!uunet!brunix!cs brown edu!cs from: edu hok chung tsang) newsgroups: rec autos subject: re: saturn s pricing policy message id: apr date: : gmt article i d references: c oxwp kkm cso uiuc vir l r shuksan ds boeing com sender: organization: computer science dept lines: in fredd fred dickey) writes: carolinafan cka uxa edu) wrote: have been active defending lately on the net and would like to state my full opinion subject rather than just reply others points biggest problem some people seem be having is that dealers make k a car think most will agree with me comparably priced its competitors they aren t overpriced compared cars their class don understand point of arguing over whether dealer makes or not never understood what big deal profits either only thing can figure out believe if minimize profit total pocket expenses for while this may true cases do it generally bought sl january at time based studying prices 

In [None]:
# Magic code that does all preprocessing at once - stem, stop, etc.
# Warning expected

import nltk # Natural Language Toolkit
# word_tokenize needs the Punkt tokenizer data. NLTK 3.9+ loads it from the
# 'punkt_tab' resource, older releases from 'punkt'; fetch both so this cell
# works with either version.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

class LemmaTokenizer:
    """Callable tokenizer for CountVectorizer.

    Splits a document with word_tokenize, keeps only purely alphabetic
    tokens that are not English stop words, and returns their stems.
    """

    def __init__(self):
        self.stopWords = set(ENGLISH_STOP_WORDS)
        self.stemmer = EnglishStemmer()

    def __call__(self, doc):
        stems = []
        for token in word_tokenize(doc):
            # Skip numbers, punctuation and stop words
            if not token.isalpha() or token in self.stopWords:
                continue
            stems.append(self.stemmer.stem(token))
        return stems

# Learn features based on text
# The custom tokenizer (tokenize, drop stop words, stem) is used in place of a token pattern
count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), encoding='latin1')
counts = count_vect.fit_transform(corpus)

# Print first
# (the distinct stemmed tokens kept for the first post)
print(" ".join(count_vect.inverse_transform(counts[0])[0]))


In [16]:
# Use more sophisticated version of frequency count
# TFIDF - term frequency/ inverse document frequency
# Gives more weight to words that are unique to a document

from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw token counts computed above
tfidfTransformer = TfidfTransformer()
tfidf = tfidfTransformer.fit_transform(counts)

In [None]:
# Inspect the TF-IDF row of the first post
tfidf[0]

In [18]:
# Use only top 10 dimensions
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

# TruncatedSVD uses a randomized solver by default. Fix the seed so the
# reduced space, and every model trained on it below, is reproducible.
svd = TruncatedSVD(10, random_state=42)
# Rescale each reduced document vector to unit length
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

lsa_tfidf = lsa.fit_transform(tfidf)

In [None]:
# Reduced (LSA) representation of the first post
lsa_tfidf[0]

In [None]:
# First five are all same category
# (label is 1 for rec.autos posts, 0 otherwise)
label[0:5]

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from dmba import classificationSummary
from sklearn.neural_network import MLPClassifier

# Create training and testing set (40% of the posts are held out for testing)
Xtrain, Xtest, ytrain, ytest = train_test_split(lsa_tfidf, label, test_size=0.4, random_state=42)


# Initialize a neural network with default settings (seeded for repeatability)
nn = MLPClassifier(random_state=42)

# Fit model
nn.fit(Xtrain, ytrain)

# Print confusion matrix and accuracy on the test set
classificationSummary(ytest, nn.predict(Xtest))




In [None]:
# Increase number of iterations (epochs)
nn_long = MLPClassifier(max_iter=500, random_state=42)

# Fit on the same training split as before
nn_long.fit(Xtrain, ytrain)

# Print confusion matrix and accuracy on the test set
classificationSummary(ytest, nn_long.predict(Xtest))

In [23]:
# 5 minute exercise
# How many iterations are needed for it to converge?

In [None]:
# Can also try decision trees

# run decision tree model on training
dt = DecisionTreeClassifier(random_state=1)
dt.fit(Xtrain, ytrain)

# print confusion matrix and accuracy
classificationSummary(ytest, dt.predict(Xtest))

# run random forest classifier model on training
rf = RandomForestClassifier(n_estimators=500, random_state=1)
rf.fit(Xtrain, ytrain)

# print confusion matrix and accuracy
classificationSummary(ytest, rf.predict(Xtest))

In [None]:
# Decision trees slightly outperform neural networks. Can we improve the neural network?

# Two hidden layers with 8 neurons each
nn_2layer = MLPClassifier(hidden_layer_sizes=(8, 8), random_state=42)

# Fit the model to the training data
nn_2layer.fit(Xtrain, ytrain)

# Print confusion matrix and accuracy on the test set
classificationSummary(ytest, nn_2layer.predict(Xtest))

In [26]:
# 5 minute Exercise
# Can you find a neural network architecture that has an accuracy greater than 96%?
# Vary the number of hidden layers and the number of neurons in each layer


#### Case study: Spam ads

You are running an online agriculture site. Your revenue primarily comes from ads. To maintain your ethical standards, you do not want to run ads that are spam.

Build a classifier that determines whether or not an ad is spam.

In [27]:
# Exercise: Examine farm-ads.csv.
# What do you see?
# What are the differences between spam ads (target: -1) and good ads (target: 1)?
# What are the spam ads about? What words do they have in common?
# What are the good ads about? What words do they have in common?

In [None]:
# Load the farm ads: column 'relevance' is 1 or -1, column 'text' is the ad

import dmba

farm_ads = dmba.load_data('farm-ads.csv', names=['relevance', 'text'])

# For each class, report how many ads it has and preview its first rows
for relevanceValue, description in ((1, 'relevant'), (-1, 'non-relevant')):
    subset = farm_ads[farm_ads.relevance == relevanceValue]
    print('{} {} ads'.format(len(subset), description))
    print(subset.head())

In [29]:
# Tokenize and create TF-IDF

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# A token is a run of letters and hyphens
count_vect = CountVectorizer(token_pattern='[a-zA-Z-]+')
# IDF smoothing and row normalization are switched off here
tfidfTransformer = TfidfTransformer(smooth_idf=False, norm=None)

counts = count_vect.fit_transform(farm_ads['text'])
tfidf = tfidfTransformer.fit_transform(counts)

In [30]:
# Latent Semantic Analysis - 20 concepts

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

# TruncatedSVD uses a randomized solver by default. Fix the seed so the
# concept space, and the classifiers trained on it below, is reproducible.
svd = TruncatedSVD(20, random_state=42)
# Rescale each reduced document vector to unit length
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

lsa_tfidf = lsa.fit_transform(tfidf)

In [None]:
# Examine term-document matrix
# Remember that this matrix indicates which terms (tokens) are in which document

shape = counts.shape
# counts is a sparse matrix: nnz is the number of stored (non-zero) entries,
# not the total number of cells
nonZero = counts.nnz
print('Term-document matrix: {0[1]} terms, {0[0]} documents'.format(shape))
print('   Non-zero entries: {}'.format(nonZero))
# Share of cells that are non-zero. This is the density; sparsity is 100% minus this.
print('  density: {:.0f}%\n'.format(100 * nonZero / (shape[0] * shape[1])))

# Non-zero entries for the first term (column 0) across all documents
print(counts[:,0])

In [None]:
# Build intuition by examining terms

import pandas as pd

# Range of term (column) indices to display
fromTerm = 5340
toTerm = 5350

# Rows: the selected terms; columns: the first 20 documents
index = count_vect.get_feature_names_out()[fromTerm:toTerm]
pd.DataFrame(data=counts[0:20,fromTerm:toTerm].toarray().transpose(), index=index)

In [None]:
# Look at a single term in detail: how often it occurs in two example
# documents, on average, and how its counts are distributed over all documents.
# Change termIndex to repeat the analysis for a different term.

import numpy as np

termIndex = 5344
termName = count_vect.get_feature_names_out()[termIndex]
termCounts = counts[:, termIndex]  # sparse column: one count per document

print(f'Term {termIndex} is {termName!r}')
print(f'Document 7 - term {termIndex}: ', counts[7, termIndex])
print(f'Document 8 - term {termIndex}: ', counts[8, termIndex])
print(f'Average occurrence of term {termIndex} in all documents:', np.sum(termCounts) / counts.shape[0])

# Histogram of the per-document counts (converted to a dense 1-D array in one step)
pd.Series(termCounts.toarray().ravel()).hist(bins=100)

In [34]:
# Exercise: Repeat this with a different term
# Comment on the distribution of the term in the documents

In [None]:
# Train neural network classifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from dmba import classificationSummary

# Hold out 40% of the ads for validation
train_X, valid_X, train_y, valid_y = train_test_split(lsa_tfidf, farm_ads.relevance, test_size=0.4, random_state=42)

# One hidden layer of 100 neurons. random_state is fixed, as for every other
# model in this notebook, so the reported accuracy is repeatable.
neural_net = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=1000,
                           random_state=42)

# Train the neural network on the training data
neural_net.fit(train_X, train_y)

# Print confusion matrix and accuracy on the validation set
classificationSummary(valid_y, neural_net.predict(valid_X), class_names=neural_net.classes_)

In [36]:
# 5 minute exercise
# Improve the classification performance by changing the number of parameters
# Try changing the number of hidden layers, the number of neurons in each layer

In [None]:
# Now let's try a simple decision tree
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(random_state=42)

# Train on the same training split as the neural network
decision_tree.fit(train_X, train_y)


# Print confusion matrix and accuracy on the validation set
classificationSummary(valid_y, decision_tree.predict(valid_X), class_names=decision_tree.classes_)

In [38]:
# Exercise: Improve the classification performance by changing the hyperparameters
# Try adjusting max_depth, min_samples_split


In [None]:
# Now we will try a Random Forest!
from sklearn.ensemble import RandomForestClassifier

# Forest of 100 trees, seeded for repeatability
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

random_forest.fit(train_X, train_y)

# Print confusion matrix and accuracy on the validation set
classificationSummary(valid_y, random_forest.predict(valid_X), class_names=random_forest.classes_)

In [None]:
# Exercise: Improve the classification performance by changing the hyperparameters


#### Clustering



In [40]:
# Load the discussion posts again (no labels are needed for clustering)

from zipfile import ZipFile
import dmba

# Read every file in the archive, skipping directory entries
with ZipFile(dmba.get_data_file('AutoAndElectronics.zip')) as rawData:
    corpus = [rawData.read(info)
              for info in rawData.infolist()
              if not info.is_dir()]

In [None]:
# Preprocessing: Stem, tokenize, stop words
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
import nltk
# word_tokenize needs the Punkt tokenizer data. NLTK 3.9+ loads it from the
# 'punkt_tab' resource, older releases from 'punkt'; fetch both so this cell
# works with either version.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk import word_tokenize

class LemmaTokenizer:
    """Callable tokenizer for CountVectorizer.

    Splits a document with word_tokenize, discards tokens that are not
    purely alphabetic or that are English stop words, and stems the rest.
    """

    def __init__(self):
        self.stopWords = set(ENGLISH_STOP_WORDS)
        self.stemmer = EnglishStemmer()

    def __call__(self, doc):
        kept = (token for token in word_tokenize(doc)
                if token.isalpha() and token not in self.stopWords)
        return [self.stemmer.stem(token) for token in kept]

# Count stemmed tokens per post. The tokenizer already drops stop words before
# stemming; stop_words='english' is passed to the vectorizer in addition.
preprocessor = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words='english', encoding='latin1')
preprocessedText = preprocessor.fit_transform(corpus)

In [42]:
# Latent Semantic Analysis
# Step 1: re-weight the token counts with TF-IDF

from sklearn.feature_extraction.text import TfidfTransformer

tfidfTransformer = TfidfTransformer()
tfidf = tfidfTransformer.fit_transform(preprocessedText)

In [43]:
# Create 10 concepts

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

# TruncatedSVD uses a randomized solver by default. Fix the seed so the
# concept space, and the clusters built on it below, is reproducible.
svd = TruncatedSVD(10, random_state=42)
# Rescale each reduced document vector to unit length
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

lsa_tfidf = lsa.fit_transform(tfidf)

In [44]:
# Hierarchical clustering
# Build the merge tree over the reduced document vectors using average linkage

from scipy.cluster.hierarchy import linkage

Z = linkage(lsa_tfidf, method='average')

In [None]:
# Plot hierarchical cluster
# Takes about 15 seconds
import matplotlib.pylab as plt
from scipy.cluster.hierarchy import dendrogram

# Height at which branches are coloured as separate clusters. The dashed
# reference line is drawn at the same height so it marks that cut. The
# document vectors are unit length, so merge heights cannot exceed 2.
cutHeight = 0.9

fig = plt.figure(figsize=(10, 6))
fig.subplots_adjust(bottom=0.23)
# Z was built with method='average', so label the plot accordingly
plt.title('Hierarchical Clustering Dendrogram (Average linkage)')
plt.xlabel('Documents')
dendrogram(Z, color_threshold=cutHeight)
plt.axhline(y=cutHeight, color='black', linewidth=0.5, linestyle='dashed')
plt.show()

# Notice that there are two large clusters, corresponding to auto and electronics?

In [None]:
# Examine clusters
from scipy.cluster.hierarchy import fcluster

# Cut the tree into at most 20 clusters ('maxclust' may return fewer)
nclusters = 20
membership = fcluster(Z, nclusters, criterion='maxclust')

# Flag each post once: the posts are raw bytes, so search for the header as bytes
isAuto = [b'Newsgroups: rec.autos' in doc for doc in corpus]

# For each cluster: number of rec.autos posts, cluster size, and their ratio
for clNumber in range(1, nclusters + 1):
    recAutos = [auto for auto, cl in zip(isAuto, membership) if cl == clNumber]
    nmembers = len(recAutos)
    if nmembers == 0:
        # Fewer clusters than requested were formed; nothing to report
        continue
    ratioAutos = sum(recAutos) / nmembers
    print(f'{sum(recAutos):3d} of {nmembers:3d} : {ratioAutos:.2f} {"rec.autos" if ratioAutos > 0.9 else ""}')

In [47]:
# K-means clustering with k=2
from sklearn.cluster import KMeans
import pandas as pd
import math

# Cluster the reduced document vectors; seeded for repeatability
kmeans = KMeans(n_clusters=2, random_state=0, n_init='auto').fit(lsa_tfidf)



In [None]:
# Between cluster distance
# Euclidean distance between the two centroids: square each coordinate
# difference, sum the squares, then take the square root
centroids = pd.DataFrame(kmeans.cluster_centers_)
centroidDiff = centroids.iloc[0, :] - centroids.iloc[1, :]
print('Between-cluster distance: ', math.sqrt(sum(centroidDiff**2)))

In [None]:
# Within-cluster dispersion

# Take the number of clusters from the fitted model so this cell keeps
# working when k is changed
nClusters = kmeans.n_clusters
withinClusterSS = [0] * nClusters
clusterCount = [0] * nClusters

# kmeans.transform gives each document's distance to every centroid;
# accumulate the squared distance to the document's own centroid
for cluster, distance in zip(kmeans.labels_, kmeans.transform(lsa_tfidf)):
    withinClusterSS[cluster] += distance[cluster]**2
    clusterCount[cluster] += 1

for cluster, withClustSS in enumerate(withinClusterSS):
    count = clusterCount[cluster]
    if count < 2:
        # count - 1 would be zero (or negative): dispersion is undefined
        print(f'Cluster {cluster} ({count} members): dispersion undefined')
        continue
    withinClusterDispersion = math.sqrt(withClustSS / (count - 1))
    print(f'Cluster {cluster} ({count} members): {withinClusterDispersion:5.2f} within cluster')