In [None]:
import numpy as np
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Let pandas display up to 100 columns before truncating wide frames
pd.set_option("display.max_columns", 100)


In [None]:
%%capture
!python -m textblob.download_corpora


In [None]:
# Three short documents that form the toy corpus for the examples below
sentence_1, sentence_2, sentence_3 = (
    'Jen is a good student.',
    'Jen is also a great guitarist.',
    'Good students can sometimes be good guitarists',
)


# Data Cleaning
We want to singularize "guitarists" and "students" in the third sentence so that they match the singular forms used in the first two sentences.

In [None]:
# Wrap the sentence in a TextBlob to get its words, singularize every word,
# then glue the words back together with single spaces.
sentence_3_tb = TextBlob(sentence_3)
sentence_3_singular = [word.singularize() for word in sentence_3_tb.words]
sentence_3_clean = ' '.join(sentence_3_singular)
sentence_3_clean


'Good student can sometime be good guitarist'

## Bag of Words Using CountVectorizer

In [None]:
# Bag of words: learn the vocabulary (English stop words removed) from the
# three sentences and count each term per sentence
vectorizer = CountVectorizer(stop_words='english')
bow_vec = vectorizer.fit_transform(
    [sentence_1, sentence_2, sentence_3_clean]
)
bow_vec


<3x5 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [None]:
# Convert the sparse count matrix to a dense array so the counts can be read directly
bow_vec.toarray()


array([[1, 0, 0, 1, 1],
       [0, 1, 1, 1, 0],
       [2, 0, 1, 0, 1]])

In [None]:
# Tabulate the counts: one row per sentence, one column per vocabulary term
sent_df = pd.DataFrame(
    data=bow_vec.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
sent_df


Unnamed: 0,good,great,guitarist,jen,student
0,1,0,0,1,1
1,0,1,1,1,0
2,2,0,1,0,1


### Your Turn
1. Write 4 sentences of your choice.
2. Run the `CountVectorizer` on your sentences.
3. Print the results in a data frame.

In [None]:
# Solution 1


In [None]:
# Solution 2


In [None]:
# Solution 3


## TF-IDF

In [None]:
# TF-IDF, option 1: TfidfVectorizer turns the raw sentences into TF-IDF
# weights in a single step (English stop words removed)
tf_idf_vec = TfidfVectorizer(stop_words='english')
tf_idf_jen = tf_idf_vec.fit_transform(
    [sentence_1, sentence_2, sentence_3_clean]
)


In [None]:
# Show the three input sentences, one per line, for comparison with the scores
print(sentence_1, sentence_2, sentence_3_clean, sep='\n')


Jen is a good student.
Jen is also a great guitarist.
Good student can sometime be good guitarist


In [None]:
# Put the TF-IDF weights in a data frame (columns are the vocabulary terms)
# and check its dimensions
tf_df = pd.DataFrame(
    data=tf_idf_jen.toarray(),
    columns=tf_idf_vec.get_feature_names_out(),
)
tf_df.shape


(3, 5)

In [None]:
# Display the TF-IDF table: one row per sentence, one column per term
tf_df


Unnamed: 0,good,great,guitarist,jen,student
0,0.57735,0.0,0.0,0.57735,0.57735
1,0.0,0.680919,0.517856,0.517856,0.0
2,0.816497,0.0,0.408248,0.0,0.408248


In [None]:
# TF-IDF, option 2: CountVectorizer + TfidfTransformer (better for large datasets).
# The transformer re-weights the bag-of-words counts already computed in bow_vec.
tf_idf_tran = TfidfTransformer()
tf_idf_jen = tf_idf_tran.fit(bow_vec).transform(bow_vec)


In [None]:
# Tabulate the transformer's output; the column labels come from the
# CountVectorizer that produced the underlying counts
tf_df = pd.DataFrame(
    data=tf_idf_jen.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
tf_df


Unnamed: 0,good,great,guitarist,jen,student
0,0.57735,0.0,0.0,0.57735,0.57735
1,0.0,0.680919,0.517856,0.517856,0.0
2,0.816497,0.0,0.408248,0.0,0.408248


In [None]:
# Get a data frame with the TF-IDF values sorted for document 0.
# At this point tf_idf_jen is the TfidfTransformer output computed from bow_vec,
# so the row labels must come from `vectorizer` (which built bow_vec) rather
# than from the separately fitted `tf_idf_vec`.
df = pd.DataFrame(
    tf_idf_jen[0].T.toarray(),  # row 0 as a column; plain ndarray instead of np.matrix
    index=vectorizer.get_feature_names_out(),
    columns=["TF-IDF"],
)
df = df.sort_values('TF-IDF', ascending=False)
df


Unnamed: 0,TF-IDF
good,0.57735
jen,0.57735
student,0.57735
great,0.0
guitarist,0.0


In [None]:
# Same ranking taken from the table: document 0's column of the transposed
# frame, highest score first
tf_df.T[[0]].sort_values(by=0, ascending=False)


Unnamed: 0,0
good,0.57735
jen,0.57735
student,0.57735
great,0.0
guitarist,0.0


### Your Turn
1. Use the `TfidfTransformer` to transform the bag of words matrix you created above to TF-IDF.
2. Print out the results in a data frame.

In [None]:
# Solution 1


In [None]:
# Solution 2
# Print out results in a dataframe


# Another Example - Using Wikipedia API

In [None]:
%%capture output
#install Wikipedia API
!pip3 install wikipedia-api


In [None]:
import wikipediaapi


In [None]:
# Pull out the popcorn page from Wikipedia - https://en.wikipedia.org/wiki/Popcorn
topic = 'popcorn'
# NOTE(review): 'foobar' is a placeholder user agent; Wikimedia asks clients to
# send a descriptive one with contact details - replace before heavier use.
wikip = wikipediaapi.Wikipedia(user_agent='foobar', language='en')
page_ex = wikip.page(topic)
# Fail fast if the page does not exist instead of carrying on with the text of
# a missing page
if not page_ex.exists():
    raise ValueError(f"Wikipedia page not found for topic: {topic!r}")
wiki_text = page_ex.text
wiki_text


'Popcorn (also called popped corn, popcorns, or pop-corn) is a variety of corn kernel which expands and puffs up when heated; the same names also refer to the foodstuff produced by the expansion.\nA popcorn kernel\'s strong hull contains the seed\'s hard, starchy shell endosperm with 14–20% moisture, which turns to steam as the kernel is heated. Pressure from the steam continues to build until the hull ruptures, allowing the kernel to forcefully expand, to 20 to 50 times its original size, and then cool.\nSome strains of corn (taxonomized as Zea mays) are cultivated specifically as popping corns. The Zea mays variety everta, a special kind of flint corn, is the most common of these.\nPopcorn is one of six major types of corn, which includes dent corn, flint corn, pod corn, flour corn, and sweet corn.\n\nHistory\nCorn was domesticated about 10,000 years ago, in what is now Mexico. Archaeologists discovered that people have known about popcorn for thousands of years. Fossil evidence from

### Clean the text - version 1

Using string replace.



In [None]:
# Step 1: newlines become spaces so the text is a single line
wiki_text_clean = wiki_text.replace("\n", " ")
# Step 2: drop the possessive "'s" first, then any apostrophes that remain
wiki_text_clean = wiki_text_clean.replace("'s", "")
wiki_text_clean = wiki_text_clean.replace("'", "")
wiki_text_clean


'Popcorn (also called popped corn, popcorns, or pop-corn) is a variety of corn kernel which expands and puffs up when heated; the same names also refer to the foodstuff produced by the expansion. A popcorn kernel strong hull contains the seed hard, starchy shell endosperm with 14–20% moisture, which turns to steam as the kernel is heated. Pressure from the steam continues to build until the hull ruptures, allowing the kernel to forcefully expand, to 20 to 50 times its original size, and then cool. Some strains of corn (taxonomized as Zea mays) are cultivated specifically as popping corns. The Zea mays variety everta, a special kind of flint corn, is the most common of these. Popcorn is one of six major types of corn, which includes dent corn, flint corn, pod corn, flour corn, and sweet corn.  History Corn was domesticated about 10,000 years ago, in what is now Mexico. Archaeologists discovered that people have known about popcorn for thousands of years. Fossil evidence from Peru sugges

### Clean the text - version 2

Using a `for` loop and string replace.




In [None]:
wiki_text_clean = wiki_text.lower()
# Turn newlines, possessive "'s" and any remaining apostrophes into spaces
for c in ["\n", "'s", "'"]:
    wiki_text_clean = wiki_text_clean.replace(c, " ")
# A single replace("  ", " ") pass only halves each run of spaces (three or
# more in a row would survive), so repeat until no double space is left
while "  " in wiki_text_clean:
    wiki_text_clean = wiki_text_clean.replace("  ", " ")
wiki_text_clean

'popcorn (also called popped corn, popcorns, or pop-corn) is a variety of corn kernel which expands and puffs up when heated; the same names also refer to the foodstuff produced by the expansion. a popcorn kernel strong hull contains the seed hard, starchy shell endosperm with 14–20% moisture, which turns to steam as the kernel is heated. pressure from the steam continues to build until the hull ruptures, allowing the kernel to forcefully expand, to 20 to 50 times its original size, and then cool. some strains of corn (taxonomized as zea mays) are cultivated specifically as popping corns. the zea mays variety everta, a special kind of flint corn, is the most common of these. popcorn is one of six major types of corn, which includes dent corn, flint corn, pod corn, flour corn, and sweet corn. history corn was domesticated about 10,000 years ago, in what is now mexico. archaeologists discovered that people have known about popcorn for thousands of years. fossil evidence from peru suggest

### Clean the text - version 3

Using a regular expression.


In [None]:
import re

# Lower-case the text, then replace with a space (a) every run of characters
# other than a-z, 0-9, '.', ',', parentheses and space, and (b) every single
# letter that stands alone as a word
wiki_text_clean = re.compile(r'[^a-z0-9.,)( ]+|\b[a-z]\b').sub(' ', wiki_text.lower())
wiki_text_clean

'popcorn (also called popped corn, popcorns, or pop corn) is a variety of corn kernel which expands and puffs up when heated  the same names also refer to the foodstuff produced by the expansion. a popcorn kernel s strong hull contains the seed s hard, starchy shell endosperm with 14 20  moisture, which turns to steam as the kernel is heated. pressure from the steam continues to build until the hull ruptures, allowing the kernel to forcefully expand, to 20 to 50 times its original size, and then cool. some strains of corn (taxonomized as zea mays) are cultivated specifically as popping corns. the zea mays variety everta, a special kind of flint corn, is the most common of these. popcorn is one of six major types of corn, which includes dent corn, flint corn, pod corn, flour corn, and sweet corn. history corn was domesticated about 10,000 years ago, in what is now mexico. archaeologists discovered that people have known about popcorn for thousands of years. fossil evidence from peru sug

In [None]:
# Wrap the cleaned text in a TextBlob so it can be split into sentences,
# then count how many sentences were found
wiki_blob = TextBlob(wiki_text_clean)
len(wiki_blob.sentences)


128

In [None]:
# Keep just the first five sentences for the example
my_sentences = wiki_blob.sentences[:5]
my_sentences


[Sentence("Popcorn (also called popped corn, popcorns, or pop-corn) is a variety of corn kernel which expands and puffs up when heated; the same names also refer to the foodstuff produced by the expansion."),
 Sentence("A popcorn kernel strong hull contains the seed hard, starchy shell endosperm with 14–20% moisture, which turns to steam as the kernel is heated."),
 Sentence("Pressure from the steam continues to build until the hull ruptures, allowing the kernel to forcefully expand, to 20 to 50 times its original size, and then cool."),
 Sentence("Some strains of corn (taxonomized as Zea mays) are cultivated specifically as popping corns."),
 Sentence("The Zea mays variety everta, a special kind of flint corn, is the most common of these.")]

In [None]:
# The vectorizer is fed plain strings, so convert each TextBlob sentence with str()
my_sentences_str = list(map(str, my_sentences))
my_sentences_str


['Popcorn (also called popped corn, popcorns, or pop-corn) is a variety of corn kernel which expands and puffs up when heated; the same names also refer to the foodstuff produced by the expansion.',
 'A popcorn kernel strong hull contains the seed hard, starchy shell endosperm with 14–20% moisture, which turns to steam as the kernel is heated.',
 'Pressure from the steam continues to build until the hull ruptures, allowing the kernel to forcefully expand, to 20 to 50 times its original size, and then cool.',
 'Some strains of corn (taxonomized as Zea mays) are cultivated specifically as popping corns.',
 'The Zea mays variety everta, a special kind of flint corn, is the most common of these.']

In [None]:
# Fit a fresh TF-IDF vectorizer (English stop words removed) on the five
# sentences; the result has one row per sentence and one column per term
tf_idf_vec = TfidfVectorizer(stop_words='english')
tf_idf_pop = tf_idf_vec.fit_transform(
    my_sentences_str
)
tf_idf_pop.shape


(5, 54)

In [None]:
# Transposing swaps the axes: terms become rows, sentences become columns
tf_idf_pop.T.shape


(54, 5)

In [None]:
# List the vocabulary learned by the vectorizer (used below as the column labels)
tf_idf_vec.get_feature_names_out()


array(['14', '20', '50', 'allowing', 'build', 'called', 'common',
       'contains', 'continues', 'cool', 'corn', 'corns', 'cultivated',
       'endosperm', 'everta', 'expand', 'expands', 'expansion', 'flint',
       'foodstuff', 'forcefully', 'hard', 'heated', 'hull', 'kernel',
       'kind', 'mays', 'moisture', 'names', 'original', 'pop', 'popcorn',
       'popcorns', 'popped', 'popping', 'pressure', 'produced', 'puffs',
       'refer', 'ruptures', 'seed', 'shell', 'size', 'special',
       'specifically', 'starchy', 'steam', 'strains', 'strong',
       'taxonomized', 'times', 'turns', 'variety', 'zea'], dtype=object)

In [None]:
# Build the TF-IDF table and show it transposed, so each row is a term and
# each column is one of the five sentences
tf_df = pd.DataFrame(
    data=tf_idf_pop.toarray(),
    columns=tf_idf_vec.get_feature_names_out(),
)
tf_df.T


Unnamed: 0,0,1,2,3,4
14,0.0,0.257781,0.0,0.0,0.0
20,0.0,0.207976,0.212599,0.0,0.0
50,0.0,0.0,0.263512,0.0,0.0
allowing,0.0,0.0,0.263512,0.0,0.0
build,0.0,0.0,0.263512,0.0,0.0
called,0.239471,0.0,0.0,0.0,0.0
common,0.0,0.0,0.0,0.0,0.367576
contains,0.0,0.257781,0.0,0.0,0.0
continues,0.0,0.0,0.263512,0.0,0.0
cool,0.0,0.0,0.263512,0.0,0.0


In [None]:
# Rank the terms of document 0 by TF-IDF score: take row 0 as a column,
# label it with the vocabulary, sort from highest to lowest, show the top five
df = pd.DataFrame(
    tf_idf_pop[0].T.toarray(),
    index=tf_idf_vec.get_feature_names_out(),
    columns=["TF-IDF"],
).sort_values(by='TF-IDF', ascending=False)
df.head(5)


Unnamed: 0,TF-IDF
corn,0.48113
expands,0.239471
popcorns,0.239471
pop,0.239471
popped,0.239471


In [None]:
# Same top-five ranking, read straight from the table: document 0's column of
# the transposed frame, highest score first
tf_df.T[[0]].sort_values(by=0, ascending=False).head(5)


Unnamed: 0,0
corn,0.48113
expands,0.239471
popcorns,0.239471
pop,0.239471
popped,0.239471
