In [1]:
import numpy as np
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Allow pandas to render up to 100 columns before truncating wide frames.
pd.set_option('display.max_columns', 100)

import nltk
# Download the NLTK 'punkt_tab' tokenizer data. The two commented-out downloads
# are left disabled; presumably they are only needed if NLTK reports the
# corresponding resource as missing — confirm before enabling.
# nltk.download('omw-1.4')
nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
%%capture
# Run TextBlob's corpora downloader; %%capture suppresses the download log.
!python -m textblob.download_corpora


In [3]:
# Three toy documents used throughout the bag-of-words and TF-IDF examples.
sentence_1, sentence_2, sentence_3 = (
    'Jen is a good student.',
    'Jen is also a great guitarist.',
    'Good students can sometimes be good guitarists',
)


# Data Cleaning
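We want to singularize *guitarists* and *students* in `sentence_3` so that they match the singular forms used in the other two sentences. Because every word is passed through `singularize()`, *sometimes* also becomes *sometime*.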

In [4]:
# Wrap the sentence in a TextBlob to get word-level access, reduce every word
# to its singular form, then stitch the words back into one string.
sentence_3_tb = TextBlob(sentence_3)
sentence_3_singular = list(map(lambda word: word.singularize(), sentence_3_tb.words))
sentence_3_clean = ' '.join(sentence_3_singular)
sentence_3_clean


'Good student can sometime be good guitarist'

## Bag of Words Using CountVectorizer

In [5]:
# Bag of words: learn the vocabulary (English stop words removed) from the
# three sentences and count each term per sentence in a single step.
vectorizer = CountVectorizer(stop_words='english')
bow_vec = vectorizer.fit_transform(
    raw_documents=[sentence_1, sentence_2, sentence_3_clean]
)
bow_vec


<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 9 stored elements and shape (3, 5)>

In [6]:
# Expand the sparse count matrix into a dense array so every count is visible.
bow_vec.toarray()


array([[1, 0, 0, 1, 1],
       [0, 1, 1, 1, 0],
       [2, 0, 1, 0, 1]])

In [7]:
# Show the counts as a table: one row per sentence, one column per vocabulary term
sent_df = pd.DataFrame(
    data=bow_vec.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
sent_df


Unnamed: 0,good,great,guitarist,jen,student
0,1,0,0,1,1
1,0,1,1,1,0
2,2,0,1,0,1


### Your Turn
1. Write 4 sentences of your choice.
2. Run the `CountVectorizer` on your sentences.
3. Print the results in a data frame.

In [8]:
# Solution 1
# Four sentences that serve as the corpus for the exercise below.
my_sents = [
    "It was the best of times.",
    "Call me Ishmael.",
    "To be, or not to be.  That is the question.",
    "We do not choose these things because they are easy."
]
my_sents

['It was the best of times.',
 'Call me Ishmael.',
 'To be, or not to be.  That is the question.',
 'We do not choose these things because they are easy.']

In [9]:
# Solution 2
# Count the non-stop-word terms in each of the four sentences and show the
# resulting document-term matrix in dense form.
my_vectorizer = CountVectorizer(stop_words='english')
my_bow_vec = my_vectorizer.fit_transform(raw_documents=my_sents)
my_bow_vec.toarray()


array([[1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0],
       [0, 1, 1, 0, 0, 1, 0]])

In [10]:
# Solution 3
# Tabulate the counts with the learned vocabulary terms as column labels.
my_sent_df = pd.DataFrame(
    data=my_bow_vec.toarray(),
    columns=my_vectorizer.get_feature_names_out(),
)
my_sent_df


Unnamed: 0,best,choose,easy,ishmael,question,things,times
0,1,0,0,0,0,0,1
1,0,0,0,1,0,0,0
2,0,0,0,0,1,0,0
3,0,1,1,0,0,1,0


## TF-IDF

In [11]:
# TF-IDF, option 1: TfidfVectorizer goes from raw text to TF-IDF weights in one step
tf_idf_vec = TfidfVectorizer(stop_words='english')
tf_idf_jen = tf_idf_vec.fit_transform(
    raw_documents=[sentence_1, sentence_2, sentence_3_clean]
)
tf_idf_jen

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9 stored elements and shape (3, 5)>

In [12]:
# Echo the three documents, one per line, for reference next to the TF-IDF table.
print(sentence_1, sentence_2, sentence_3_clean, sep='\n')


Jen is a good student.
Jen is also a great guitarist.
Good student can sometime be good guitarist


In [13]:
# Put the TF-IDF weights in a data frame (columns = vocabulary terms) and check its size
tf_df = pd.DataFrame(
    data=tf_idf_jen.toarray(),
    columns=tf_idf_vec.get_feature_names_out(),
)
tf_df.shape


(3, 5)

In [14]:
# Display the TF-IDF table: one row per sentence, one column per term.
tf_df


Unnamed: 0,good,great,guitarist,jen,student
0,0.57735,0.0,0.0,0.57735,0.57735
1,0.0,0.680919,0.517856,0.517856,0.0
2,0.816497,0.0,0.408248,0.0,0.408248


In [15]:
# Perform the TF-IDF transformation - Option 2 (CountVectorizer + TfidfTransformer - better for large datasets)
# Learn the IDF weights from the existing count matrix, then re-weight that same matrix.
tf_idf_tran = TfidfTransformer().fit(bow_vec)
tf_idf_jen = tf_idf_tran.transform(bow_vec)
tf_idf_jen

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9 stored elements and shape (3, 5)>

In [16]:
# Tabulate the option-2 weights; the column labels come from the CountVectorizer
# that produced the underlying counts.
tf_df = pd.DataFrame(
    data=tf_idf_jen.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
tf_df


Unnamed: 0,good,great,guitarist,jen,student
0,0.57735,0.0,0.0,0.57735,0.57735
1,0.0,0.680919,0.517856,0.517856,0.0
2,0.816497,0.0,0.408248,0.0,0.408248


In [17]:
# Get a data frame with the TF-IDF values sorted for document 0.
# tf_idf_jen was last computed from bow_vec, so the row labels must come from
# `vectorizer` (the CountVectorizer that built bow_vec). Labelling with a
# different vectorizer object is only correct while the two vocabularies
# happen to be identical.
df = pd.DataFrame(
    tf_idf_jen[0].T.toarray(),  # row 0 as a (n_terms, 1) column
    index=vectorizer.get_feature_names_out(),
    columns=["TF-IDF"],
)
df = df.sort_values('TF-IDF', ascending=False)
df


Unnamed: 0,TF-IDF
good,0.57735
student,0.57735
jen,0.57735
guitarist,0.0
great,0.0


In [18]:
# Same ranking straight from the table: take row 0 (document 0) and sort its weights.
tf_df.loc[0].sort_values(ascending=False)

good         0.57735
student      0.57735
jen          0.57735
guitarist    0.00000
great        0.00000
Name: 0, dtype: float64

### Your Turn
1. Use the `TfidfTransformer` to transform the bag of words matrix you created above to TF-IDF.
2. Print out the results in a data frame.

In [19]:
# Solution 1
# Learn the IDF weights from the exercise count matrix, then re-weight it.
my_tf_idf_tran = TfidfTransformer().fit(my_bow_vec)
my_tf_idf_jen = my_tf_idf_tran.transform(my_bow_vec)
my_tf_idf_jen

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7 stored elements and shape (4, 7)>

In [20]:
# Expand the sparse TF-IDF matrix into a dense array to inspect the weights.
my_tf_idf_jen.toarray()

array([[0.70710678, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678],
       [0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        ],
       [0.        , 0.57735027, 0.57735027, 0.        , 0.        ,
        0.57735027, 0.        ]])

In [21]:
# Solution 2
# Tabulate the TF-IDF weights with the vocabulary terms as column labels
my_tf_df = pd.DataFrame(
    data=my_tf_idf_jen.toarray(),
    columns=my_vectorizer.get_feature_names_out(),
)
my_tf_df


Unnamed: 0,best,choose,easy,ishmael,question,things,times
0,0.707107,0.0,0.0,0.0,0.0,0.0,0.707107
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.57735,0.57735,0.0,0.0,0.57735,0.0


# Another Example - Using Wikipedia API

In [22]:
%%capture output
#install Wikipedia API
!pip3 install wikipedia-api


In [23]:
import wikipediaapi


In [24]:
# Pull out the popcorn page from wikipedia - https://en.wikipedia.org/wiki/Popcorn
topic = 'popcorn'
# NOTE(review): 'foobar' looks like a placeholder user agent — confirm what the
# API expects and consider a descriptive value.
wikip = wikipediaapi.Wikipedia(user_agent = 'foobar')
page_ex = wikip.page(topic)
# Text of the page as one string; this is the input to the cleaning steps below.
wiki_text = page_ex.text
wiki_text


'Popcorn (also called popped corn, popcorns, or pop-corn) is a variety of corn kernel which expands and puffs up when heated. The term also refers to the snack food produced by the expansion. It is one of the oldest snacks, with evidence of popcorn dating back thousands of years in the Americas. It is commonly eaten salted, sweetened, or with artificial flavorings. \nA popcorn kernel\'s strong hull contains the seed\'s hard, starchy shell endosperm with 14–20% moisture, which turns to steam as the kernel is heated. Pressure from the steam continues to build until the hull ruptures, allowing the kernel to forcefully expand, to 20 to 50 times its original size, and then cool.\nSome strains of corn (taxonomized as Zea mays) are cultivated specifically as popping corns. The Zea mays variety everta, a special kind of flint corn, is the most common of these. Popcorn is one of six major types of corn, which includes dent corn, flint corn, pod corn, flour corn, and sweet corn.\n\nHistory\nCorn

### Clean the text - version 1

Using string replace.



In [25]:
# Flatten the text onto one line first, then drop possessive "'s" suffixes,
# and finally remove any apostrophes that are left.
wiki_text_clean = wiki_text.replace("\n", " ")
wiki_text_clean = wiki_text_clean.replace("'s", '')
wiki_text_clean = wiki_text_clean.replace("'", '')
wiki_text_clean


'Popcorn (also called popped corn, popcorns, or pop-corn) is a variety of corn kernel which expands and puffs up when heated. The term also refers to the snack food produced by the expansion. It is one of the oldest snacks, with evidence of popcorn dating back thousands of years in the Americas. It is commonly eaten salted, sweetened, or with artificial flavorings.  A popcorn kernel strong hull contains the seed hard, starchy shell endosperm with 14–20% moisture, which turns to steam as the kernel is heated. Pressure from the steam continues to build until the hull ruptures, allowing the kernel to forcefully expand, to 20 to 50 times its original size, and then cool. Some strains of corn (taxonomized as Zea mays) are cultivated specifically as popping corns. The Zea mays variety everta, a special kind of flint corn, is the most common of these. Popcorn is one of six major types of corn, which includes dent corn, flint corn, pod corn, flour corn, and sweet corn.  History Corn was domest

### Clean the text - version 2

Using a `for` loop and string replace.




In [26]:
# Lower-case the text, then swap each unwanted token (newline, possessive "'s",
# stray apostrophe) for a space.
wiki_text_clean = wiki_text.lower()
for c in ["\n", "'s", "'"]:
    wiki_text_clean = wiki_text_clean.replace(c, " ")
# One replace("  ", " ") pass only halves a run of spaces (three spaces would
# still leave two), so repeat until no double space remains.
while "  " in wiki_text_clean:
    wiki_text_clean = wiki_text_clean.replace("  ", " ")
wiki_text_clean

'popcorn (also called popped corn, popcorns, or pop-corn) is a variety of corn kernel which expands and puffs up when heated. the term also refers to the snack food produced by the expansion. it is one of the oldest snacks, with evidence of popcorn dating back thousands of years in the americas. it is commonly eaten salted, sweetened, or with artificial flavorings. a popcorn kernel strong hull contains the seed hard, starchy shell endosperm with 14–20% moisture, which turns to steam as the kernel is heated. pressure from the steam continues to build until the hull ruptures, allowing the kernel to forcefully expand, to 20 to 50 times its original size, and then cool. some strains of corn (taxonomized as zea mays) are cultivated specifically as popping corns. the zea mays variety everta, a special kind of flint corn, is the most common of these. popcorn is one of six major types of corn, which includes dent corn, flint corn, pod corn, flour corn, and sweet corn. history corn was domestic

### Clean the text - version 3

Using a regular expression.


In [27]:
import re

# Collapse every run of newlines, possessive "'s" suffixes, apostrophes and
# spaces into a single space in one pass.
# The \b after 's limits that alternative to a real suffix (end of a word), so an
# opening quote in front of a word starting with "s" loses only the quote and
# not its first letter. The group is non-capturing because it is never read.
pat = re.compile(r"(?:\n|'s\b|'| )+")
wiki_text_clean = pat.sub(' ', wiki_text.lower())
wiki_text_clean


'popcorn (also called popped corn, popcorns, or pop-corn) is a variety of corn kernel which expands and puffs up when heated. the term also refers to the snack food produced by the expansion. it is one of the oldest snacks, with evidence of popcorn dating back thousands of years in the americas. it is commonly eaten salted, sweetened, or with artificial flavorings. a popcorn kernel strong hull contains the seed hard, starchy shell endosperm with 14–20% moisture, which turns to steam as the kernel is heated. pressure from the steam continues to build until the hull ruptures, allowing the kernel to forcefully expand, to 20 to 50 times its original size, and then cool. some strains of corn (taxonomized as zea mays) are cultivated specifically as popping corns. the zea mays variety everta, a special kind of flint corn, is the most common of these. popcorn is one of six major types of corn, which includes dent corn, flint corn, pod corn, flour corn, and sweet corn. history corn was domestic

In [28]:
# Break up single string into separate sentences
# TextBlob exposes the sentence split through `.sentences`; show how many there are.
wiki_blob = TextBlob(wiki_text_clean)
len(wiki_blob.sentences)


126

In [29]:
# Keep just the opening five sentences so the example stays small
my_sentences = wiki_blob.sentences[:5]
my_sentences


[Sentence("popcorn (also called popped corn, popcorns, or pop-corn) is a variety of corn kernel which expands and puffs up when heated."),
 Sentence("the term also refers to the snack food produced by the expansion."),
 Sentence("it is one of the oldest snacks, with evidence of popcorn dating back thousands of years in the americas."),
 Sentence("it is commonly eaten salted, sweetened, or with artificial flavorings."),
 Sentence("a popcorn kernel strong hull contains the seed hard, starchy shell endosperm with 14–20% moisture, which turns to steam as the kernel is heated.")]

In [30]:
# The vectorizer is fed plain strings, so cast each Sentence object with str()
my_sentences_str = list(map(str, my_sentences))
my_sentences_str


['popcorn (also called popped corn, popcorns, or pop-corn) is a variety of corn kernel which expands and puffs up when heated.',
 'the term also refers to the snack food produced by the expansion.',
 'it is one of the oldest snacks, with evidence of popcorn dating back thousands of years in the americas.',
 'it is commonly eaten salted, sweetened, or with artificial flavorings.',
 'a popcorn kernel strong hull contains the seed hard, starchy shell endosperm with 14–20% moisture, which turns to steam as the kernel is heated.']

In [31]:
# TF-IDF over the five popcorn sentences (English stop words removed);
# the shape is (number of sentences, number of vocabulary terms).
tf_idf_vec = TfidfVectorizer(stop_words='english')
tf_idf_pop = tf_idf_vec.fit_transform(raw_documents=my_sentences_str)
tf_idf_pop.shape


(5, 43)

In [32]:
# Show the sparse-matrix summary (dtype, stored elements, shape).
tf_idf_pop

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 47 stored elements and shape (5, 43)>

In [33]:
# Transposing swaps the axes: terms become rows and sentences become columns.
tf_idf_pop.T.shape


(43, 5)

In [34]:
# List the vocabulary terms learned by the vectorizer, in column order.
tf_idf_vec.get_feature_names_out()


array(['14', '20', 'americas', 'artificial', 'called', 'commonly',
       'contains', 'corn', 'dating', 'eaten', 'endosperm', 'evidence',
       'expands', 'expansion', 'flavorings', 'food', 'hard', 'heated',
       'hull', 'kernel', 'moisture', 'oldest', 'pop', 'popcorn',
       'popcorns', 'popped', 'produced', 'puffs', 'refers', 'salted',
       'seed', 'shell', 'snack', 'snacks', 'starchy', 'steam', 'strong',
       'sweetened', 'term', 'thousands', 'turns', 'variety', 'years'],
      dtype=object)

In [35]:
# Tabulate the weights, then flip the table so that terms are rows and sentences
# are columns, which is easier to read with this many terms.
tf_df = pd.DataFrame(
    data=tf_idf_pop.toarray(),
    columns=tf_idf_vec.get_feature_names_out(),
)
tf_df.T


Unnamed: 0,0,1,2,3,4
14,0.0,0.0,0.0,0.0,0.244682
20,0.0,0.0,0.0,0.0,0.244682
americas,0.0,0.0,0.366408,0.0,0.0
artificial,0.0,0.0,0.0,0.408248,0.0
called,0.237354,0.0,0.0,0.0,0.0
commonly,0.0,0.0,0.0,0.408248,0.0
contains,0.0,0.0,0.0,0.0,0.244682
corn,0.712062,0.0,0.0,0.0,0.0
dating,0.0,0.0,0.366408,0.0,0.0
eaten,0.0,0.0,0.0,0.408248,0.0


In [36]:
# Rank the terms of document 0 by TF-IDF weight and show the five heaviest.
df = (
    pd.DataFrame(
        tf_idf_pop[0].T.todense(),
        index=tf_idf_vec.get_feature_names_out(),
        columns=["TF-IDF"],
    )
    .sort_values('TF-IDF', ascending=False)
)
df.head(5)


Unnamed: 0,TF-IDF
corn,0.712062
called,0.237354
expands,0.237354
pop,0.237354
popcorns,0.237354


In [37]:
# Same top five, starting from the table: take row 0 (document 0) as a one-column
# frame indexed by term, then sort it by weight.
tf_df.loc[[0]].T.sort_values(by=0, ascending=False).head(5)


Unnamed: 0,0
corn,0.712062
called,0.237354
expands,0.237354
pop,0.237354
popcorns,0.237354


In [38]:
# Document 0's weights as a dense column vector, one row per vocabulary term.
tf_idf_pop[0].transpose().todense()

matrix([[0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.23735402],
        [0.        ],
        [0.        ],
        [0.71206206],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.23735402],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.19149573],
        [0.        ],
        [0.19149573],
        [0.        ],
        [0.        ],
        [0.23735402],
        [0.15895875],
        [0.23735402],
        [0.23735402],
        [0.        ],
        [0.23735402],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.23735402],
        [0.        ]])