In [None]:
import numpy as np
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Let pandas show up to 100 columns before it starts eliding wide frames.
pd.set_option("display.max_columns", 100)


In [None]:
%%capture
# Run TextBlob's corpus downloader in a shell; %%capture keeps its log out of the notebook output.
!python -m textblob.download_corpora


In [None]:
# Toy corpus: three short sentences that share a few words (good, student, guitarist, Jen).
sentence_1, sentence_2, sentence_3 = (
    'Jen is a good student.',
    'Jen is also a great guitarist.',
    'Good students can sometimes be good guitarists',
)


# Data Cleaning
We want to singularize "guitarists" and "students" in the third sentence so that they match the singular forms used in the first two sentences.

In [None]:
# Wrap the sentence in a TextBlob to get access to its word list
sentence_3_tb = TextBlob(sentence_3)
# Singularize every word of the sentence, one at a time
sentence_3_singular = [word.singularize() for word in sentence_3_tb.words]
# Stitch the singular words back into one space-separated string
sentence_3_clean = ' '.join(sentence_3_singular)
sentence_3_clean


## Bag of Words Using CountVectorizer

In [None]:
# Bag of words: count each term per sentence, ignoring English stop words
vectorizer = CountVectorizer(stop_words='english')
bow_vec = vectorizer.fit_transform(
    [sentence_1, sentence_2, sentence_3_clean]
)
bow_vec


In [None]:
# Show the counts as a dense array: one row per sentence, one column per vocabulary term
bow_vec.toarray()


In [None]:
# Label each count column with its vocabulary term and display the result
sent_df = pd.DataFrame(
    bow_vec.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
sent_df


### Your Turn
1. Write 4 sentences of your choice.
2. Run the `CountVectorizer` on your sentences.
3. Print the results in a data frame.

In [None]:
# Solution 1


In [None]:
# Solution 2


In [None]:
# Solution 3


## TF-IDF

In [None]:
# TF-IDF, Option 1: TfidfVectorizer goes from raw text to TF-IDF weights in one step
tf_idf_vec = TfidfVectorizer(stop_words='english')
tf_idf_jen = tf_idf_vec.fit_transform(
    [sentence_1, sentence_2, sentence_3_clean]
)


In [None]:
# Echo the three input sentences, one per line, for reference next to the TF-IDF table
print(sentence_1, sentence_2, sentence_3_clean, sep='\n')


In [None]:
# Put the TF-IDF weights in a data frame with one column per term, then check its size
tf_df = pd.DataFrame(
    tf_idf_jen.toarray(),
    columns=tf_idf_vec.get_feature_names_out(),
)
tf_df.shape


In [None]:
# Display the TF-IDF table built in the previous cell
tf_df


In [None]:
# Perform the TF-IDF transformation - Option 2 (CountVectorizer + TfidfTransformer - better for large datasets)
# The transformer is fitted on the bag-of-words counts in bow_vec rather than on raw text.
# Note: this rebinds tf_idf_jen, replacing the Option 1 result.
tf_idf_tran = TfidfTransformer()
tf_idf_jen = tf_idf_tran.fit_transform(bow_vec)


In [None]:
# Tabulate the Option 2 weights; columns come from the CountVectorizer that produced bow_vec
tf_df = pd.DataFrame(
    tf_idf_jen.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
tf_df


In [None]:
# Get a data frame with the TF-IDF values sorted for document 0.
# tf_idf_jen was computed from bow_vec (Option 2), so its columns follow the
# vocabulary of `vectorizer`. Label the rows with that same vectorizer instead of
# tf_idf_vec: tf_idf_vec is a separate object that is re-fitted on other text
# later in this notebook, which would make the labels mismatch on a re-run.
df = pd.DataFrame(
    tf_idf_jen[0].T.toarray(),  # row 0 transposed to a column: one value per term
    index=vectorizer.get_feature_names_out(),
    columns=["TF-IDF"],
)
df = df.sort_values('TF-IDF', ascending=False)
df


In [None]:
# Same ranking straight from tf_df: take row 0 (document 0) and sort its term weights, highest first
tf_df.loc[0].sort_values(ascending=False)

### Your Turn
1. Use the `TfidfTransformer` to transform the bag of words matrix you created above to TF-IDF.
2. Print out the results in a data frame.

In [None]:
# Solution 1


In [None]:
# Solution 2


# Another Example - Using Wikipedia API

In [None]:
%%capture output
#install Wikipedia API
!pip3 install wikipedia-api


In [None]:
import wikipediaapi


In [None]:
# Pull out the popcorn page from wikipedia - https://en.wikipedia.org/wiki/Popcorn
topic = 'popcorn'
# NOTE(review): 'foobar' is a placeholder user agent; consider replacing it with a
# descriptive one (project name + contact) before heavier use -- confirm against
# the Wikimedia User-Agent policy.
wikip = wikipediaapi.Wikipedia(user_agent = 'foobar')
page_ex = wikip.page(topic)

# Fail here with a clear message if the page is missing; otherwise the empty text
# would only surface later as a confusing error in the vectorizer cells.
if not page_ex.exists():
    raise ValueError(f"Wikipedia page not found for topic: {topic!r}")

wiki_text = page_ex.text
# Preview the first 500 characters instead of dumping the whole article
wiki_text[:500]


### Clean the text - version 1

Using string replace.



In [None]:
# Replace newline chars with spaces before doing any processing. Strip the ' and "s" from possessives
# Turn newlines into spaces first, then drop possessive "'s" and any remaining apostrophes
wiki_text_clean = wiki_text.replace("\n", " ").replace("'s", "").replace("'", "")
wiki_text_clean


### Clean the text - version 2

Using a `for` loop and string replace.




In [None]:
# Lowercase, then replace newlines, possessive "'s" and stray apostrophes with a space
wiki_text_clean = wiki_text.lower()
for c in ["\n", "'s", "'"]:
    wiki_text_clean = wiki_text_clean.replace(c, " ")

# The replacements above can leave runs of several spaces, and a single
# "  " -> " " pass only halves them, so repeat until no double space remains.
while "  " in wiki_text_clean:
    wiki_text_clean = wiki_text_clean.replace("  ", " ")
wiki_text_clean

### Clean the text - version 3

Using a regular expression.


In [None]:
import re

# Matches one or more consecutive newlines, possessive "'s", apostrophes or spaces
pat = re.compile(r"(\n|'s|'| )+")
# Lowercase the text, then squash every matched run down to a single space
wiki_text_clean = pat.sub(' ', wiki_text.lower())
wiki_text_clean


In [None]:
# Break up single string into separate sentences
# Wrapping the cleaned text in a TextBlob exposes it as a list of sentences.
wiki_blob = TextBlob(wiki_text_clean)
# Number of sentences TextBlob found in the page
len(wiki_blob.sentences)


In [None]:
# Keep just the first five sentences to keep the example small
my_sentences = wiki_blob.sentences[:5]
my_sentences


In [None]:
# The vectorizer expects plain strings, so convert each TextBlob sentence with str()
my_sentences_str = list(map(str, my_sentences))
my_sentences_str


In [None]:
# Fit a fresh TF-IDF vectorizer on the five popcorn sentences.
# Note: this rebinds tf_idf_vec, replacing the vectorizer fitted on the Jen sentences.
tf_idf_vec = TfidfVectorizer(stop_words='english')
tf_idf_pop = tf_idf_vec.fit_transform(my_sentences_str)
# Shape of the result: (number of sentences, number of vocabulary terms)
tf_idf_pop.shape


In [None]:
# Transposing swaps the two axes, so the shape comes out reversed
tf_idf_pop.T.shape


In [None]:
# Vocabulary terms learned by the vectorizer, in the same order as the matrix columns
tf_idf_vec.get_feature_names_out()


In [None]:
# Tabulate the popcorn TF-IDF weights, then flip the table so terms are rows
# and sentences are columns
tf_df = pd.DataFrame(
    tf_idf_pop.toarray(),
    columns=tf_idf_vec.get_feature_names_out(),
)
tf_df.T


In [None]:
# Rank the terms of document 0 by TF-IDF weight and show the five highest
df = pd.DataFrame(
    tf_idf_pop[0].T.todense(),  # row 0 transposed to a column: one value per term
    index=tf_idf_vec.get_feature_names_out(),
    columns=["TF-IDF"],
)
df = df.sort_values(by='TF-IDF', ascending=False)
df.head(5)


In [None]:
# Same top-5 ranking taken from tf_df: select document 0 as a one-column frame, sort, keep five rows
tf_df.loc[[0]].T.sort_values(by=[0], ascending=False).head(5)
