In [1]:
import numpy as np
import pandas as pd

from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

pd.options.display.max_columns = 100

import nltk
# nltk.download('omw-1.4')
nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Gary Example


In [2]:
my_df = pd.DataFrame()
my_df["names"] = ['Amantha', 'Brendon', 'Nate', 'Sam', 'Betty', 'Christine', 'Gin', 'Ken', 'Susy']
my_df["ages"] = [ 19, 23, 24, 30, 16, 18, 22, 18, 15 ]
my_df["genders_txt"] = "female male male male female female female male female".split()
my_df["genders"] = [ 1, 0, 0, 0, 1, 1, 1, 0, 1 ]
my_df["music_band_txt"] = "Coldplay Coldplay LinkinPark LinkinPark Coldplay LinkinPark LinkinPark Coldplay Coldplay".split()

my_df


Unnamed: 0,names,ages,genders_txt,genders,music_band_txt
0,Amantha,19,female,1,Coldplay
1,Brendon,23,male,0,Coldplay
2,Nate,24,male,0,LinkinPark
3,Sam,30,male,0,LinkinPark
4,Betty,16,female,1,Coldplay
5,Christine,18,female,1,LinkinPark
6,Gin,22,female,1,LinkinPark
7,Ken,18,male,0,Coldplay
8,Susy,15,female,1,Coldplay


In [3]:
my_df.select_dtypes("int")

Unnamed: 0,ages,genders
0,19,1
1,23,0
2,24,0
3,30,0
4,16,1
5,18,1
6,22,1
7,18,0
8,15,1


Fit nearest neighbors


In [4]:
nn = NearestNeighbors().fit(my_df.select_dtypes("int"))


Get nearest neighbors distances


In [5]:
gary = pd.DataFrame( {"ages": [23], "genders": [0] } )
gary

Unnamed: 0,ages,genders
0,23,0


In [6]:
distances, indices = nn.kneighbors(
  X = gary,
  n_neighbors = 3,
)


In [7]:
distances[0]**2


array([0., 1., 2.])

In [8]:
indices[0]


array([1, 2, 6])

Get people matching index


In [9]:
my_df.iloc[indices[0]]


Unnamed: 0,names,ages,genders_txt,genders,music_band_txt
1,Brendon,23,male,0,Coldplay
2,Nate,24,male,0,LinkinPark
6,Gin,22,female,1,LinkinPark


Vote

In [10]:
my_df.iloc[indices[0]]["music_band_txt"].mode()[0]

'LinkinPark'

Repeat with K = all rows

In [11]:
d_i = nn.kneighbors(gary, n_neighbors = my_df.shape[0])
distances, indices = np.array(d_i).reshape(2,9)
distances**2, indices


(array([ 0.,  1.,  2., 17., 25., 26., 49., 50., 65.]),
 array([1., 2., 6., 0., 7., 5., 3., 4., 8.]))

In [12]:
(
  my_df
    .iloc[indices]
    .join( pd.DataFrame( { "distances^2": distances**2 }, index = indices ) )
)

Unnamed: 0,names,ages,genders_txt,genders,music_band_txt,distances^2
1.0,Brendon,23,male,0,Coldplay,0.0
2.0,Nate,24,male,0,LinkinPark,1.0
6.0,Gin,22,female,1,LinkinPark,2.0
0.0,Amantha,19,female,1,Coldplay,17.0
7.0,Ken,18,male,0,Coldplay,25.0
5.0,Christine,18,female,1,LinkinPark,26.0
3.0,Sam,30,male,0,LinkinPark,49.0
4.0,Betty,16,female,1,Coldplay,50.0
8.0,Susy,15,female,1,Coldplay,65.0


Display vote for various values of K $\epsilon$ { 1, 3, 5, 7, 9 }

In [13]:
for k in range(1,10,2):
  vote = my_df.iloc[indices]["music_band_txt"][:k].mode()[0]
  print(f"K = {k} : {vote}")


K = 1 : Coldplay
K = 3 : LinkinPark
K = 5 : Coldplay
K = 7 : LinkinPark
K = 9 : Coldplay


# NLP

If our text data are unlabelled (as is often the case in NLP), we can use KNN to identify documents that are similar to a given document.  In this example, our documents will be sentences and the given document will be the first sentence.

In [14]:
%%capture
!python -m textblob.download_corpora


In [15]:
sentences_orig = [
  'Jen is a good student.',
  'Jen is also a great guitarist.',
  'Good students can sometimes be good guitarists',
]
sentences_orig


['Jen is a good student.',
 'Jen is also a great guitarist.',
 'Good students can sometimes be good guitarists']

# Data Cleaning
We want to singularize guitarists and students.

In [16]:
sentence_last_tb = TextBlob(sentences_orig[-1]) # Make a textblob so that we can singularize the word
sentence_last_singular = [ x.singularize() for x in sentence_last_tb.words ] # Singularize each word in the text
sentence_last_clean = ' '.join(sentence_last_singular) # Join it together into a single string
sentence_last_clean


'Good student can sometime be good guitarist'

In [17]:
sentences_clean = sentences_orig[:2] + [sentence_last_clean]
sentences_clean

['Jen is a good student.',
 'Jen is also a great guitarist.',
 'Good student can sometime be good guitarist']

## Bag of Words Using CountVectorizer

Perform the count transformation


In [18]:
vectorizer = CountVectorizer(stop_words='english')
bow_matrix = vectorizer.fit_transform(sentences_clean)


In [19]:
type(bow_matrix), bow_matrix.shape


(scipy.sparse._csr.csr_matrix, (3, 5))

In [20]:
bow_matrix.toarray()


array([[1, 0, 0, 1, 1],
       [0, 1, 1, 1, 0],
       [2, 0, 1, 0, 1]])

## TF-IDF using BoW


Perform the TF-IDF transformation


In [21]:
tf_idf_matrix = TfidfTransformer()
tf_idf_jen = tf_idf_matrix.fit_transform(bow_matrix)


In [22]:
type(tf_idf_jen), tf_idf_jen.shape


(scipy.sparse._csr.csr_matrix, (3, 5))

In [23]:
tf_idf_jen.toarray()


array([[0.57735027, 0.        , 0.        , 0.57735027, 0.57735027],
       [0.        , 0.68091856, 0.51785612, 0.51785612, 0.        ],
       [0.81649658, 0.        , 0.40824829, 0.        , 0.40824829]])

Print out results in a dataframe


In [24]:
tf_df = pd.DataFrame(
  data = tf_idf_jen.toarray(),
  columns = vectorizer.get_feature_names_out(),
)
tf_df


Unnamed: 0,good,great,guitarist,jen,student
0,0.57735,0.0,0.0,0.57735,0.57735
1,0.0,0.680919,0.517856,0.517856,0.0
2,0.816497,0.0,0.408248,0.0,0.408248


> Note: Converting a sparse matrix to a data frame is NOT something you will normally do, especially for large matrices.

## K Nearest Neighbors

Fit nearest neighbors


In [25]:
nn = NearestNeighbors().fit(tf_idf_jen)


Create the reference matrix from the tf_idf matrix


In [26]:
sent0 = tf_idf_jen[0]
sent0.shape

(1, 5)

Or ...

Create the reference matrix from the data frame


In [27]:
sent0 = np.array([tf_df.iloc[0]])
sent0.shape

(1, 5)

Get nearest neighbors distances


In [28]:
distances, indices = nn.kneighbors(
  X = sent0,
  n_neighbors = 2,
)


In [29]:
distances


array([[0.        , 0.76536686]])

In [30]:
indices


array([[0, 2]])

Pull out the original sentences given the indices.

In [31]:
# Using list comprehension
[ x for i,x in enumerate(sentences_orig) if i in indices[0] ]

['Jen is a good student.', 'Good students can sometimes be good guitarists']

In [32]:
# Converting to Numpy array
np.array(sentences_orig)[indices]


array([['Jen is a good student.',
        'Good students can sometimes be good guitarists']], dtype='<U46')

# Another Example - Using Wikipedia API

## Get text and clean

Install Wikipedia API

In [33]:
%%capture
!pip3 install wikipedia-api

In [34]:
import wikipediaapi

Pull out page from Wikipedia


In [35]:
topic = 'munchkin'
wikip = wikipediaapi.Wikipedia('foobar')
page_ex = wikip.page(topic)
wiki_text = page_ex.text
wiki_text


'A Munchkin is a native of the fictional Munchkin Country in the Oz books by American author L. Frank Baum. They first appear in the classic children\'s novel The Wonderful Wizard of Oz (1900) where they welcome Dorothy Gale to their city in Oz. The Munchkins are described as being the same height as Dorothy and they wear only shades of blue clothing, as blue is the Munchkins\' favorite color. Blue is also the predominating color that officially represents the eastern quadrant in the Land of Oz. The Munchkins have appeared in various media, including the 1939 film The Wizard of Oz, as well as in various other films and comedy acts.\n\nConcept\nWhile Baum may have written about it, there are no surviving notes for the composition of The Wonderful Wizard of Oz. The lack of this information has resulted in speculation of the term origins he used in the book, which include the word Munchkin. Baum researcher Brian Attebery has hypothesized that there might be a connection to the Münchner Ki

Replace newline chars with spaces before doing any processing. Strip the ' and "s" from possessives.


In [36]:
wiki_text_clean = (
  wiki_text
  .replace("\n"," ")
  .replace("\'s",'')
  .replace('\'','')
  .replace("(", "")
  .replace(")", "")
  .replace('"', "")
)
wiki_text_clean


'A Munchkin is a native of the fictional Munchkin Country in the Oz books by American author L. Frank Baum. They first appear in the classic children novel The Wonderful Wizard of Oz 1900 where they welcome Dorothy Gale to their city in Oz. The Munchkins are described as being the same height as Dorothy and they wear only shades of blue clothing, as blue is the Munchkins favorite color. Blue is also the predominating color that officially represents the eastern quadrant in the Land of Oz. The Munchkins have appeared in various media, including the 1939 film The Wizard of Oz, as well as in various other films and comedy acts.  Concept While Baum may have written about it, there are no surviving notes for the composition of The Wonderful Wizard of Oz. The lack of this information has resulted in speculation of the term origins he used in the book, which include the word Munchkin. Baum researcher Brian Attebery has hypothesized that there might be a connection to the Münchner Kindl, the e

Convert to textblob

In [37]:
wiki_blob = TextBlob(wiki_text_clean)


Only look at first 5 sentences


In [38]:
my_sentences = wiki_blob.sentences[0:20]
my_sentences


[Sentence("A Munchkin is a native of the fictional Munchkin Country in the Oz books by American author L. Frank Baum."),
 Sentence("They first appear in the classic children novel The Wonderful Wizard of Oz 1900 where they welcome Dorothy Gale to their city in Oz."),
 Sentence("The Munchkins are described as being the same height as Dorothy and they wear only shades of blue clothing, as blue is the Munchkins favorite color."),
 Sentence("Blue is also the predominating color that officially represents the eastern quadrant in the Land of Oz."),
 Sentence("The Munchkins have appeared in various media, including the 1939 film The Wizard of Oz, as well as in various other films and comedy acts."),
 Sentence("Concept While Baum may have written about it, there are no surviving notes for the composition of The Wonderful Wizard of Oz."),
 Sentence("The lack of this information has resulted in speculation of the term origins he used in the book, which include the word Munchkin."),
 Sentence("Ba

In [39]:
len(wiki_blob.sentences)

109

Singularize and convert back to string


In [40]:
for i, sentence in enumerate(my_sentences):
  sing = [x.singularize() for x in sentence.words]
  my_sentences[i] = ' '.join(sing)
my_sentences


['A Munchkin is a native of the fictional Munchkin Country in the Oz book by American author L Frank Baum',
 'They first appear in the classic child novel The Wonderful Wizard of Oz 1900 where they welcome Dorothy Gale to their city in Oz',
 'The Munchkin are described a being the same height a Dorothy and they wear only shade of blue clothing a blue is the Munchkin favorite color',
 'Blue is also the predominating color that officially represent the eastern quadrant in the Land of Oz',
 'The Munchkin have appeared in variou medium including the 1939 film The Wizard of Oz a well a in variou other film and comedy act',
 'Concept While Baum may have written about it there are no surviving note for the composition of The Wonderful Wizard of Oz',
 'The lack of thi information ha resulted in speculation of the term origin he used in the book which include the word Munchkin',
 'Baum researcher Brian Attebery ha hypothesized that there might be a connection to the Münchner Kindl the emblem of

## TF-IDF without using BoW

Perform the TF-IDF Vectorization


In [41]:
tf_idf_matrix = TfidfVectorizer(stop_words = 'english')
tf_idf = tf_idf_matrix.fit_transform(my_sentences)


In [42]:
tf_idf.shape

(20, 166)

Print out results in a data frame


In [43]:
results_df = pd.DataFrame(
  data = tf_idf.toarray(),
  columns = tf_idf_matrix.get_feature_names_out()
)
results_df.transpose()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
13th,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.307943,0.0000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
1900,0.000000,0.273863,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0000,0.0,0.000000,0.282341,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
1939,0.000000,0.000000,0.0,0.0,0.256986,0.000000,0.000000,0.0,0.000000,0.0000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
act,0.000000,0.000000,0.0,0.0,0.256986,0.000000,0.000000,0.0,0.000000,0.0000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
american,0.361256,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wonderful,0.000000,0.247119,0.0,0.0,0.000000,0.302110,0.000000,0.0,0.000000,0.0000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.219328,0.0,0.0,0.000000
word,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.246257,0.0,0.000000,0.0000,0.0,0.275297,0.000000,0.0,0.0,0.258464,0.000000,0.0,0.0,0.000000
written,0.000000,0.000000,0.0,0.0,0.000000,0.380887,0.000000,0.0,0.000000,0.0000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
year,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.2664,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000


## K Nearest Neighbors

Fit nearest neighbors


In [44]:
nn = NearestNeighbors().fit(tf_idf)


Get nearest neighbors distances to first sentence


In [45]:
distances, indices = nn.kneighbors(
  X = tf_idf[0],
  n_neighbors = 3,
)


In [46]:
distances


array([[0.        , 1.14391441, 1.26882977]])

In [47]:
indices


array([[ 0, 16, 15]])

In [48]:
np.array(my_sentences)[indices]


array([['A Munchkin is a native of the fictional Munchkin Country in the Oz book by American author L Frank Baum',
        'Literature Oz Book by Frank Baum The Munchkin are first mentioned quote shown in an excerpt from chapter two of The Wonderful Wizard of Oz titled The Council with the Munchkin',
        'Like the other Oz term the word Munchkin end in a diminutive which in thi case refer to the size of the native']],
      dtype='<U175')