In [None]:
import numpy as np
import pandas as pd

from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Show up to 100 columns when a data frame is displayed.
pd.set_option("display.max_columns", 100)


### Gary Example


In [None]:
# Toy data set: one row per person, with a numeric and a text encoding of gender
# and the band each person listens to (the label that is voted on later).
my_df = pd.DataFrame({
  "names": ['Amantha', 'Brendon', 'Nate', 'Sam', 'Betty', 'Christine', 'Gin', 'Ken', 'Susy'],
  "ages": [ 19, 23, 24, 30, 16, 18, 22, 18, 15 ],
  "genders_txt": "female male male male female female female male female".split(),
  "genders": [ 1, 0, 0, 0, 1, 1, 1, 0, 1 ],
  "music_band_txt": "Coldplay Coldplay LinkinPark LinkinPark Coldplay LinkinPark LinkinPark Coldplay Coldplay".split(),
})

my_df


In [None]:
# Preview the integer-typed columns; these are the features used to fit the model below.
my_df.select_dtypes(include="int")

Fit nearest neighbors


In [None]:
# Fit the neighbour search on the integer-typed columns of my_df only.
nn = NearestNeighbors().fit(X = my_df.select_dtypes(include="int"))


Get nearest neighbors distances


In [None]:
# Single-row query frame for Gary, built from one record.
gary = pd.DataFrame( [ {"ages": 23, "genders": 0} ] )
gary

In [None]:
# Query the fitted model for Gary's 3 nearest neighbours.
distances, indices = nn.kneighbors(gary, n_neighbors = 3)


In [None]:
# Square the distances returned for the first (and only) query row.
np.square(distances[0])


In [None]:
# Neighbour indices returned for the first (and only) query row.
indices[0]


Get people matching index


In [None]:
# Select the rows of my_df at the neighbour positions, keeping every column.
my_df.iloc[indices[0], :]


Vote

In [None]:
# Majority vote: the most frequent band among the neighbours' rows.
my_df["music_band_txt"].iloc[indices[0]].mode().iloc[0]

Repeat with K = all rows

In [None]:
# Ask for as many neighbours as there are rows in my_df.
d_i = nn.kneighbors(gary, n_neighbors = len(my_df))
# kneighbors returns a (distances, indices) pair with one row per query point.
# gary is a single query, so take row 0 of each array. Unpacking them separately
# (instead of stacking both into one array and reshaping to a hard-coded (2, 9))
# keeps the indices as integers and works for any number of rows.
distances, indices = (per_query[0] for per_query in d_i)
distances**2, indices


In [None]:
# Neighbour rows, each joined (on the row label) with its squared distance to Gary.
my_df.iloc[indices].join(
  pd.Series( distances**2, index = indices, name = "distances^2" )
)

Display vote for various values of $K \in \{1, 3, 5, 7, 9\}$

In [None]:
# For each odd K, vote among the first K entries of the neighbour ranking.
for k in (1, 3, 5, 7, 9):
  vote = my_df["music_band_txt"].iloc[indices[:k]].mode().iloc[0]
  print(f"K = {k} : {vote}")


# NLP

If our text data are unlabelled (as is often the case in NLP), we can use KNN to identify documents that are similar to a given document.  In this example, our documents will be sentences and the given document will be the first sentence.

In [None]:
%%capture
!python -m textblob.download_corpora


In [None]:
# The three example documents; the first one is used as the query later on.
sentences_orig = [
  "Jen is a good student.",
  "Jen is also a great guitarist.",
  "Good students can sometimes be good guitarists",
]
sentences_orig


# Data Cleaning
We want to singularize "guitarists" and "students" in the last sentence so that they match the singular forms used in the other sentences.

In [None]:
# Wrap the last sentence in a TextBlob to get word-level access.
sentence_last_tb = TextBlob(sentences_orig[-1])
# Reduce every word of that sentence to its singular form.
sentence_last_singular = [ word.singularize() for word in sentence_last_tb.words ]
# Stitch the words back into one space-separated string.
sentence_last_clean = ' '.join(sentence_last_singular)
sentence_last_clean


In [None]:
# Keep the first two sentences as they are and append the singularized last sentence.
sentences_clean = [ *sentences_orig[:2], sentence_last_clean ]
sentences_clean

## Bag of Words Using CountVectorizer

Perform the count transformation


In [None]:
# Count vectorizer configured with the built-in 'english' stop-word list.
vectorizer = CountVectorizer(stop_words='english')
# Fit on the cleaned sentences and produce the bag-of-words matrix in one step.
bow_matrix = vectorizer.fit_transform(sentences_clean)


In [None]:
# Inspect the container type and the shape of the bag-of-words matrix.
type(bow_matrix), bow_matrix.shape


In [None]:
# Convert the matrix to a plain array so the counts can be displayed.
bow_matrix.toarray()


## TF-IDF using BoW


Perform the TF-IDF transformation


In [None]:
# NOTE(review): despite its name, tf_idf_matrix holds the TfidfTransformer object;
# the transformed matrix itself is tf_idf_jen.
tf_idf_matrix = TfidfTransformer()
# Fit on the bag-of-words matrix and transform it in one step.
tf_idf_jen = tf_idf_matrix.fit_transform(bow_matrix)


In [None]:
# Inspect the container type and the shape of the TF-IDF result.
type(tf_idf_jen), tf_idf_jen.shape


In [None]:
# Convert the TF-IDF result to a plain array so the weights can be displayed.
tf_idf_jen.toarray()


Print out results in a dataframe


In [None]:
# One row per sentence, one column per vocabulary term from the count vectorizer.
tf_df = pd.DataFrame( tf_idf_jen.toarray(), columns = vectorizer.get_feature_names_out() )
tf_df


> Note: Converting a sparse matrix to a data frame is NOT something you will normally do, especially for large matrices.

## K Nearest Neighbors

Fit nearest neighbors


In [None]:
# Refit the neighbour search, this time on the TF-IDF rows of the example sentences.
nn = NearestNeighbors().fit(X = tf_idf_jen)


Create the reference matrix from the tf_idf matrix


In [None]:
# Row 0 of the TF-IDF matrix (the first sentence) is the reference document.
sent0 = tf_idf_jen[0]
# Check the shape of the reference.
sent0.shape

Or ...

Create the reference matrix from the data frame


In [None]:
# Select row 0 as a one-row frame and convert it to a 2-D NumPy array.
sent0 = tf_df.iloc[[0]].to_numpy()
sent0.shape

Get nearest neighbors distances


In [None]:
# Find the 2 nearest neighbours of the reference sentence.
distances, indices = nn.kneighbors(sent0, n_neighbors = 2)


In [None]:
# Distances returned by the kneighbors query for sent0.
distances


In [None]:
# Neighbour indices returned by the kneighbors query for sent0.
indices


Pull out the original sentences given the indices.

In [None]:
# Using list comprehension
# Iterate over the neighbour indices themselves so the sentences come back in the
# order kneighbors returned them (the same order the NumPy lookup gives). Filtering
# enumerate(sentences_orig) by membership would list them in document order instead.
[ sentences_orig[i] for i in indices[0] ]

In [None]:
# Via NumPy: np.take converts the list to an array and gathers the entries at `indices`.
np.take(sentences_orig, indices)


# Another Example - Using Wikipedia API

## Get text and clean

Install Wikipedia API

In [None]:
%%capture
!pip3 install wikipedia-api

In [None]:
import wikipediaapi

Pull out page from Wikipedia


In [None]:
topic = 'munchkin'
# Pass the constructor arguments by keyword so it is explicit which value is the
# user agent and which is the language edition.
# NOTE(review): 'foobar' is a placeholder user agent; consider replacing it with a
# descriptive one (project name plus contact) before heavy use.
wikip = wikipediaapi.Wikipedia(user_agent = 'foobar', language = 'en')
page_ex = wikip.page(topic)
# Fail early with a clear message instead of carrying empty text into the later cells.
if not page_ex.exists():
  raise ValueError(f"Wikipedia page not found for topic {topic!r}")
wiki_text = page_ex.text
wiki_text


Replace newline characters with spaces before doing any processing. Strip the possessive `'s`, any remaining apostrophes, parentheses, and double quotes.


In [None]:
# Apply the substitutions one after another, in this exact order: newlines become
# spaces, then possessive 's, leftover apostrophes, parentheses and double quotes
# are removed.
wiki_text_clean = wiki_text
for old, new in [ ("\n", " "), ("\'s", ''), ('\'', ''), ("(", ""), (")", ""), ('"', "") ]:
  wiki_text_clean = wiki_text_clean.replace(old, new)
wiki_text_clean


Convert to textblob

In [None]:
# Wrap the cleaned text in a TextBlob; its .sentences are used in the cells below.
wiki_blob = TextBlob(wiki_text_clean)


Only look at the first 5 sentences


In [None]:
# Keep only the first five sentences of the article.
my_sentences = wiki_blob.sentences[:5]
my_sentences


In [None]:
# Total number of sentences in the blob, for comparison with the slice taken above.
len(wiki_blob.sentences)

Singularize and convert back to string


In [None]:
# Turn each sentence into a plain string of singularized words.
# Entries that are already plain strings are left untouched, so re-running this
# cell is safe: it neither fails on `.words` nor singularizes the text a second time.
my_sentences = [
  sentence if isinstance(sentence, str)
  else ' '.join(word.singularize() for word in sentence.words)
  for sentence in my_sentences
]
my_sentences


## TF-IDF without using BoW

Perform the TF-IDF Vectorization


In [None]:
# NOTE(review): despite its name, tf_idf_matrix holds the TfidfVectorizer object
# (configured with the built-in 'english' stop-word list); the matrix itself is tf_idf.
tf_idf_matrix = TfidfVectorizer(stop_words = 'english')
# Fit on the singularized sentences and produce the TF-IDF matrix in one step.
tf_idf = tf_idf_matrix.fit_transform(my_sentences)


In [None]:
# Shape of the TF-IDF matrix built from my_sentences.
tf_idf.shape

Print out results in a data frame


In [None]:
# One row per sentence, one column per vocabulary term from the TF-IDF vectorizer.
results_df = pd.DataFrame( tf_idf.toarray(), columns = tf_idf_matrix.get_feature_names_out() )
# Display transposed so that the terms run down the rows.
results_df.T


## K Nearest Neighbors

Fit nearest neighbors


In [None]:
# Refit the neighbour search on the TF-IDF rows of the Wikipedia sentences.
nn = NearestNeighbors().fit(X = tf_idf)


Get nearest neighbors distances to first sentence


In [None]:
# Find the 3 nearest neighbours of the first sentence (row 0 of the TF-IDF matrix).
distances, indices = nn.kneighbors(tf_idf[0], n_neighbors = 3)


In [None]:
# Distances returned by the kneighbors query for the first sentence.
distances


In [None]:
# Neighbour indices returned by the kneighbors query for the first sentence.
indices


In [None]:
# Gather the sentences at the neighbour indices, in the order they were returned.
np.take(my_sentences, indices)
