In [None]:
import numpy as np
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer as BagOfWords
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
pd.options.display.max_columns = 100


# NLP

If our text data are unlabelled (as is often the case in NLP), we can use nearest-neighbor search (the unsupervised part of KNN) to find the documents most similar to a given document.  Let's test this by creating simple documents, some similar to one another and some different.




In [None]:
%%capture
# Download the corpora used by TextBlob; %%capture keeps the download log out of the notebook.
!python -m textblob.download_corpora


In [None]:
# Toy corpus: four documents that each repeat a single animal five times,
# plus one document that mentions every animal once. All five documents end
# with the shared token "zebra".
sentences = [
    'alpaca ' * 5 + "zebra",
    'bird ' * 5 + "zebra",
    'cat ' * 5 + "zebra",
    'dog ' * 5 + "zebra",
    "alpaca bird cat dog zebra",
]
sentences

['alpaca alpaca alpaca alpaca alpaca zebra',
 'bird bird bird bird bird zebra',
 'cat cat cat cat cat zebra',
 'dog dog dog dog dog zebra',
 'alpaca bird cat dog zebra']

# Data Cleaning


In [None]:
# TODO: create example documents that need cleaning before they are vectorized

## Bag of Words Using CountVectorizer

In [None]:
# Perform the count transformation
# BagOfWords is this notebook's import alias for sklearn's CountVectorizer;
# stop_words='english' asks it to ignore common English stop words.
BoW =  BagOfWords(stop_words='english')
# Learn the vocabulary from `sentences` and count each term per sentence.
vec = BoW.fit_transform(sentences)
# Convert to a dense array for display: one row per sentence, one column per term.
vec.toarray()


array([[5, 0, 0, 0, 1],
       [0, 5, 0, 0, 1],
       [0, 0, 5, 0, 1],
       [0, 0, 0, 5, 1],
       [1, 1, 1, 1, 1]])

In [None]:
# Vocabulary terms learned by the vectorizer; they are used as the column labels below.
BoW.get_feature_names_out()


array(['alpaca', 'bird', 'cat', 'dog', 'zebra'], dtype=object)

In [None]:
# Show the raw term counts as a labelled table:
# one row per sentence, one column per vocabulary term.
pd.DataFrame(
    data=vec.toarray(),
    columns=BoW.get_feature_names_out(),
)

Unnamed: 0,alpaca,bird,cat,dog,zebra
0,5,0,0,0,1
1,0,5,0,0,1
2,0,0,5,0,1
3,0,0,0,5,1
4,1,1,1,1,1


## TF-IDF

In [None]:
# Perform the TF-IDF transformation
# Note on names: `tf_idf_vec` is the TfidfTransformer object itself, while
# `tf_idf_jen` is the re-weighted matrix it produces from the raw counts in `vec`.
tf_idf_vec = TfidfTransformer()
tf_idf_jen = tf_idf_vec.fit_transform(vec)
# Dense view for display. In the output each row has unit Euclidean length
# (e.g. 0.993**2 + 0.117**2 is approximately 1).
tf_idf_jen.toarray()


array([[0.99309562, 0.        , 0.        , 0.        , 0.11730765],
       [0.        , 0.99309562, 0.        , 0.        , 0.11730765],
       [0.        , 0.        , 0.99309562, 0.        , 0.11730765],
       [0.        , 0.        , 0.        , 0.99309562, 0.11730765],
       [0.47952794, 0.47952794, 0.47952794, 0.47952794, 0.28321692]])

In [None]:
# Label the TF-IDF weights with their vocabulary terms so each column is readable.
tf_df = pd.DataFrame(
    data=tf_idf_jen.toarray(),
    columns=BoW.get_feature_names_out(),
)
tf_df


Unnamed: 0,alpaca,bird,cat,dog,zebra
0,0.993096,0.0,0.0,0.0,0.117308
1,0.0,0.993096,0.0,0.0,0.117308
2,0.0,0.0,0.993096,0.0,0.117308
3,0.0,0.0,0.0,0.993096,0.117308
4,0.479528,0.479528,0.479528,0.479528,0.283217


## K Nearest Neighbors

In [None]:
# Fit nearest neighbors
# Index the TF-IDF rows using NearestNeighbors' default settings (no arguments
# are passed); no labels are involved in the fit.
nn = NearestNeighbors().fit(tf_idf_jen)


In [None]:
# Query the fitted model with sentence 0 and request as many neighbors as there
# are sentences, so the whole corpus comes back ranked by distance from it.
# The query row is passed as a plain 2-D array (1 row, one column per term).
sent0 = tf_df.iloc[[0]].to_numpy()
distances, indices = nn.kneighbors(sent0, n_neighbors=len(tf_df))


In [None]:
# Distances from sentence 0 to each returned neighbor (a single row, because one query row was passed).
distances


array([[0.       , 0.9905144, 1.4044493, 1.4044493, 1.4044493]])

In [None]:
# Row positions of those neighbors, in the same order as `distances`; used below to look up the sentences.
indices


array([[0, 4, 1, 2, 3]])

In [None]:
# Report each neighbor of sentence 0 as "<distance>: <sentence>", in the order
# returned by kneighbors.
for dist, idx in zip(distances[0], indices[0]):
    print(f"{dist:.4f}: {sentences[idx]}")

0.0000: alpaca alpaca alpaca alpaca alpaca zebra
0.9905: alpaca bird cat dog zebra
1.4044: bird bird bird bird bird zebra
1.4044: cat cat cat cat cat zebra
1.4044: dog dog dog dog dog zebra
