In [2]:
import pandas as pd
import sklearn
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import NearestNeighbors

In [3]:
people = pd.read_csv('people_wiki.csv')

In [272]:
people[people['name'] =='Joe Biden']

Unnamed: 0,URI,name,text
24478,<http://dbpedia.org/resource/Joe_Biden>,Joe Biden,joseph robinette joe biden jr dosf rbnt badn born november 20 1942 is the 47th and current vice president of the united states jointly e...


In [273]:
biden = people[people['name'] == 'Joe Biden']

Bag Of Words

In [274]:
biden_text = biden["text"].values[0]
biden_text

'joseph robinette joe biden jr dosf rbnt badn born november 20 1942 is the 47th and current vice president of the united states jointly elected with president barack obama he is a member of the democratic party and was a united states senator from delaware from january 3 1973 until his resignation on january 15 2009 following his election to the vice presidency in 2012 biden was elected to a second term alongside obamabiden was born in scranton pennsylvania and lived there for ten years before moving to delaware he became an attorney in 1969 and was elected to the new castle county council in 1970 biden was first elected to the senate in 1972 and became the sixthyoungest senator in us history he was reelected to the senate six times and was the fourth most senior senator at the time of his resignation biden was a longtime member and former chairman of the foreign relations committee his strong advocacy helped bring about us military assistance and intervention during the bosnian war he

Вхідний текст на матрицю

In [275]:
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(biden['text'])
features

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 217 stored elements and shape (1, 217)>

In [276]:
# Беремо токени
vocab = vectorizer.get_feature_names_out()

# Знаходимо кількість входжень кожного слова тексту в DataFrame
dist = np.sum(features.toarray(), axis=0)

biden_word_count_table = pd.DataFrame({'token':vocab, 'count':dist}).sort_values('count')

Відсортуємо та виведемо цю таблицю за зменшенням кількості слів

In [277]:
stopwords = ['the', 'and', 'in', 'in', 'of', 'to', 'he', 'was', 'us', 'his', 'is', 'that', 'from', 'with']
filtered_table = biden_word_count_table[~biden_word_count_table['token'].isin(stopwords)]

filtered_table.sort_values('count', ascending=False).head(10)

Unnamed: 0,token,count
40,biden,8
151,president,5
18,act,5
207,vice,5
83,elected,4
137,obama,4
58,committee,3
14,2012,3
87,first,3
77,democratic,3


TF_IDF 

In [278]:
# Векторизація Tf Idf
vect_tfidf = TfidfVectorizer()

dtm_tfidf = vect_tfidf.fit_transform(people['text'])

vocab_tfidf = vect_tfidf.get_feature_names_out()

In [279]:
biden_index = biden.index[0]

# Make a dictionary із отриманих слів
word_index_dict = {word: idx for idx, word in enumerate(vocab_tfidf)}

In [280]:
# Важливість для кожного слова
biden_word_scores = {}
for word in vocab_tfidf:
    word_index = word_index_dict[word]
    biden_word_scores[ word ] = dtm_tfidf[biden_index, word_index]

Створюємо таблицю для виводу кожного слова із його важливістю

In [281]:
biden_word_scores_df = pd.DataFrame(biden_word_scores.items(), columns=['word', 'tfidf'])

# tf-idf for each word
biden_word_scores_df_sorted = biden_word_scores_df.sort_values('tfidf', ascending=False)

In [282]:
biden_word_scores_df_sorted.head(10)

Unnamed: 0,word,tfidf
89034,biden,0.538171
488148,the,0.248287
358557,obama,0.174794
45073,act,0.167737
518424,vice,0.153152
58906,and,0.143045
251905,in,0.120486
417560,resolved,0.113684
442677,senator,0.110698
512199,us,0.110289


In [283]:
# Delete "the", "in", "of"
vect = TfidfVectorizer(stop_words = 'english')
dtm2 = vect.fit_transform(people['text'])
vocab2 = vect.get_feature_names_out()

In [284]:
biden_word_scores = {}
for word in vocab2:
    word_index = word_index_dict[word]
    biden_word_scores[word] = dtm_tfidf[biden_index, word_index]

In [285]:
biden_word_scores_tfidf = pd.DataFrame(biden_word_scores.items(), columns=['word', 'tfidf'])
biden_word_scores_tfidf_sorted = biden_word_scores_tfidf.sort_values('tfidf', ascending=False)

In [286]:
biden_word_scores_tfidf_sorted

Unnamed: 0,word,tfidf
88985,biden,0.538171
358378,obama,0.174794
45070,act,0.167737
518147,vice,0.153152
417356,resolved,0.113684
...,...,...
182745,equestrian,0.000000
182746,equestrianism,0.000000
182747,equestrians,0.000000
182748,equestriansshe,0.000000


Розрахунок міри подібності між людьми на базі TF_IDF за допомогою функції cosine_similarity

In [287]:
lieberman = people[people['name'] == 'Joe Lieberman']
lieberman_text = lieberman["text"].values[0]
lieberman_text

'joseph isadore joe lieberman born february 24 1942 is a former united states senator from connecticut a former member of the democratic party he was the partys nominee for vice president in the 2000 election currently an independent he remains closely associated with the partyborn in stamford connecticut lieberman is a graduate of yale college and yale law school he was elected as a reform democrat in 1970 to the connecticut senate where he served three terms as majority leader after an unsuccessful bid for the us house of representatives in 1980 he served as state attorney general from 1983 to 1989 lieberman defeated moderate republican lowell weicker in 1988 to win election to the senate and was reelected in 1994 and 2000 lieberman was the democratic nominee for vice president in the 2000 united states presidential election running with presidential nominee al gore becoming the first jewish candidate on a major american political party presidential ticketin the 2000 presidential ele

In [288]:
similarity_b_l = cosine_similarity(dtm2[biden.index], dtm2[lieberman.index])
# cosine_distance = (1-similarity_b_l)

distance_b_l = pairwise_distances(dtm2[biden.index], dtm2[lieberman.index], metric='cosine')

In [289]:
similarity_b_l

array([[0.14350046]])

In [290]:
distance_b_l

array([[0.85649954]])

Модель NNM ׂ(Nearest Neighbor Model, використовується косінусна відстань) 

In [322]:
knn_model = NearestNeighbors(n_neighbors=10, metric='cosine')

# Для пошуку найближчих сусідів будемо використовувати модель тексту TF_IDF, і знаходити сусідів для конкретного імені
knn_model.fit(dtm2)

In [335]:
distances, indices = knn_model.kneighbors(dtm2[biden.index], return_distance=True)

In [336]:
list_nn = list(zip(people.iloc[indices[0]]['name'], distances[0]))
list_nn

[('Joe Biden', np.float64(0.0)),
 ('Jill Biden', np.float64(0.5369083361227046)),
 ('Cynthia Hogan', np.float64(0.606528555050327)),
 ('Barack Obama', np.float64(0.6787810413312336)),
 ('Sheila Nix', np.float64(0.681517961895459)),
 ('Chris Coons', np.float64(0.7348553857706975)),
 ('Kenneth D. Thompson', np.float64(0.7836254797930652)),
 ('Michael Castle', np.float64(0.8070862442922863)),
 ('Jeff Sessions', np.float64(0.8226375257713407)),
 ('Neil MacBride', np.float64(0.8290726990260255))]

In [341]:
first_nn = indices[0][1]
second_nn = indices[0][2]

first_name = people.iloc[first_nn]['name']
second_name = people.iloc[second_nn]['name']

first_text = people.iloc[first_nn]['text']
second_text = people.iloc[second_nn]['text']

print(f"{first_name}\n{first_text}\n")
print(f"{second_name}\n{second_text}")


Jill Biden
jill tracy biden ne jacobs previously stevenson born june 3 1951 is an american educator and as the wife of the 47th and current us vice president joe biden is the second lady of the united statesshe was born in hammonton new jersey and grew up in willow grove pennsylvania she married joe biden in 1977 and became stepmother to his two young sons from his first marriage beau and hunter whose mother and baby sister died in a car accident joe and jill biden have a daughter ashley born in 1981jill biden has a bachelors degree from the university of delaware masters degrees from west chester university and villanova university and a doctoral degree from the university of delaware she taught english and reading in high schools for 13 years and also taught adolescents with emotional disabilities at a psychiatric hospital from 1993 to 2008 she was an english and writing instructor at delaware technical community college since 2009 she has been a professor of english at northern virg