https://en.wikipedia.org/wiki/Tf%E2%80%93idf

In [1]:
import math
import os
import fnmatch # https://docs.python.org/3/library/fnmatch.html

Source https://stevenloria.com/tf-idf/ <BR>

Caveat: this post now uses TextBlob for breaking up the text into words and getting the word counts.

In [2]:
def term_freq(term, list_of_words_in_document):
    """
    computes "term frequency" which is the number of times a word appears in a document, 
    normalized by dividing by the total number of words in document. 
    """
    return list_of_words_in_document.count(term)/(len(list_of_words_in_document)*1.0)

def number_of_documents_containing(term,all_documents):
    """
    Returns the number of documents containing word. 
    """
    countr=0
    for this_doc in all_documents:
        if (term in this_doc):
            countr+=1
    return countr

def inverse_doc_freq(term, all_documents):
    """
    computes "inverse document frequency" which measures 
    how common a word is among all documents in corpus. 
    The more common a word is, the lower its idf. 
    Take the ratio of the total number of documents to the number of documents containing word, 
    then take the log of that. Add 1 to the divisor to prevent division by zero.
    """
    return math.log(len(all_documents) / ( 1.0 + number_of_documents_containing(term, all_documents)))

def tfidf(term, list_of_words_in_document, all_documents):
    """
    computes the TF-IDF score. It's the product of tf and idf.
    """
    return term_freq(term, list_of_words_in_document) * inverse_doc_freq(term, all_documents)

The \*.dat files in the directory have only key words from each file

Convert the .dat contents to lists per document

In [3]:
all_documents={}
all_words_from_all_docs=[]
all_terms=[]
foldr='data/'
fname='*.dat'
for file_name in os.listdir(foldr):
    #print(file_name)
    if fnmatch.fnmatch(file_name, fname): # Unix shell-style wildcards
        print(file_name)
        with open(foldr+file_name,'r') as fil:
            words_in_file=fil.read().split("\n")
        # remove empty strings from list of words
        while "" in words_in_file:
            words_in_file.remove("")
        # save the words per file as value in a dictionary
        all_documents[file_name]=words_in_file
        print('has',len(words_in_file),'words\n')
        # also save all the words to a list
        for this_word in words_in_file:
            all_words_from_all_docs.append(this_word)
            

week1_50 years of data science v2.pdf.dat
has 90 words

week1_a Very Short History Of Data Science_1.docx.dat
has 96 words

week1_assignment 1 Summary.pdf.dat
has 110 words

week1_380.txt.dat
has 106 words

week1_A Very Short History Of Data Science.docx.dat
has 94 words

week1_50 Years Data Science Summary.docx.dat
has 133 words



In [4]:
len(all_documents)

6

In [5]:
all_words_from_all_docs = list(set(all_words_from_all_docs))
len(all_words_from_all_docs)

482

Sample sizes are small, so results are not reliable representations of the document

In [6]:
for doc_name, word_list_in_this_doc in all_documents.items():
    if (len(word_list_in_this_doc)==0):
        print("error: empty input file"+doc_name)
    else:
        dic_of_terms={}
        for this_term in word_list_in_this_doc:
            dic_of_terms[this_term] = tfidf(this_term, word_list_in_this_doc, all_words_from_all_docs)
        #print(dic_of_terms)
        print('\n'+doc_name)
        terms_in_doc_sorted_by_score=sorted(dic_of_terms.items(), key=lambda x: x[1], reverse=True)
        # first 40 words by importance
        for this_tup in terms_in_doc_sorted_by_score[0:40]:
            print(this_tup)
#        for indx in range(10):
#            print(terms_in_doc_sorted_by_score[indx][0] + ":"+str(terms_in_doc_sorted_by_score[indx][1]))


week1_50 years of data science v2.pdf.dat
('princeton', 0.060942188149896166)
('interesting', 0.060942188149896166)
('larger', 0.060942188149896166)
('summary', 0.060942188149896166)
('training', 0.060942188149896166)
('version', 0.060942188149896166)
('programs', 0.060942188149896166)
('successes', 0.060942188149896166)
('important', 0.060942188149896166)
('caused', 0.060942188149896166)
('knowledge', 0.060942188149896166)
('demand', 0.060942188149896166)
('interpreting', 0.060942188149896166)
('value', 0.060942188149896166)
('fit', 0.060942188149896166)
('analyzing', 0.060942188149896166)
('centennial', 0.060942188149896166)
('would', 0.060942188149896166)
('finally', 0.060942188149896166)
('necessary', 0.060942188149896166)
('volumes', 0.060942188149896166)
('described', 0.060942188149896166)
('collecting', 0.060942188149896166)
('skills', 0.060942188149896166)
('sprung', 0.060942188149896166)
('new', 0.060942188149896166)
('across', 0.060942188149896166)
('networking', 0.060942188

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity

# https://www.kaggle.com/sameersmahajan/people-wikipedia-data

In [8]:
# text.ENGLISH_STOP_WORDS

In [9]:
df = pd.read_csv('./data/people_wiki.csv')

In [10]:
df.shape

(42786, 3)

In [11]:
df.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [12]:
df['text'][0]

'digby morrell born 10 october 1979 is a former australian rules footballer who played with the kangaroos and carlton in the australian football league aflfrom western australia morrell played his early senior football for west perth his 44game senior career for the falcons spanned 19982000 and he was the clubs leading goalkicker in 2000 at the age of 21 morrell was recruited to the australian football league by the kangaroos football club with its third round selection in the 2001 afl rookie draft as a forward he twice kicked five goals during his time with the kangaroos the first was in a losing cause against sydney in 2002 and the other the following season in a drawn game against brisbaneafter the 2003 season morrell was traded along with david teague to the carlton football club in exchange for corey mckernan he played 32 games for the blues before being delisted at the end of 2005 he continued to play victorian football league vfl football with the northern bullants carltons vfla

In [13]:
vectorizer = TfidfVectorizer(
    analyzer='word',
    lowercase=True,
    stop_words=text.ENGLISH_STOP_WORDS,
    max_features=100000
)
X_matrix = vectorizer.fit_transform(df['text'])

In [14]:
X_matrix[0]

<1x100000 sparse matrix of type '<class 'numpy.float64'>'
	with 103 stored elements in Compressed Sparse Row format>

In [15]:
len(vectorizer.get_feature_names())

100000

In [16]:
vectorizer.get_feature_names()[::200]

['00',
 '1129',
 '151',
 '1894',
 '19581964',
 '1969his',
 '1977',
 '1983during',
 '1990his',
 '1995s',
 '200002',
 '2004since',
 '2010',
 '2030',
 '2530',
 '332nd',
 '43',
 '532',
 '665',
 '831',
 'aa',
 'abinanti',
 'accidental',
 'actively',
 'adjudication',
 'aerodynamicist',
 'agein',
 'aidsactivist',
 'akers',
 'alberto',
 'alexie',
 'allawis',
 'almaden',
 'altos',
 'ameche',
 'amplio',
 'andes',
 'angwin',
 'anse',
 'antonia',
 'appearing',
 'aranjuez',
 'arguing',
 'arranging',
 'asaka',
 'asmp',
 'assyrians',
 'atlantis',
 'audacious',
 'authora',
 'avis',
 'ayodhya',
 'bacharach',
 'bagher',
 'balearic',
 'bancarella',
 'baranovichi',
 'barons',
 'basically',
 'bavaria',
 'beaudoin',
 'beijing',
 'bemidji',
 'bergamini',
 'bessler',
 'bhargava',
 'bigname',
 'biographies',
 'bitchin',
 'blanchet',
 'blondinbella',
 'boardon',
 'bojan',
 'bonnier',
 'borne',
 'bousman',
 'branca',
 'breisgau',
 'bristol',
 'broten',
 'bu',
 'bullion',
 'burtch',
 'bziers',
 'calderons',
 'cam

In [17]:
idf_values = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
idf_values

{'00': 7.772169299003679,
 '000': 7.792788586206415,
 '0001': 9.872230127886251,
 '001': 9.36140450412026,
 '002': 9.58454805543447,
 '003': 9.179082947326306,
 '004': 9.872230127886251,
 '005': 9.872230127886251,
 '006': 9.266094324315935,
 '007': 8.668257323560315,
 '008': 9.872230127886251,
 '009': 10.054551684680206,
 '01': 7.38732347809825,
 '010': 9.36140450412026,
 '011': 9.36140450412026,
 '012': 9.024932267499047,
 '013': 10.054551684680206,
 '014': 9.718079448058992,
 '015': 9.36140450412026,
 '016': 9.718079448058992,
 '017': 10.054551684680206,
 '018': 10.054551684680206,
 '02': 7.520854870722773,
 '026': 10.054551684680206,
 '03': 7.772169299003679,
 '030': 9.718079448058992,
 '036': 10.054551684680206,
 '04': 7.857327107343986,
 '040': 9.872230127886251,
 '05': 7.9263199788309375,
 '050': 9.872230127886251,
 '06': 8.05307168447008,
 '07': 8.23000239262916,
 '0708': 10.277695235994415,
 '071': 9.872230127886251,
 '075': 9.872230127886251,
 '077': 10.054551684680206,
 '08':

In [18]:
vectorizer.vocabulary_

{'digby': 26542,
 'morrell': 60711,
 'born': 14798,
 '10': 41,
 'october': 65047,
 '1979': 1259,
 'australian': 9766,
 'rules': 77734,
 'footballer': 34842,
 'played': 70002,
 'kangaroos': 48165,
 'carlton': 17547,
 'football': 34839,
 'league': 52117,
 'western': 96873,
 'australia': 9752,
 'early': 29123,
 'senior': 81002,
 'west': 96844,
 'perth': 68999,
 'career': 17444,
 'falcons': 32922,
 'spanned': 84738,
 '19982000': 1879,
 'clubs': 20152,
 'leading': 52104,
 'goalkicker': 38535,
 '2000': 1994,
 'age': 5194,
 '21': 2649,
 'recruited': 74535,
 'club': 20135,
 'round': 77378,
 'selection': 80743,
 '2001': 2039,
 'afl': 5092,
 'rookie': 77140,
 'draft': 28159,
 'forward': 35134,
 'twice': 92368,
 'kicked': 49302,
 'goals': 38541,
 'time': 90041,
 'losing': 54364,
 'cause': 18032,
 'sydney': 87563,
 '2002': 2085,
 'following': 34776,
 'season': 80344,
 'drawn': 28264,
 'game': 36688,
 '2003': 2126,
 'traded': 91153,
 'david': 24843,
 'teague': 88481,
 'exchange': 32307,
 'corey': 2

## Cosine Similarity

In [19]:
cosine_similarity([[0,1,2,3]], [[2,0,0,0], [0,1,1,0], [0,1,1,1]])

array([[0.        , 0.56694671, 0.9258201 ]])

In [20]:
df[['Biden' in x for x in df['name']]]

Unnamed: 0,URI,name,text
24478,<http://dbpedia.org/resource/Joe_Biden>,Joe Biden,joseph robinette joe biden jr dosf rbnt badn b...
29590,<http://dbpedia.org/resource/Jill_Biden>,Jill Biden,jill tracy biden ne jacobs previously stevenso...


In [21]:
index = 24478
print(df['name'][index])
main = X_matrix[index]

df['score'] = cosine_similarity(main, X_matrix)[0]

Joe Biden


In [22]:
df.sort_values(by='score',ascending=False).head()

Unnamed: 0,URI,name,text,score
24478,<http://dbpedia.org/resource/Joe_Biden>,Joe Biden,joseph robinette joe biden jr dosf rbnt badn b...,1.0
29590,<http://dbpedia.org/resource/Jill_Biden>,Jill Biden,jill tracy biden ne jacobs previously stevenso...,0.472481
16880,<http://dbpedia.org/resource/Cynthia_Hogan>,Cynthia Hogan,cynthia c hogan born cincinnati ohio about 195...,0.407093
35811,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,0.339517
23309,<http://dbpedia.org/resource/Chris_Coons>,Chris Coons,christopher andrew chris coons born september ...,0.273705


In [23]:
df.sort_values(by='score',ascending=False).head()['text'].values

array(['joseph robinette joe biden jr dosf rbnt badn born november 20 1942 is the 47th and current vice president of the united states jointly elected with president barack obama he is a member of the democratic party and was a united states senator from delaware from january 3 1973 until his resignation on january 15 2009 following his election to the vice presidency in 2012 biden was elected to a second term alongside obamabiden was born in scranton pennsylvania and lived there for ten years before moving to delaware he became an attorney in 1969 and was elected to the new castle county council in 1970 biden was first elected to the senate in 1972 and became the sixthyoungest senator in us history he was reelected to the senate six times and was the fourth most senior senator at the time of his resignation biden was a longtime member and former chairman of the foreign relations committee his strong advocacy helped bring about us military assistance and intervention during the bosnian

In [24]:
df.head()

Unnamed: 0,URI,name,text,score
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,0.010675
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,0.006669
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,0.009943
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,0.013306
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,0.001138


In [25]:
df = pd.read_csv('./data/people_wiki.csv')

In [26]:
df.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [27]:
index = df[df['name'] == 'David Beckham'].index[0]

X = X_matrix[ index ]

values = cosine_similarity( X, X_matrix)[0]
ps_values = pd.Series(data=values, index=df.index)


In [28]:
ps_values.sort_values(ascending=False, inplace=True)
ps_values

23386    1.000000
24913    0.278273
26756    0.249382
38666    0.238645
24258    0.234768
           ...   
20586    0.000000
9550     0.000000
28649    0.000000
33741    0.000000
4725     0.000000
Length: 42786, dtype: float64

In [29]:
df.loc[ps_values.index]

Unnamed: 0,URI,name,text
23386,<http://dbpedia.org/resource/David_Beckham>,David Beckham,david robert joseph beckham obe bkm born 2 may...
24913,<http://dbpedia.org/resource/Bobby_Charlton>,Bobby Charlton,sir robert bobby charlton cbe born 11 october ...
26756,<http://dbpedia.org/resource/Wayne_Rooney>,Wayne Rooney,wayne mark rooney runi born 24 october 1985 is...
38666,<http://dbpedia.org/resource/Shay_Given>,Shay Given,shay john james given born 20 april 1976 is an...
24258,<http://dbpedia.org/resource/Sol_Campbell>,Sol Campbell,sulzeer jeremiah sol campbell born 18 septembe...
...,...,...,...
20586,<http://dbpedia.org/resource/Mohammed_Faizal_P...,Mohammed Faizal P. P.,pp mohammed faizal is an indian politician bel...
9550,<http://dbpedia.org/resource/Peter_Hobson>,Peter Hobson,r p hobson or peter hobson is a professor of d...
28649,<http://dbpedia.org/resource/Jesper_Mogensen>,Jesper Mogensen,jesper mogensen is a danish neuroscientist who...
33741,<http://dbpedia.org/resource/Yossi_Matias>,Yossi Matias,yossi matias is an israeli computer scientist ...


In [30]:
def find_similar_people(name):
    indexes = df[df['name'] == name].index
    if len(indexes) == 0:
        print('no one found')
        return
    
    index = indexes[0]
    X = X_matrix[ index ]

    values = cosine_similarity( X, X_matrix)[0]
    ps_values = pd.Series(data=values, index=df.index)
    ps_values.sort_values(ascending=False, inplace=True)

    return df.loc[ps_values.index]

In [31]:
find_similar_people('Tim Burton')

Unnamed: 0,URI,name,text
2001,<http://dbpedia.org/resource/Tim_Burton>,Tim Burton,timothy walter tim burton brtn born august 25 ...
18180,<http://dbpedia.org/resource/Helena_Bonham_Car...,Helena Bonham Carter,helena bonham carter cbe born 26 may 1966 is a...
13178,<http://dbpedia.org/resource/Kristen_Stewart>,Kristen Stewart,kristen jaymes stewart born april 9 1990 is an...
34701,<http://dbpedia.org/resource/Amanda_Seyfried>,Amanda Seyfried,amanda michelle seyfried safrd syfred born dec...
2163,<http://dbpedia.org/resource/M._K._Hobson>,M. K. Hobson,m k hobson born january 21 1969 is a speculati...
...,...,...,...
34711,<http://dbpedia.org/resource/Frank_Walker_(Aus...,Frank Walker (Australian rules footballer),frank dickie walker was an australian rules fo...
38984,<http://dbpedia.org/resource/Bruce_Lahn>,Bruce Lahn,bruce lahn is the william b graham professor o...
34747,<http://dbpedia.org/resource/Anna_Vainikka>,Anna Vainikka,anne vainikka is a linguist specialising in th...
16485,<http://dbpedia.org/resource/Tamar_Ross>,Tamar Ross,tamar ross is a professor of jewish philosophy...


In [32]:
df[ df['name'].str.contains('Yang')]

Unnamed: 0,URI,name,text
4805,<http://dbpedia.org/resource/Yang_Chunlin>,Yang Chunlin,yang chunlin chinese pinyin yng chnln born 195...
9726,<http://dbpedia.org/resource/Ng_Chee_Yang>,Ng Chee Yang,cheeyang ng simplified chinese traditional chi...
10100,<http://dbpedia.org/resource/Shu_Yang>,Shu Yang,shu yang born 1969 china a chinese painter per...
10635,<http://dbpedia.org/resource/Yang_Ji-won_(acad...,Yang Ji-won (academic),yang jiwon born 1949 is a professor in departm...
13738,<http://dbpedia.org/resource/Yang_Yansheng>,Yang Yansheng,yang yansheng simplified chinese born 5 januar...
20452,<http://dbpedia.org/resource/Serena_Yang>,Serena Yang,serena yang is an american television journali...
21131,<http://dbpedia.org/resource/Yang_Yanyin>,Yang Yanyin,yang yanyin chinese december 1947 is a politic...
28717,<http://dbpedia.org/resource/Shang_Yang_(artist)>,Shang Yang (artist),shang yang born 1942 kaixian sichuan province ...
30578,<http://dbpedia.org/resource/Yilun_Yang>,Yilun Yang,yilun yang also spelled yilun yang is a 7 dan ...
30728,<http://dbpedia.org/resource/Yangjin_Pak>,Yangjin Pak,yangjin pak or pak yangjin is an archaeologist...
