In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
# Use the fully-qualified option name: on newer pandas the bare "max_rows"
# pattern also matches "styler.render.max_rows" and raises OptionError.
pd.set_option("display.max_rows", 600)
from pathlib import Path  
import glob

In [2]:
# Folder (relative to the notebook's working directory) that is searched for .txt files.
directory_path = "inaugural"

In [3]:
# glob returns matches in arbitrary, OS-dependent order; sort them so the
# document order (and every row index derived from it) is the same on every run.
text_files = sorted(glob.glob(f"{directory_path}/*.txt"))

In [4]:
# Display the matched file paths to confirm the glob pattern found the corpus.
text_files

['inaugural\\1789-Washington.txt',
 'inaugural\\1793-Washington.txt',
 'inaugural\\1797-Adams.txt',
 'inaugural\\1801-Jefferson.txt',
 'inaugural\\1805-Jefferson.txt',
 'inaugural\\1809-Madison.txt',
 'inaugural\\1813-Madison.txt',
 'inaugural\\1817-Monroe.txt',
 'inaugural\\1821-Monroe.txt',
 'inaugural\\1825-Adams.txt',
 'inaugural\\1829-Jackson.txt',
 'inaugural\\1833-Jackson.txt',
 'inaugural\\1837-VanBuren.txt',
 'inaugural\\1841-Harrison.txt',
 'inaugural\\1845-Polk.txt',
 'inaugural\\1849-Taylor.txt',
 'inaugural\\1853-Pierce.txt',
 'inaugural\\1857-Buchanan.txt',
 'inaugural\\1861-Lincoln.txt',
 'inaugural\\1865-Lincoln.txt',
 'inaugural\\1869-Grant.txt',
 'inaugural\\1873-Grant.txt',
 'inaugural\\1877-Hayes.txt',
 'inaugural\\1881-Garfield.txt',
 'inaugural\\1885-Cleveland.txt',
 'inaugural\\1889-Harrison.txt',
 'inaugural\\1893-Cleveland.txt',
 'inaugural\\1897-McKinley.txt',
 'inaugural\\1901-McKinley.txt',
 'inaugural\\1905-Roosevelt.txt',
 'inaugural\\1909-Taft.txt',
 'ina

In [5]:
# Derive a short label for each file: the file name without directory or extension.
text_titles = list(Path(file_path).stem for file_path in text_files)

In [6]:
# Display the derived titles; these become the row labels of the tf-idf table.
text_titles

['1789-Washington',
 '1793-Washington',
 '1797-Adams',
 '1801-Jefferson',
 '1805-Jefferson',
 '1809-Madison',
 '1813-Madison',
 '1817-Monroe',
 '1821-Monroe',
 '1825-Adams',
 '1829-Jackson',
 '1833-Jackson',
 '1837-VanBuren',
 '1841-Harrison',
 '1845-Polk',
 '1849-Taylor',
 '1853-Pierce',
 '1857-Buchanan',
 '1861-Lincoln',
 '1865-Lincoln',
 '1869-Grant',
 '1873-Grant',
 '1877-Hayes',
 '1881-Garfield',
 '1885-Cleveland',
 '1889-Harrison',
 '1893-Cleveland',
 '1897-McKinley',
 '1901-McKinley',
 '1905-Roosevelt',
 '1909-Taft',
 '1913-Wilson',
 '1917-Wilson',
 '1921-Harding',
 '1925-Coolidge',
 '1929-Hoover',
 '1933-Roosevelt',
 '1937-Roosevelt',
 '1941-Roosevelt',
 '1945-Roosevelt',
 '1949-Truman',
 '1953-Eisenhower',
 '1957-Eisenhower',
 '1961-Kennedy',
 '1965-Johnson',
 '1969-Nixon',
 '1973-Nixon',
 '1977-Carter',
 '1981-Reagan',
 '1985-Reagan',
 '1989-Bush',
 '1993-Clinton',
 '1997-Clinton',
 '2001-Bush',
 '2005-Bush',
 '2009-Obama']

In [7]:
# Configure the vectorizer to read each document from a file path,
# decode it as ISO-8859-1, and ignore the built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(
    input='filename',
    encoding='iso-8859-1',
    stop_words='english',
)

In [8]:
# Learn the vocabulary and idf weights from the files, then compute the
# document-term tf-idf matrix (one row per entry of text_files).
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)

In [9]:
# Build a documents x terms table of tf-idf scores.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the vocabulary in column order.
tfidf_df = pd.DataFrame(
    tfidf_vector.toarray(),
    index=text_titles,
    columns=tfidf_vectorizer.get_feature_names_out(),
)



In [10]:
# Add a summary row holding, for every term, the number of documents in which
# it has a non-zero tf-idf score. Any summary row left by a previous run of
# this cell is excluded first; otherwise re-running the cell would count the
# summary row itself as a document and inflate every count by one.
tfidf_df.loc['00_Document Frequency'] = (
    tfidf_df.drop('00_Document Frequency', errors='ignore') > 0
).sum()

In [11]:
# Pick a handful of terms to compare across documents.
selected_terms = [
    'government', 'borders', 'people', 'war', 'honor',
    'foreign', 'men', 'women', 'children',
]
tfidf_slice = tfidf_df.loc[:, selected_terms]

# Sort by row label and show the scores to two decimals.
tfidf_slice.sort_index().round(2)

Unnamed: 0,government,borders,people,war,honor,foreign,men,women,children
00_Document Frequency,51.0,4.0,54.0,44.0,32.0,31.0,45.0,13.0,20.0
1789-Washington,0.12,0.0,0.05,0.0,0.0,0.0,0.02,0.0,0.0
1793-Washington,0.06,0.0,0.06,0.0,0.09,0.0,0.0,0.0,0.0
1797-Adams,0.16,0.0,0.19,0.01,0.1,0.12,0.05,0.0,0.0
1801-Jefferson,0.16,0.0,0.02,0.01,0.04,0.0,0.04,0.0,0.0
1805-Jefferson,0.03,0.0,0.0,0.04,0.0,0.06,0.01,0.0,0.02
1809-Madison,0.0,0.0,0.02,0.02,0.05,0.05,0.0,0.0,0.0
1813-Madison,0.04,0.0,0.04,0.25,0.02,0.02,0.0,0.0,0.0
1817-Monroe,0.18,0.0,0.11,0.09,0.01,0.1,0.04,0.0,0.0
1821-Monroe,0.08,0.0,0.07,0.11,0.02,0.04,0.01,0.0,0.01


In [12]:
# Remove the summary row again so only real documents remain;
# errors='ignore' makes this a no-op when the row is already gone.
tfidf_df = tfidf_df.drop(index='00_Document Frequency', errors='ignore')

In [13]:
# Preview the long format: one row per (document, term) pair with its score.
tfidf_df.stack().reset_index()

Unnamed: 0,level_0,level_1,0
0,1789-Washington,000,0.0
1,1789-Washington,100,0.0
2,1789-Washington,120,0.0
3,1789-Washington,125,0.0
4,1789-Washington,13,0.0
...,...,...,...
489827,2009-Obama,youthful,0.0
489828,2009-Obama,zeal,0.0
489829,2009-Obama,zealous,0.0
489830,2009-Obama,zealously,0.0


In [14]:
# Replace the wide table with its long form (one row per document/term pair).
# NOTE: this reassigns tfidf_df from itself, so running the cell a second time
# would stack the already-long frame again; re-run from the cell that builds
# tfidf_df if this needs to be repeated.
tfidf_df = tfidf_df.stack().reset_index()

In [15]:
# Give the auto-generated columns from stack().reset_index() readable names.
# The stacked frame only has 'level_0', 'level_1' and 0; the former 'level_2'
# mapping never matched, and would have produced a duplicate 'term' column if
# it ever had.
tfidf_df = tfidf_df.rename(
    columns={'level_0': 'document', 'level_1': 'term', 0: 'tfidf'}
)

In [16]:
# Show the ten highest-scoring terms of every document:
# order by document, then by descending score, and keep the first ten rows per group.
(
    tfidf_df
    .sort_values(by=['document', 'tfidf'], ascending=[True, False])
    .groupby(['document'])
    .head(10)
)

Unnamed: 0,document,term,tfidf
3587,1789-Washington,government,0.115524
3974,1789-Washington,immutable,0.10434
4041,1789-Washington,impressions,0.10434
6148,1789-Washington,providential,0.10434
6162,1789-Washington,public,0.103591
5454,1789-Washington,ought,0.103271
6200,1789-Washington,qualifications,0.09673
5930,1789-Washington,present,0.09652
5632,1789-Washington,peculiarly,0.090827
590,1789-Washington,article,0.086004


In [17]:
# Keep the ten highest-scoring terms per document for the later lookups and chart.
ranked_scores = tfidf_df.sort_values(by=['document', 'tfidf'], ascending=[True, False])
top_tfidf = ranked_scores.groupby(['document']).head(10)

In [18]:
# Which documents have a top-ten term containing "women"?
mentions_women = top_tfidf['term'].str.contains('women')
top_tfidf[mentions_women]

Unnamed: 0,document,term,tfidf
279812,1913-Wilson,women,0.10412
489740,2009-Obama,women,0.088218


In [19]:
# Which documents have a top-ten term containing "honor"?
# The search must run on the 'term' column; the 'document' column only holds
# titles such as '1789-Washington', so filtering on it always returned nothing.
top_tfidf[top_tfidf['term'].str.contains('honor')]

Unnamed: 0,document,term,tfidf


In [20]:
!pip install altair

Defaulting to user installation because normal site-packages is not writeable


In [21]:
import altair as alt
import numpy as np

# Terms in this list will get a red dot in the visualization
term_list = ['war', 'peace']

# Add a tiny jitter to the scores so tied terms receive distinct ranks.
# A seeded generator keeps the jitter, and therefore the tie-breaking order
# shown in the chart, identical on every run.
jitter_rng = np.random.default_rng(0)
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = (
    top_tfidf_plusRand['tfidf'] + jitter_rng.random(top_tfidf.shape[0]) * 0.0001
)

# base for all visualizations, with rank calculation:
# each term is ranked within its document by descending tf-idf
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'document:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["document"],
)

# heatmap specification: cell color encodes the tf-idf score
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# red circle over terms in above list; every other term gets a fully
# transparent circle ('#FFFFFF00') so nothing is drawn for it
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')
    )
)

# text labels, white for darker heatmap colors (scores of 0.23 and above)
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + circle + text).properties(width = 600)

# Answer the following questions

1. What is the difference between a tf-idf score and raw word frequency?

Raw word frequency is simply the number of times a word appears in a specific address. A tf-idf score weights that frequency by how rare the word is across all addresses, so words that are frequent in one speech but uncommon in the others score highest, highlighting what is distinctive about each individual speech.

2. Based on the dataframe above, what is one potential problem or limitation that you notice with tf-idf scores?

Tf-idf relies only on the statistical distribution of words across the documents, not on their context or semantic meaning. As a result, a high score may not reflect what a word actually means or how it is being used in a given speech.

3. What’s another collection of texts that you think might be interesting to analyze with tf-idf scores? Why?

News articles, to help detect hate speech by ranking hateful terms according to how distinctively they are used in each article. On that basis, hate speech that may lead to crime could be identified early, and future hate-crime incidents might be prevented.