In [33]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [54]:
import os
from nltk.tokenize import PunktSentenceTokenizer
from collections import Counter
from nltk.corpus import stopwords
# English stop words from NLTK; most_frequent_words() skips every word in this list.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded beforehand - confirm.
stop_words = stopwords.words('english')
def read_file(file_name):
  """Return the full text of a file.

  The file is decoded with the ``unicode_escape`` codec, so backslash escape
  sequences stored in the file are converted to the characters they denote.

  Args:
    file_name: Path of the file to read.

  Returns:
    The decoded contents of the file as one string.

  Raises:
    OSError: If the file cannot be opened for reading.
  """
  # Open read-only: nothing is written here, and asking for write access
  # ('r+') would fail needlessly on read-only files.
  with open(file_name, 'r', encoding='unicode_escape') as file:
    return file.read()

def process_speeches(speeches):
  """Split raw speeches into lower-cased word tokens, sentence by sentence.

  Each speech is cut into sentences with a PunktSentenceTokenizer. Within a
  sentence, commas and colons are removed, hyphens become spaces (so
  hyphenated words are split), and the text is split on whitespace. Every
  token is lower-cased and has '.', '?' and '!' stripped from both ends.

  Args:
    speeches: Iterable of speech texts (strings).

  Returns:
    A list with one entry per speech. Each entry is a list of sentences, and
    each sentence is a list of word strings.
  """
  # The tokenizer is built once and reused for every speech.
  sentence_tokenizer = PunktSentenceTokenizer()
  word_tokenized_speeches = []
  for speech in speeches:
    word_tokenized_sentences = []
    for sentence in sentence_tokenizer.tokenize(speech):
      cleaned = sentence.replace(",", "").replace("-", " ").replace(":", "")
      # One strip with all three characters removes any mix of them
      # ("what?!" -> "what"); stripping them one after another is
      # order-dependent and would leave "what?" behind.
      words = [word.lower().strip('.?!') for word in cleaned.split()]
      # Tokens that were pure punctuation (e.g. "...") are now empty; drop
      # them so they do not show up as an empty-string "word" downstream.
      word_tokenized_sentences.append([word for word in words if word])
    word_tokenized_speeches.append(word_tokenized_sentences)
  return word_tokenized_speeches

def merge_speeches(speeches):
  """Flatten per-speech sentence lists into a single list of sentences.

  Args:
    speeches: Iterable of speeches, each an iterable of sentences.

  Returns:
    A new list holding every sentence of every speech, in the original order.
  """
  return [sentence for sentences in speeches for sentence in sentences]

def get_president_sentences(president):
  """Collect the tokenized sentences of every speech file matching a name.

  A file matches when ``president`` occurs in its name, compared
  case-insensitively. Candidates are the entries of the current working
  directory; matching files are read in sorted name order.

  Args:
    president: Name (or name fragment) to look for in the file names.

  Returns:
    One flat list of tokenized sentences from all matching files.
  """
  matching_files = sorted(
    name for name in os.listdir() if president.lower() in name.lower()
  )
  raw_speeches = []
  for name in matching_files:
    raw_speeches.append(read_file(name))
  return merge_speeches(process_speeches(raw_speeches))

def get_presidents_sentences(presidents):
  """Collect the tokenized sentences of several presidents into one list.

  Args:
    presidents: Iterable of names (or name fragments); each is looked up
      with get_president_sentences().

  Returns:
    One flat list of tokenized sentences, grouped in the order the
    presidents were given.
  """
  all_sentences = []
  for president in presidents:
    # Delegate the per-president lookup so the file matching and
    # preprocessing logic lives in one place.
    all_sentences.extend(get_president_sentences(president))
  return all_sentences

def most_frequent_words(list_of_sentences):
  """Count the non-stop-word tokens of the given sentences.

  Args:
    list_of_sentences: Iterable of sentences, each an iterable of word
      strings.

  Returns:
    A list of ``(word, count)`` tuples sorted from most to least frequent,
    excluding every word found in the module-level ``stop_words``.
  """
  # Set membership is a constant-time lookup; testing every word against the
  # stop-word list directly would scan the whole list for each word.
  stop_word_set = set(stop_words)
  word_counts = Counter(
    word
    for sentence in list_of_sentences
    for word in sentence
    if word not in stop_word_set
  )
  return word_counts.most_common()

In [60]:
import os
import gensim
import spacy
from gensim import models
from gensim.models import Word2Vec

# read_file, process_speeches, merge_speeches, get_president_sentences,
# get_presidents_sentences and most_frequent_words are defined in an earlier
# cell of this notebook (they were formerly imported from president_helper).


def train_embeddings(sentences, vector_size=96, window=5, min_count=1, workers=2, sg=1):
  """Train a gensim Word2Vec model on tokenized sentences.

  All three models in this analysis share the same hyperparameters, which
  are the defaults of this helper.

  Args:
    sentences: List of sentences, each a list of word strings.
    vector_size: Dimensionality of the word vectors.
    window: Context window size passed to Word2Vec.
    min_count: Minimum word frequency; 1 keeps every word that occurs.
    workers: Number of training threads.
    sg: 1 selects skip-gram training.

  Returns:
    The trained ``gensim.models.Word2Vec`` model.
  """
  # NOTE: per the gensim documentation, training with more than one worker
  # thread is not deterministic, so similarity lists vary between runs.
  return gensim.models.Word2Vec(
    sentences,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    workers=workers,
    sg=sg,
  )


# --- All presidents -------------------------------------------------------

# Every .txt file in the working directory is treated as one speech.
files = sorted([file for file in os.listdir() if file.endswith('.txt')])
speeches = [read_file(item) for item in files]

# Tokenize each speech, then pool the sentences of all speeches.
processed_speeches = process_speeches(speeches)
all_sentences = merge_speeches(processed_speeches)

# Most frequently used words across all speeches (stop words excluded).
most_freq_words = most_frequent_words(all_sentences)

# Word embeddings trained on all speeches.
all_prez_embeddings = train_embeddings(all_sentences)

# Words most similar to "freedom" across all presidents.
similar_to_freedom = all_prez_embeddings.wv.most_similar("freedom", topn=20)
print(similar_to_freedom)

# Words most similar to "human" across all presidents.
similar_to_human = all_prez_embeddings.wv.most_similar("human", topn=20)
# Misspelled former name of the variable above, kept in case another cell
# still refers to it.
imilar_to_human = similar_to_human

# --- Roosevelt --------------------------------------------------------------

# NOTE(review): this matches every file whose name contains "roosevelt"; if
# the corpus holds speeches by more than one Roosevelt they are pooled here -
# confirm that is intended.
roosevelt_sentences = get_president_sentences("Roosevelt")

# Most frequently used words in the Roosevelt speeches.
roosevelt_most_freq_words = most_frequent_words(roosevelt_sentences)

# Word embeddings trained on the Roosevelt speeches only.
roosevelt_embeddings = train_embeddings(roosevelt_sentences)

# Words most similar to "freedom" in the Roosevelt speeches.
roosevelt_similar_to_freedom = roosevelt_embeddings.wv.most_similar("freedom", topn=20)
print(roosevelt_similar_to_freedom)

# --- Mount Rushmore presidents (plus Kennedy) -------------------------------

rushmore_prez_sentences = get_presidents_sentences(["washington","jefferson","lincoln","Kennedy"])

# A slice from the middle of the frequency ranking for these presidents.
rushmore_most_freq_words = most_frequent_words(rushmore_prez_sentences)
print(rushmore_most_freq_words[120: 180])

# Word embeddings trained on these presidents' speeches.
rushmore_embeddings = train_embeddings(rushmore_prez_sentences)

# Words most similar to "freedom" for these presidents.
rushmore_similar_to_freedom = rushmore_embeddings.wv.most_similar("freedom", topn=20)
print(rushmore_similar_to_freedom)

# Words most similar to "justice" and "confidence" for these presidents.
rushmore_similar_justice = rushmore_embeddings.wv.most_similar("justice", topn=20)
rushmore_similar_confidence = rushmore_embeddings.wv.most_similar("confidence", topn=20)

print(rushmore_similar_justice)
print(rushmore_similar_confidence)


[('economic', 0.9637851715087891), ('human', 0.9571189284324646), ('independence', 0.9544987678527832), ('security', 0.9542917013168335), ('individual', 0.9540867209434509), ('condition', 0.9540834426879883), ('parties', 0.9539456963539124), ('domestic', 0.9529170989990234), ('business', 0.9525957703590393), ('growth', 0.9521768093109131), ('life', 0.9515205025672913), ('wealth', 0.9504293203353882), ('prosperity', 0.9482309222221375), ('dignity', 0.9481265544891357), ('progress', 0.9478520750999451), ('department', 0.9472221732139587), ('race', 0.9469015598297119), ('influence', 0.9467835426330566), ('maintenance', 0.9466908574104309), ('order', 0.9464203715324402)]
[('government', 0.9971845149993896), ('from', 0.9971229434013367), ('that', 0.9970899820327759), ('on', 0.997053325176239), ('i', 0.9969252943992615), ('our', 0.9969114661216736), ('but', 0.9969032406806946), ('so', 0.9968989491462708), ('an', 0.9968894720077515), ('shall', 0.9968706369400024), ('and', 0.996863603591919), 