In [1]:
from itertools import count
from sysconfig import get_platform

import numpy as np
import pandas as pd

In [2]:
# Load the speeches from a tab-separated file; the path is relative to the
# notebook's working directory.
us_data = pd.read_csv("data/discours_US.csv", sep="\t")

# Preview the first rows to check the columns that were read.
us_data.head()

Unnamed: 0,speaker,text,date,descr,link
0,CLINTON,": I'm getting ready for a lot of things, a lot...","April 12, 2015",Video Remarks Announcing Candidacy for President,http://www.presidency.ucsb.edu/ws/index.php?pi...
1,CLINTON,"[ ] : I'll be graduating in May, and on gradua...","April 14, 2015",Remarks in a Question and Answer Session at Ki...,http://www.presidency.ucsb.edu/ws/index.php?pi...
2,CLINTON,": Well, thank you all so much for inviting me ...","April 20, 2015","Remarks in Keene, New Hampshire",http://www.presidency.ucsb.edu/ws/index.php?pi...
3,CLINTON,Thank you so much. I am absolutely delighted t...,"April 29, 2015",Address to the David N. Dinkins Leadership & P...,http://www.presidency.ucsb.edu/ws/index.php?pi...
4,CLINTON,"Oh, hello. Hi, how are you? Well, it's wonderf...","May 5, 2015",Remarks at a Roundtable with Young Nevada Resi...,http://www.presidency.ucsb.edu/ws/index.php?pi...


In [3]:
# Fraction of rows attributed to each speaker. The counts are computed once and
# divided by the total number of rows in the frame.
speaker_counts = us_data['speaker'].value_counts()
n_speeches = len(us_data)

print(f"Trump values : {speaker_counts['TRUMP'] / n_speeches}")
print(f"Clinton values : {speaker_counts['CLINTON'] / n_speeches}")

Trump values : 0.4329268292682927
Clinton values : 0.5670731707317073


In [4]:
# Build the corpus: every "."-delimited fragment of every speech becomes its own
# Document, numbered consecutively across the whole data set.

from Corpus import Corpus
from Document import Document
from Author import Author

corpus = Corpus("US_speeches")

# One shared Author object per speaker, reused by all of that speaker's documents.
trump_author = Author("TRUMP")
clinton_author = Author("CLINTON")

counter = 0
for _, speech in us_data.iterrows():
    # Any speaker other than "TRUMP" is attributed to the Clinton author.
    speech_author = trump_author if speech['speaker'] == "TRUMP" else clinton_author

    # NOTE(review): str.split(".") also yields empty fragments (e.g. after a
    # trailing period) and keeps leading whitespace; those are added unchanged.
    for fragment in speech['text'].split("."):
        doc = Document(f"Sentence-{counter}", speech_author, speech['date'], speech['link'], fragment)
        corpus.add(doc)
        counter += 1

In [5]:
# Import the project's search engine class.
from SearchEngine import SearchEngine

# Engine instance built over the corpus; the cells below query it.
google = SearchEngine(corpus)

In [6]:
# Example query: run the engine's `search` method for a single keyword.
google.search("health")

Searching: 33268it [00:00, 4027829.73it/s]


Unnamed: 0,Body,Score,Title,Author,Date,URL,Document
0,"And as first lady, to fight for health care r...",2,Sentence-63,CLINTON,"April 14, 2015",http://www.presidency.ucsb.edu/ws/index.php?pi...,"Sentence-63 CLINTON And as first lady, to fig..."
1,"And we don't have enough resources, so that i...",2,Sentence-168,CLINTON,"April 20, 2015",http://www.presidency.ucsb.edu/ws/index.php?pi...,Sentence-168 CLINTON And we don't have enough...
2,"We have treatment in the Affordable Care Act,...",2,Sentence-171,CLINTON,"April 20, 2015",http://www.presidency.ucsb.edu/ws/index.php?pi...,Sentence-171 CLINTON We have treatment in the...
3,We claim we're now going to be able to help p...,2,Sentence-959,CLINTON,"May 18, 2015",http://www.presidency.ucsb.edu/ws/index.php?pi...,Sentence-959 CLINTON We claim we're now going...
4,You can read about what I did when I was firs...,2,Sentence-3843,CLINTON,"September 17, 2015",http://www.presidency.ucsb.edu/ws/index.php?pi...,Sentence-3843 CLINTON You can read about what...
...,...,...,...,...,...,...,...
227,She started giving her daughter the best heal...,1,Sentence-31519,CLINTON,"November 4, 2016",http://www.presidency.ucsb.edu/ws/index.php?pi...,Sentence-31519 CLINTON She started giving her...
228,"And when I was in North Carolina, I heard abo...",1,Sentence-31520,CLINTON,"November 4, 2016",http://www.presidency.ucsb.edu/ws/index.php?pi...,Sentence-31520 CLINTON And when I was in Nort...
229,And I spent my time helping to rebuild New Yo...,1,Sentence-31525,CLINTON,"November 4, 2016",http://www.presidency.ucsb.edu/ws/index.php?pi...,Sentence-31525 CLINTON And I spent my time he...
230,You're going to have such great health care,1,Sentence-31844,TRUMP,"November 7, 2016",http://www.presidency.ucsb.edu/ws/index.php?pi...,Sentence-31844 TRUMP You're going to have suc...


In [7]:
# google.better_search("health")

In [8]:
import ipywidgets as widgets
from IPython.display import display

# ---- Search UI controls ----
# The current value of each control is read by on_button_clicked whenever the
# "Search" button is pressed.

def _slider_options():
    """Return a fresh dict of the keyword arguments shared by every slider."""
    return dict(
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        style={'description_width': 'initial'},
    )


label = widgets.Label(value="Search for a term in the corpus")

# Free-text query, pre-filled with an example term.
search_box = widgets.Text(value='health', placeholder='Type something', description='Search:', disabled=False)

# Number of result rows to show; 0 shows the complete result frame.
slider = widgets.IntSlider(
    value=10, min=0, max=20, step=1,
    description='Search results:',
    readout_format='d',
    **_slider_options(),
)

button = widgets.Button(description="Search")

# Selects the engine method: 1 -> search, 2 -> better_search, 3 -> better_search_v2.
search_strength = widgets.IntSlider(
    value=1, min=1, max=3, step=1,
    description='Search strength:',
    readout_format='d',
    **_slider_options(),
)

k_label = widgets.Label(value="keyword importance")

# Forwarded to better_search_v2 as its second argument (strength 3 only).
k = widgets.FloatSlider(
    value=1.5, min=0, max=2, step=0.1,
    description='k:',
    readout_format='.1f',
    **_slider_options(),
)

b_label = widgets.Label(value="prefer shorter documents")

# Forwarded to better_search_v2 as its third argument (strength 3 only).
b = widgets.FloatSlider(
    value=0.65, min=0, max=1, step=0.05,
    description='b:',
    readout_format='.2f',
    **_slider_options(),
)

# When ticked, only the "Body" and "Score" columns are displayed.
simple_output = widgets.Checkbox(value=False, description='Simple Output', disabled=False)

# Area the search results are rendered into.
output = widgets.Output()


def on_button_clicked(btn):
    """Run the selected search and render its results in the ``output`` widget.

    The current widget values drive the behaviour:

    * ``search_strength`` picks the engine method: 1 -> ``search``,
      2 -> ``better_search``, anything else -> ``better_search_v2`` (which also
      receives the ``k`` and ``b`` slider values).
    * ``slider`` caps the number of rows shown; 0 shows every row.
    * ``simple_output`` restricts the table to the "Body" and "Score" columns.

    Args:
        btn: The button instance passed in by ``Button.on_click``; unused.
    """
    with output:
        # Drop whatever the previous click rendered.
        output.clear_output()

        query = search_box.value
        if search_strength.value == 1:
            search_results = google.search(query)
        elif search_strength.value == 2:
            search_results = google.better_search(query)
        else:
            search_results = google.better_search_v2(query, k.value, b.value)

        if search_results.empty:
            # Stop here so the message is not followed by an empty table, and so
            # the column selection below is never applied to an empty frame.
            display("No results found")
            return

        # A slider value of 0 means "no limit".
        shown = search_results if slider.value == 0 else search_results.head(slider.value)
        if simple_output.value:
            shown = shown[["Body", "Score"]]

        # Lift the column-width limit so cell text is not truncated.
        with pd.option_context('display.max_colwidth', None):
            display(shown)


# Wire the callback to the button, then show all controls stacked vertically
# with the results area at the bottom.
button.on_click(on_button_clicked)

controls = [label, search_box, slider, search_strength, k_label, k, b_label, b, simple_output, button, output]
display(widgets.VBox(controls))

VBox(children=(Label(value='Search for a term in the corpus'), Text(value='health', description='Search:', pla…