In [1]:
import pandas as pd
import datetime
import re
import ipywidgets as widgets
from Corpus import Corpus

In [4]:
# Load the tab-separated speech dataset and preview its first five rows.
df = pd.read_csv(
    "discours_US.csv",
    delimiter="\t",
)
df.head(5)

Unnamed: 0,speaker,text,date,descr,link
0,CLINTON,": I'm getting ready for a lot of things, a lot...","April 12, 2015",Video Remarks Announcing Candidacy for President,http://www.presidency.ucsb.edu/ws/index.php?pi...
1,CLINTON,"[ ] : I'll be graduating in May, and on gradua...","April 14, 2015",Remarks in a Question and Answer Session at Ki...,http://www.presidency.ucsb.edu/ws/index.php?pi...
2,CLINTON,": Well, thank you all so much for inviting me ...","April 20, 2015","Remarks in Keene, New Hampshire",http://www.presidency.ucsb.edu/ws/index.php?pi...
3,CLINTON,Thank you so much. I am absolutely delighted t...,"April 29, 2015",Address to the David N. Dinkins Leadership & P...,http://www.presidency.ucsb.edu/ws/index.php?pi...
4,CLINTON,"Oh, hello. Hi, how are you? Well, it's wonderf...","May 5, 2015",Remarks at a Roundtable with Young Nevada Resi...,http://www.presidency.ucsb.edu/ws/index.php?pi...


In [5]:
# Count how many rows of the dataset belong to each speaker.
speaker_column = df["speaker"]
speaker_column.value_counts()


speaker
CLINTON    93
TRUMP      71
Name: count, dtype: int64

In [6]:
# Date layouts accepted in the "date" column, tried in this order.
# NOTE(review): "%B" matches month names of the active locale; the dataset
# preview shows English names such as "April 12, 2015".
DATE_FORMATS = ("%B %d, %Y", "%d/%m/%Y", "%Y-%m-%d")

# Sentence fragments shorter than this many characters are dropped.
MIN_SENTENCE_CHARS = 20


def _parse_speech_date(raw_date):
    """Convert a raw date cell into a ``datetime.datetime``.

    Each layout in ``DATE_FORMATS`` is tried in turn and the first one that
    parses wins. The value is passed through ``str()`` first so that a
    non-string cell (for example a missing value) ends in the ``ValueError``
    below instead of a ``TypeError`` raised by ``strptime``.

    Raises:
        ValueError: if none of the known layouts matches ``raw_date``.
    """
    candidate = str(raw_date).strip()
    for date_format in DATE_FORMATS:
        try:
            return datetime.datetime.strptime(candidate, date_format)
        except ValueError:
            continue
    # The message is built from a plain variable: reusing double quotes inside
    # an f-string replacement field is a SyntaxError before Python 3.12.
    raise ValueError(f"Format de date inconnu : {raw_date}")


corpus = Corpus("Discours US")

# Lightweight stand-in class for corpus documents, created once instead of
# building a brand-new class for every sentence.
_SpeechDocument = type("Document", (), {})

for _, row in df.iterrows():
    date_obj = _parse_speech_date(row["date"])

    # Each speech is split on sentence-ending punctuation; every sufficiently
    # long fragment becomes its own document in the corpus.
    for p in re.split(r"[.!?]", str(row["text"])):
        p = p.strip()
        if len(p) < MIN_SENTENCE_CHARS:
            continue

        doc = _SpeechDocument()
        # NOTE(review): the dataset preview shows no "title" column, so this
        # falls back to "Discours" for every row; "descr" and "link" look like
        # the intended sources for the title and URL - confirm before changing.
        doc.titre = row.get("title", "Discours")
        doc.auteur = row["speaker"]
        doc.date = date_obj
        doc.url = ""
        doc.texte = p
        doc.getType = lambda: "speech"

        corpus.add_document(doc)


In [7]:
# Look up "freedom" in the corpus and show only the first ten hits.
freedom_hits = corpus.search("freedom")
freedom_hits[:10]

['...circle of freedom and oppor...',
 "...lt's Four Freedoms are a te...",
 "...ights and freedom is what's...",
 '... the Four Freedoms park tha...',
 '...k down on freedom of expres...',
 '...here is a Freedom of Inform...',
 '...here is a Freedom of Inform...',
 '...s greater freedom of moveme...',
 '...d the USA Freedom Act, whic...',
 '...nted more freedom, wanted t...']

In [8]:
# Concordance lookup for "america". The recorded output below is a table with
# a left-context column, the matched pattern and a right-context column.
corpus.concorde("america")

Unnamed: 0,Context de gauche,motif trouvé,Context de droite
0,...president\n,America,ns have fo....
1,...\nEveryday,America,ns need a ....
2,"...e strong,",America,is strong....
3,...know that,America,ns have co....
4,...f because,America,n families....
...,...,...,...
3209,... that the,America,n dream is....
3210,"...r, fairer",America,we seek\nI....
3211,...o so many,America,ns and peo....
3212,...t I am an,America,n\nAnd I st....


In [9]:
from SearchEngine import SearchEngine

# Build the search engine on top of the corpus filled in the earlier cell.
# Per the recorded output, construction reports the vocabulary size and the
# TF, IDF and TF-IDF steps.
engine = SearchEngine(corpus)



Nombre de mots du vocabulaire : 12147
Matrice TF construite.
IDF calculé.
Matrice TF-IDF construite.


In [10]:
# Query the engine for "freedom" with k=5; the recorded output lists five
# rows with document metadata and a score column.
engine.search("freedom", k=5)

Unnamed: 0,doc_id,titre,auteur,date,url,score
0,8816,Discours,CLINTON,2016-06-02,,0.519597
1,28407,Discours,CLINTON,2016-11-03,,0.500455
2,28822,Discours,CLINTON,2016-11-04,,0.499037
3,28931,Discours,CLINTON,2016-11-04,,0.49679
4,28145,Discours,CLINTON,2016-11-03,,0.479557


In [None]:
# Same kind of query for "america" with k=10.
# NOTE(review): this cell has no execution count and no saved output.
engine.search("america", k=10)

In [None]:
import ipywidgets as widgets
from IPython.display import display

In [17]:
# Heading displayed above the search controls.
label = widgets.Label(value="Moteur de recherche")

# Free-text box where the user types the query keywords.
text_query = widgets.Text(
    placeholder="Entrez des mots-clés",
    description="Requête :",
)

# Slider for the number of documents to return: 1 to 20, starting at 5.
slider_k = widgets.IntSlider(
    description="Nb docs :",
    min=1,
    max=20,
    value=5,
)


In [19]:
# Author filter: the "Tous" entry comes first, followed by the corpus authors
# in sorted order.
dropdown_author = widgets.Dropdown(
    description="Auteur :",
    options=["Tous", *sorted(corpus.authors.keys())],
)

In [20]:
# Group the three input widgets into a single horizontal box.
controls = widgets.HBox(
    children=[text_query, slider_k, dropdown_author],
)


In [21]:
# Output area that the search callback clears and then writes its results into.
output = widgets.Output()

In [22]:
# Button labelled "Rechercher", rendered with the "primary" style.
button = widgets.Button(
    button_style="primary",
    description="Rechercher",
)

In [23]:
def clique_bouton(b):
    """Run a search from the current widget values and show the result in ``output``.

    Reads the query text, the number of documents and the selected author from
    the notebook-level widgets, queries ``engine`` and displays the resulting
    frame, or a message when nothing is left to show.

    Parameters
    ----------
    b
        The widget that triggered the callback; not used.
    """
    query = text_query.value.strip()
    k = slider_k.value
    author = dropdown_author.value

    output.clear_output()

    # Everything runs inside the output context so that an exception raised by
    # the search is rendered in the output area instead of vanishing silently.
    with output:
        if not query:
            # Nothing to search for: skip the engine call entirely.
            print("Veuillez entrer une requête.")
            return

        # Named ``results`` rather than ``df`` to avoid confusion with the
        # notebook-level dataset of the same name.
        results = engine.search(query, k)

        # NOTE(review): the author filter is applied after the engine has
        # already cut the ranking to k rows, so fewer than k rows (possibly
        # none) may remain for the selected author.
        if author != "Tous":
            results = results[results["auteur"] == author]

        if results.empty:
            print("Aucun résultat trouvé.")
        else:
            display(results)


In [24]:
# Register the click handler: without this call the "Rechercher" button is
# displayed but pressing it never runs the search.
button.on_click(clique_bouton)

# Stack the heading, the input controls, the button and the result area
# vertically, then render the whole interface.
ui = widgets.VBox([
    label,
    controls,
    button,
    output
])

display(ui)


VBox(children=(Label(value='Moteur de recherche'), HBox(children=(Text(value='', description='Requête :', plac…