# Quote Extractor
The code below extracts quotes, together with their speakers and reporting verbs, from plain-text files.

In [3]:
# import the necessary packages
import logging
import os
import sys
import traceback

import pandas as pd
import spacy
from nltk import Tree

from config import config
import utils

# import the quote extractor
from quote_extractor import extract_quotes, get_rawtext_files

In [None]:
# load spaCy's en_core_web_lg pipeline into `nlp`
# (spacy.load only loads the model; it does not download it)
print("Loading spaCy language model...")
nlp = spacy.load('en_core_web_lg')
print("Finished loading")

In [11]:
# folder that holds the raw text files and the spreadsheet listing them
file_path = './input/'
file_name = 'text_file_list.xlsx'

# read the spreadsheet into a pandas dataframe, using its first column as the index;
# os.path.join builds a valid path whether or not file_path ends with a separator
text_df = pd.read_excel(os.path.join(file_path, file_name), index_col=0)
text_df

Unnamed: 0_level_0,text_files
index,Unnamed: 1_level_1
0,test1.txt
1,test2.txt


In [4]:
# specify whether to write a parse tree for the quotes and, if 'True',
# the directory that is passed to the extractor for those trees
write_quote_trees_in_file = False
tree_dir = './output/'

# logger used to report files that could not be processed
app_logger = logging.getLogger(__name__)

# defined up front so the cells below still run when no file is processed successfully
quotes = []

for input_file in get_rawtext_files(file_path):
    doc_id = input_file.replace(".txt", "")

    try:
        # the context manager closes the file handle even if reading fails
        with open(os.path.join(file_path, input_file), 'r') as raw_file:
            # skip blank lines and strip trailing whitespace from the remaining ones
            doc_lines = [line.rstrip() for line in raw_file if line != '\n']
        doc_text = utils.preprocess_text('\n'.join(doc_lines))
        doc = nlp(doc_text)
        # NOTE(review): `quotes` is reassigned on every iteration, so only the quotes
        # of the last successfully processed file reach the cells below -- confirm
        # that this is intended.
        quotes = extract_quotes(doc_id=doc_id, doc=doc, write_tree=write_quote_trees_in_file, tree_dir=tree_dir)

    except Exception:
        # report the failing file and continue with the next one; catching Exception
        # (not a bare except) lets KeyboardInterrupt / SystemExit stop the loop
        app_logger.exception("Failed to extract quotes from %s", input_file)
        traceback.print_exc()

In [5]:
# preview the first three quotes: one "Field: value" line per attribute,
# with a dashed rule after each quote
for n, q in enumerate(quotes[:3]):
    print(f'Quote number: {n}')
    for key, value in q.items():
        print(f'{key.title()}: {value!s}')
    print(50 * '-')

Quote number: 0
Speaker: Facebook and Instagram, which Facebook owns
Speaker_Index: (0,43)
Quote: that Trump wouldn't be able to post for 24 hours following two violations of its policies
Quote_Index: (84,173)
Verb: announcing
Verb_Index: (73,83)
Quote_Token_Count: 17
Quote_Type: SVC
Is_Floating_Quote: False
--------------------------------------------------
Quote number: 1
Speaker: experts
Speaker_Index: (289,296)
Quote: that these actions follow years of hemming and hawing regarding Trump and his supporters spreading dangerous misinformation and encouraging violence that contributed to Wednesday's events
Quote_Index: (303,490)
Verb: noted
Verb_Index: (297,302)
Quote_Token_Count: 26
Quote_Type: SVC
Is_Floating_Quote: False
--------------------------------------------------
Quote number: 2
Speaker: Jennifer Grygiel, a Syracuse University communications professor and an expert on social media,
Speaker_Index: (493,588)
Quote: what happened in Washington, D.C.
Quote_Index: (594,627)
Verb:

In [6]:
# collect the extracted quotes (one record per quote) into a pandas dataframe and display it
quotes_df = pd.DataFrame(quotes)
quotes_df

Unnamed: 0,is_floating_quote,quote,quote_index,quote_token_count,quote_type,speaker,speaker_index,verb,verb_index
0,False,that Trump wouldn't be able to post for 24 hou...,"(84,173)",17,SVC,"Facebook and Instagram, which Facebook owns","(0,43)",announcing,"(73,83)"
1,False,that these actions follow years of hemming and...,"(303,490)",26,SVC,experts,"(289,296)",noted,"(297,302)"
2,False,"what happened in Washington, D.C.","(594,627)",6,SVC,"Jennifer Grygiel, a Syracuse University commun...","(493,588)",said,"(589,593)"
3,False,", on Wednesday is a direct result of Trump's u...","(627,815)",32,SVC,"Jennifer Grygiel, a Syracuse University commun...","(493,588)",said,"(589,593)"
4,False,"This is what happens,","(1018,1039)",5,QCQSV,""" Grygiel","(1039,1048)",said,"(1049,1053)"
5,False,"They're creeping along towards firmer action,","(1358,1403)",8,QCQSV,""" Grygiel","(1403,1412)",said,"(1413,1417)"
6,False,"that the video was removed because it ""contrib...","(2373,2478)",18,SVC,"Guy Rosen, Facebook's vice-president of integrity","(2296,2345)",said,"(2347,2351)"
7,False,This is an emergency situation and we are taki...,"(2484,2607)",19,QCQSV,Rosen,"(2610,2615)",said,"(2616,2620)"
8,False,"""I know your pain","(2817,2834)",5,SVC,Trump,"(2786,2791)",saying,"(2809,2815)"
9,False,"""We can't play into the hands of these people","(2979,3024)",11,SVC,Trump,"(2957,2962)",say,"(2974,2977)"


In [5]:
# save the above dataframe into excel (without the index column);
# create the output folder first so the write does not fail when it is missing
os.makedirs('./output/', exist_ok=True)
quotes_df.to_excel('./output/quotes.xlsx', index=False)