# Quote Extractor
The code below extracts quotes from your texts. List the names of your text files in an Excel spreadsheet and store the text files (.txt) in the input folder; you can then run this notebook to extract the quotes.

In [1]:
# import the necessary packages
import pandas as pd
import os
import sys
import logging
import traceback
import spacy
from nltk import Tree

# import the quote extractor
from quote_extractor import extract_quotes, get_rawtext_files
from config import config
import utils

# set up the application logger used by the cells below;
# both the logger itself and its file log are set to INFO, with 'logs' passed as the log directory
app_logger = utils.create_logger(
    'quote_extractor',
    log_dir='logs',
    logger_level=logging.INFO,
    file_log_level=logging.INFO,
)

In [2]:
# load spaCy's en_core_web_lg pipeline into `nlp`
# NOTE: spacy.load only loads a model; it does not download one
print('Loading spaCy language model...')
nlp = spacy.load("en_core_web_lg")
print('Finished loading')

Loading spaCy language model...
Finished loading


In [3]:
# enter the file path and the file name containing the list of texts you wish to extract the quote from
# please ensure that the .txt files are located in the same path
file_path = './input/'
file_name = 'text_file_list.xlsx'

# read the pandas dataframe containing the list of texts;
# the first spreadsheet column is used as the index, and the 'text_files' column is read by the extraction loop.
# os.path.join is used (instead of string concatenation) so the path is built correctly
# whether or not file_path ends with a path separator
text_df = pd.read_excel(os.path.join(file_path, file_name), index_col=0)
text_df.head()

Unnamed: 0_level_0,text_files,description
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,test1.txt,random
1,test2.txt,CBC news


In [4]:
# specify whether you wish to create a parse tree for the quotes and specify the file path if 'True'
write_quote_trees_in_file = False
tree_dir = './output/trees/'

# begin to extract quotes from your text files;
# all_quotes collects the quote dictionaries returned by extract_quotes for every file
all_quotes = []
for input_file in text_df['text_files']:
    try:
        # the document id is the file name without its trailing '.txt';
        # only the suffix is stripped, so a '.txt' appearing elsewhere in the name is kept.
        # This runs inside the try so that a malformed entry (e.g. an empty spreadsheet cell)
        # is logged and skipped instead of aborting the whole run.
        if input_file.endswith('.txt'):
            doc_id = input_file[:-len('.txt')]
        else:
            doc_id = input_file

        # read the text inside a context manager so the file handle is always closed;
        # lines consisting only of a newline are dropped and trailing whitespace is removed
        with open(os.path.join(file_path, input_file), 'r') as text_file:
            doc_lines = [line.rstrip() for line in text_file if line != '\n']
        doc_text = '\n'.join(doc_lines)
        doc_text = utils.preprocess_text(doc_text)
        doc = nlp(doc_text)
        quotes = extract_quotes(doc_id=doc_id, doc=doc, write_tree=write_quote_trees_in_file, tree_dir=tree_dir)

        # give every quote an id of the form '<doc_id>-<n>', numbered from 1 within the document
        for n, quote in enumerate(quotes, start=1):
            quote['quote_id'] = doc_id + '-' + str(n)
        all_quotes.extend(quotes)

    except Exception:
        # catch Exception rather than using a bare except, so that interrupting the kernel
        # (KeyboardInterrupt) stops the loop instead of being swallowed;
        # the failing file is named in the log and processing continues with the next file
        app_logger.exception("Failed to extract quotes from %s", input_file)
        traceback.print_exc()

In [5]:
# turn the collected quotes into a pandas dataframe, one row per quote;
# new_index lists the columns to show and the order in which to show them
new_index = [
    'quote_id',
    'quote',
    'quote_index',
    'quote_token_count',
    'quote_type',
    'is_floating_quote',
    'speaker',
    'speaker_index',
    'verb',
    'verb_index',
]
quotes_df = pd.DataFrame.from_dict(all_quotes).reindex(columns=new_index)
quotes_df

Unnamed: 0,quote_id,quote,quote_index,quote_token_count,quote_type,is_floating_quote,speaker,speaker_index,verb,verb_index
0,test1-1,that Trump wouldn't be able to post for 24 hou...,"(84,173)",17,SVC,False,"Facebook and Instagram, which Facebook owns","(0,43)",announcing,"(73,83)"
1,test1-2,that these actions follow years of hemming and...,"(303,490)",26,SVC,False,experts,"(289,296)",noted,"(297,302)"
2,test1-3,"what happened in Washington, D.C.","(594,627)",6,SVC,False,"Jennifer Grygiel, a Syracuse University commun...","(493,588)",said,"(589,593)"
3,test1-4,", on Wednesday is a direct result of Trump's u...","(627,815)",32,SVC,False,"Jennifer Grygiel, a Syracuse University commun...","(493,588)",said,"(589,593)"
4,test1-5,"This is what happens,","(1018,1039)",5,QCQSV,False,""" Grygiel","(1039,1048)",said,"(1049,1053)"
5,test1-6,"They're creeping along towards firmer action,","(1358,1403)",8,QCQSV,False,""" Grygiel","(1403,1412)",said,"(1413,1417)"
6,test1-7,"that the video was removed because it ""contrib...","(2373,2478)",18,SVC,False,"Guy Rosen, Facebook's vice-president of integrity","(2296,2345)",said,"(2347,2351)"
7,test1-8,This is an emergency situation and we are taki...,"(2484,2607)",19,QCQSV,False,Rosen,"(2610,2615)",said,"(2616,2620)"
8,test1-9,"""I know your pain","(2817,2834)",5,SVC,False,Trump,"(2786,2791)",saying,"(2809,2815)"
9,test1-10,"""We can't play into the hands of these people","(2979,3024)",11,SVC,False,Trump,"(2957,2962)",say,"(2974,2977)"


In [6]:
# save the above dataframe into excel (without the dataframe index);
# the output folder is created first, because to_excel fails if the folder does not exist
output_file = './output/quotes.xlsx'
os.makedirs(os.path.dirname(output_file), exist_ok=True)
quotes_df.to_excel(output_file, index=False)