This notebook creates dataframes of sentences that contain a specific term, together with their metadata (year, genre and document id).
It currently only works with single-word terms. You need an index of the COHA corpus, as created with the script `create_coha_index.py`.

In [2]:
import whoosh.index
from whoosh.qparser import QueryParser
import pandas as pd
import os

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [4]:
# Location of the Whoosh index to open (the notebook intro says it is built
# with `create_coha_index.py`). Set the COHA_INDEX_DIR environment variable to
# point at your own copy; when it is unset, the original hard-coded path is
# used so existing behaviour is unchanged.
index_dir = os.environ.get(
    "COHA_INDEX_DIR",
    "C:\\Users\\DafnevanKuppevelt\\surfdrive\\VBox Shared\\SemSus\\coha-index",
)
ix = whoosh.index.open_dir(index_dir)

In [5]:
# Single searcher reused by `query_to_dataframe` for every query.
# NOTE(review): it is never closed in the visible cells — consider calling
# `searcher.close()` once all queries have run.
searcher = ix.searcher()

In [6]:
# Parser that turns a query string into a Whoosh query; terms without an
# explicit field prefix are searched in the "content" field.
qp = QueryParser("content", schema=ix.schema)

In [27]:
def get_matching_sentences(key, content):
    """Return the sentences of ``content`` that contain ``key`` as a token.

    The text is split into sentences with ``sent_tokenize``; each sentence is
    then split with ``word_tokenize`` and kept when ``key`` is exactly equal
    to one of its tokens (the comparison is case-sensitive).

    Parameters
    ----------
    key : str
        The token to look for.
    content : str
        The text to search.

    Returns
    -------
    list of str
        The matching sentences, in their original order; empty when no
        sentence contains ``key``.
    """
    return [
        sentence
        for sentence in sent_tokenize(content)
        if key in word_tokenize(sentence)
    ]

In [28]:
def query_to_dataframe(query):
    """Search the index for ``query`` and return one row per matching sentence.

    Every hit returned by the module-level ``searcher`` is scanned with
    ``get_matching_sentences``; each sentence that contains ``query`` as a
    token becomes one row carrying the hit's ``year``, ``genre`` and ``id``.
    The number of matching documents is printed as a progress message.

    Parameters
    ----------
    query : str
        The term to search for. It is used both as the index query and as the
        token to look for inside each document.

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``genre``, ``id`` and ``sentence``, with a fresh
        0..n-1 index. The columns are present even when nothing matched.
    """
    results = searcher.search(qp.parse(query), limit=None)
    print('{}: {} results'.format(query, len(results)))
    columns = ['year', 'genre', 'id', 'sentence']
    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame per hit copies it every time.
    rows = [
        (hit['year'], hit['genre'], hit['id'], sentence)
        for hit in results
        for sentence in get_matching_sentences(query, hit['content'])
    ]
    return pd.DataFrame(rows, columns=columns)

In [29]:
# Write one CSV of matching sentences per query term into `output_path`.
output_path = '../../sentences_out'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it does not fail
# if the directory already exists and also creates missing parent directories.
os.makedirs(output_path, exist_ok=True)

queries = ["fair", "sustainable", "green"]
for query in queries:
    print(query)
    df = query_to_dataframe(query)
    # index=False: only the year/genre/id/sentence columns are written.
    df.to_csv(os.path.join(output_path, query+'.csv'), index=False)

fair
fair: 165 results
sustainable
sustainable: 1 results
green
green: 160 results
