# Getting and cleaning up file names.

This notebook contains code to extract and clean filenames (the output of a search using the *regex_search* function in *tf_idf_ipynb*).

## Extracting the filenames.

In [10]:
### Written by Søren Fomsgaard, July 2020 ###

from pathlib import Path
import pandas as pd
import os, re
from operator import itemgetter

In [14]:
# a dataframe to test the function.
# NOTE: `ignore_index` is not a parameter of `pd.read_excel`. The recorded output
# shows it never had an effect (the saved index still appears as 'Unnamed: 0'),
# and recent pandas versions reject unknown keywords with a TypeError, so it is
# left out here.

test_df = pd.read_excel('multiple_results.xlsx', sheet_name='trimmed_results')

test_df

Unnamed: 0,file_id,text_length,beer,drunk,brandy,liquor,intoxicat,booz,drunkard,temperance,alcohol,terms_sum
0,1676-01-17_o16760117-1-defend16.txt,1106,1,0,0,0,0,0,0,0,0,1
1,1676-04-05_t16760405-10-victim20.txt,147,1,0,0,0,0,0,0,0,0,1
2,1676-08-23_t16760823-4-punish14.txt,275,0,1,0,0,0,0,0,0,0,1
3,1676-12-13_t16761213-5-verdict15.txt,88,0,1,0,0,0,0,0,0,0,1
4,1677-06-01_t16770601-1-verdict3.txt,135,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
28152,1913-04-01_t19130401-38-punishment-46.txt,690,2,2,0,0,0,0,0,0,0,4
28153,1913-04-01_t19130401-45-verdict-2.txt,1169,5,0,0,0,0,0,0,0,0,5
28154,1913-04-01_t19130401-46-punishment-51.txt,705,0,2,0,0,0,0,0,0,0,2
28155,1913-04-01_t19130401-47-punishment-52.txt,566,1,1,0,0,0,0,0,0,0,2


In [15]:
# a function to extract the filenames of found files (*trimmedOutput* from *regex_search*).

def filenameGetter(dataframe, copyfiles=False):
    """
    Extract the file identifiers embedded in the ``file_id`` column of a dataframe.

    Every file name is scanned for the identifier that directly follows an
    underscore, e.g. ``1676-01-17_o16760117-1-defend16.txt`` -> ``o16760117-1``.
    File names without such an identifier, and missing values, are skipped.

    Optional: copy the listed files from a source directory to a target directory.

    :param dataframe: a pandas dataframe with a ``file_id`` column of file names.
    :return: a list with one entry per file name that matched; each entry is the
             list of matches found in that file name. The result is sorted on
             the first match of every entry.
    :raises KeyError: if the dataframe has no ``file_id`` column.

    Keyword argument:
    copyfiles -- if True, prompt for SOURCEDIR and TARGETDIR and copy every file
                 named in ``file_id`` from the former to the latter (TARGETDIR is
                 created when it does not exist yet).
    """
    # Identifier used for the SPARQL query. '(?<=_)' is a *positive* lookbehind:
    # the match must start right after an underscore. It is either
    # "<word characters>-<word characters>" or, failing that, a plain run of digits.
    # BE AWARE THAT THIS IS TAILOR-MADE to our search results.
    # Raw string, so the regex escapes are not treated as (invalid) string escapes.
    clean_pattern = re.compile(r'(?<=_)\d*\w+-\w+|(?<=_)\d+')

    # get filenames from dataframe; drop missing values and coerce to str so that
    # findall() never receives a non-string.
    rawFilenames = dataframe['file_id'].dropna().astype(str).tolist()

    # copy the files if the user asked for it.
    if copyfiles:
        import shutil  # only needed on this optional path

        sourceDir = Path(input('Input SOURCEDIR of files: '))
        targetDir = Path(input('Input TARGETDIR of files: '))
        targetDir.mkdir(parents=True, exist_ok=True)
        for filename in rawFilenames:
            shutil.copy2(sourceDir / filename, targetDir / filename)

    # one list of matches per file name; file names without a match give an
    # empty list, which is filtered out.
    allMatches = (clean_pattern.findall(filename) for filename in rawFilenames)
    filteredList = [matches for matches in allMatches if matches]

    return sorted(filteredList, key=itemgetter(0))  # sorted on the first match.

    

In [16]:
# function call for testing: extract the identifiers from the test dataframe.
# copyfiles keeps its default (False), so no directories are prompted for.
files_to_sparql = filenameGetter(test_df)

In [17]:
# show the extracted identifiers; every entry is the list of matches for one file name.
files_to_sparql

[['16781211'],
 ['a17081013-1'],
 ['f18280221-1'],
 ['f18291029-1'],
 ['o16760117-1'],
 ['o17710911-1'],
 ['o17860111-1'],
 ['o17880625-1'],
 ['s16800910a-1'],
 ['s16810413a-1'],
 ['t16760405-10'],
 ['t16760823-4'],
 ['t16761213-5'],
 ['t16770601-1'],
 ['t16770601-4'],
 ['t16800117-1'],
 ['t16800910-6'],
 ['t16801208-3'],
 ['t16820426-14'],
 ['t16820712-6'],
 ['t16830524-4'],
 ['t16831010a-17'],
 ['t16840702-22'],
 ['t16840903-18'],
 ['t16841210-15'],
 ['t16850604-34'],
 ['t16851014-4'],
 ['t16851209-14'],
 ['t16870512-36'],
 ['t16870512-7'],
 ['t16870701-12'],
 ['t16870701-19'],
 ['t16871207-21'],
 ['t16871207-40'],
 ['t16880222-15'],
 ['t16880425-12'],
 ['t16880711-20'],
 ['t16880831-17'],
 ['t16890828-21'],
 ['t16901210-20'],
 ['t16910115-18'],
 ['t16920629-19'],
 ['t16920629-32'],
 ['t16921012-9'],
 ['t16941010-11'],
 ['t16941010-20'],
 ['t16941010-23'],
 ['t16941010-4'],
 ['t16950403-9'],
 ['t16950828-13'],
 ['t16951014-2'],
 ['t16951014-32'],
 ['t16960227-36'],
 ['t16980608-56'],

In [18]:
# re for the Victorian Era: identifiers (optionally prefixed by lowercase letters)
# whose date starts with 182x-189x or with 19xx.
# Written as a raw string so the backslashes reach the regex engine unchanged.
# The second alternative used to start with an escaped '\[', which matched a
# literal '[' and therefore never matched any 19xx identifier.
# NOTE(review): this range (1820-1999) is wider than the reign of Victoria
# (1837-1901) - tighten the pattern if the strict period is required.

ve = r'[a-z]*18[2-9]\d*-*\w+|[a-z]*19\d+-*\w+'

In [24]:
# read the identifiers, keep those matching the `ve` pattern and write them
# with the 'oldbailey:' prefix for the SPARQL query.
# NOTE: 'files_to_sparql.txt' is produced by the cell that writes
# `files_to_sparql` to disk, so that cell has to be run before this one.
with open('files_to_sparql.txt', 'r') as infile, open('VE_drunk_files_to_sparql.txt', 'w') as outfile:
    for line in infile:
        # Strip the line ending first: otherwise the newline lands between the
        # identifier and its comma, and every output line starts with ','.
        file_id = line.rstrip('\n')
        if re.match(ve, file_id):
            # one 'oldbailey:<id>,' entry per line (the last entry keeps its comma, as before).
            outfile.write('oldbailey:' + file_id + ',\n')

In [23]:
# write the result to file: one identifier per line.
# Each element of `files_to_sparql` is a list of matches; only its first item is written.
with open('files_to_sparql.txt', 'w') as outfile:
    outfile.writelines(matches[0] + '\n' for matches in files_to_sparql)