# Getting and cleaning up file names.

This notebook contains code to extract and clean filenames (the output of a search using the *regex_search* function in *tf_idf_ipynb*).

## Extracting the filenames.

In [1]:
### Written by Søren Fomsgaard, July 2020 ###

from pathlib import Path
import pandas as pd
import os, re
from operator import itemgetter

In [3]:
# A dataframe to test the function: the filtered tf-idf results sheet.
# `ignore_index` is not a `read_excel` parameter (recent pandas versions raise
# TypeError on unknown keywords), so it is not passed here.

test_df = pd.read_excel('filtered_final_results.xlsx', sheet_name='filtered_tf-idf_results')

test_df

Unnamed: 0,file_id,text_length,tf.idf_drunk,tf.idf_beer,tf.idf_intoxicat,tf.idf_temperance,tf.idf_liquor,tf.idf_alcohol,tf.idf_brandy,tf.idf_booz,tf.idf_drunkard,average_tf.idf
0,1893-10-16_t18931016-873-punishment-48.txt,480,0.0,0.001497,0.0,0,0.000000,0.00000,0.037217,0,0.000000,0.018314
1,1801-04-15_t18010415-92-verdict521.txt,296,0.0,0.004856,0.0,0,0.008192,0.00000,0.025865,0,0.000000,0.014924
2,1845-07-07_t18450707-1541-verdict-2.txt,1703,0.0,0.000000,0.0,0,0.000000,0.00000,0.038962,0,0.000000,0.012437
3,1881-10-17_t18811017-857-punishment-20.txt,457,0.0,0.000000,0.0,0,0.000000,0.00000,0.039090,0,0.000000,0.012329
4,1770-05-30_t17700530-5-verdict31.txt,65,0.0,0.000000,0.0,0,0.000000,0.00000,0.039262,0,0.000000,0.010906
...,...,...,...,...,...,...,...,...,...,...,...,...
111,1853-06-13_t18530613-673-punishment-4.txt,26,0.0,0.000000,0.0,0,0.000000,0.00000,0.098155,0,0.000000,0.004362
112,1872-09-23_t18720923-702-punishment-43.txt,23,0.0,0.000000,0.0,0,0.000000,0.00000,0.110958,0,0.000000,0.004343
113,1902-10-20_t19021020-751-punishment-51.txt,137,0.0,0.000000,0.0,0,0.000000,0.03734,0.000000,0,0.074594,0.004329
114,1862-05-12_t18620512-597-punishment-89.txt,57,0.0,0.000000,0.0,0,0.000000,0.00000,0.134318,0,0.000000,0.004324


In [4]:
# a function to extract the filenames of found files (*trimmedOutput* from *regex_search*).

def filenameGetter(dataframe, copyfiles=False):
    """
    Get cleaned file IDs from the filenames stored in a dataframe.
    Optional: copy the listed files from a source directory to a target directory.

    :param dataframe: a pandas dataframe holding the filenames in a 'file_id'
        column (or in an index named 'file_id').
    :return: a list of match lists, one per filename that produced at least one
        match, sorted by the first match,
        e.g. [['t18010415-92'], ['t18931016-873']].
    :raises KeyError: if the dataframe has neither a 'file_id' column nor an
        index named 'file_id'.

    Keyword argument:
    copyfiles -- if True, prompt for SOURCEDIR and TARGETDIR and copy every
                 listed file that exists in SOURCEDIR into TARGETDIR.
    """
    # Pattern for the ID used in the SPARQL query. Positive lookbehind for an
    # underscore, then either '<word>-<word>' (e.g. 't18931016-873') or a bare
    # run of digits. BE AWARE THAT THIS IS TAILOR-MADE to our search results.
    # Raw string: '\d' and '\w' are invalid escapes in a normal string literal.
    clean_pattern = re.compile(r'(?<=_)\d*\w+-\w+|(?<=_)\d+')

    # The filenames may live in a column or in the index, depending on how the
    # dataframe was loaded.
    if 'file_id' in dataframe.columns:
        file_ids = dataframe['file_id']
    elif dataframe.index.name == 'file_id':
        file_ids = dataframe.index.to_series()
    else:
        raise KeyError('file_id')

    # Missing values cannot be searched by the regex, so they are skipped.
    rawFilenames = [str(name) for name in file_ids.dropna()]

    if copyfiles:
        import shutil  # local import: only needed for the optional copy step

        sourceDir = Path(input('Input SOURCEDIR of files: '))
        targetDir = Path(input('Input TARGETDIR of files: '))
        targetDir.mkdir(parents=True, exist_ok=True)

        missing = []
        for filename in rawFilenames:
            sourceFile = sourceDir / filename
            if sourceFile.is_file():
                shutil.copy2(sourceFile, targetDir / sourceFile.name)
            else:
                missing.append(filename)
        # Best effort: report files that were not found instead of aborting.
        if missing:
            print(f'{len(missing)} of {len(rawFilenames)} files not found in {sourceDir}')

    # findall returns an empty list when nothing matches; those are dropped.
    allMatches = (clean_pattern.findall(filename) for filename in rawFilenames)
    filteredList = [found for found in allMatches if found]

    # Sort on the first match of each entry.
    return sorted(filteredList, key=itemgetter(0))

    

In [5]:
# Function call for testing: extract the cleaned file IDs from test_df's
# 'file_id' column. The result is a sorted list of match lists.
files_to_sparql = filenameGetter(test_df)

In [6]:
# Show the extracted IDs returned by filenameGetter.
files_to_sparql

[['t17510417-39'],
 ['t17520914-61'],
 ['t17641212-13'],
 ['t17641212-14'],
 ['t17661022-44'],
 ['t17700530-5'],
 ['t17730626-39'],
 ['t17750913-38'],
 ['t17750913-99'],
 ['t17820515-62'],
 ['t17950916-77'],
 ['t17971206-5'],
 ['t18010415-92'],
 ['t18011028-29'],
 ['t18040912-38'],
 ['t18071202-38'],
 ['t18081126-38'],
 ['t18090215-68'],
 ['t18090920-120'],
 ['t18100606-46'],
 ['t18101205-66'],
 ['t18120513-33'],
 ['t18130915-41'],
 ['t18141130-86'],
 ['t18180909-233'],
 ['t18200517-14'],
 ['t18230115-12'],
 ['t18250407-233'],
 ['t18270531-202'],
 ['t18280221-208'],
 ['t18290716-93'],
 ['t18291029-260'],
 ['t18300415-132'],
 ['t18330214-131'],
 ['t18341124-21'],
 ['t18351026-2212'],
 ['t18380101-437'],
 ['t18380226-836'],
 ['t18380709-1733'],
 ['t18390408-1465'],
 ['t18400511-1445'],
 ['t18400615-1674'],
 ['t18410301-793'],
 ['t18410301-948'],
 ['t18410405-1164'],
 ['t18420228-862'],
 ['t18420919-2699'],
 ['t18430403-1303'],
 ['t18430403-1346'],
 ['t18430612-1734'],
 ['t18430612-1997']

In [7]:
# re for the Victorian Era
# Matches an ID (optional lowercase prefix) whose date starts with 182x-189x,
# or with 19xx. Raw string so the regex escapes reach `re` untouched. The second
# alternative used to start with '\[' (a literal '['), so 19xx IDs never matched.

ve = r'[a-z]*18[2-9]\d*-*\w+|[a-z]*19\d+-*\w+'

In [9]:
# Read the extracted IDs and keep only those matching the Victorian-era
# pattern `ve`. The input file is produced by the cell that writes
# 'filtered_drunk_only_files_to_sparql.txt'; that cell must be run first.
with open('filtered_drunk_only_files_to_sparql.txt', 'r') as infile, \
        open('filtered_VE_drunk_files_to_sparql.txt', 'w') as outfile:
    # Strip the line terminator so it does not end up between the ID and the comma.
    victorian_ids = [name for name in (line.strip() for line in infile) if re.match(ve, name)]
    # One prefixed ID per line, comma-separated, no dangling comma after the last.
    outfile.write(',\n'.join('oldbailey:' + name for name in victorian_ids))

In [8]:
# Persist the extracted IDs, one per line, using the first match of each entry.
# This file is the input of the Victorian-era filtering cell, so run this first.
with open('filtered_drunk_only_files_to_sparql.txt' , 'w') as sparql_file:
    sparql_file.writelines(entry[0] + '\n' for entry in files_to_sparql)