# Verification

Small program that checks that the files referenced in the CSV inventory exist in the image directory, and sorts them according to the type of document. <br>
*Bonus* : hand count and select documents for model HTR

## Import

In [None]:
!pip install pandas
!pip install numpy

In [1]:
import pandas as pd
import numpy as np
import random, os, shutil

# Absolute path of the working directory; used as the base of the output folders
current_folder = os.path.abspath('')

## Check csv

In [2]:
# Load the prepared inventory csv into a dataframe
df = pd.read_csv('data/araucania_prepared.csv', delimiter=",")

# Preview the last 50 rows
df.tail(n=50) 

Unnamed: 0,Box,Id,Type,Title,Author,Author2,Date,Location,Nb,People,Geo
204,4,354,Letters,Carta del 30 de octubre de 1860 de Vicente Vil...,"Villalón, Vicente",,1860/10/30,Chile;Octava Región;Los Ángeles,1,"Barbosa, Mauricio",
205,4,355,Letters,Carta del 30 de octubre de 1860 de Vicente Vil...,"Villalón, Vicente",,1860/10/30,Chile;Octava Región;Los Ángeles,1,"Barbosa, Mauricio",
206,4,356,Letters,Carta del 30 de octubre de 1860 de Vicente Pér...,"Pérez Rosales, Vicente",,1860/10/30,Chile;Octava Región;Concepción,1,"Barbosa, Mauricio",
207,4,357,Letters,Carta del 30 de octubre de 1860 de Vicente Pér...,"Pérez Rosales, Vicente",,1860/10/30,Chile;Octava Región;Concepción,1,"Barbosa, Mauricio",
208,4,358,Letters,Carta del 31 de octubre de 1860 de Vicente Vil...,"Villalón, Vicente",,1860/10/31,Chile;Octava Región;Los Ángeles,1,"Barbosa, Mauricio",
209,4,359,Letters,Carta del 01 de noviembre de 1860 de Manuel Se...,"Escala, Manuel Segundo",,1860/11/01,Chile;Octava Región;Concepción,1,"Barbosa, Mauricio",
210,4,360,Letters,Carta del 02 de noviembre de 1860 de José Marí...,"Carrillo, José María",,1860/11/02,Chile;Guilañanen,1,"Barbosa, Mauricio",
211,4,361,Letters,Carta del 04 de noviembre de 1860 de José Marí...,"Carrillo, José María",,1860/11/04,Chile;Octava Región;Quiapo,1,"Barbosa, Mauricio",
212,4,362,Letters,Carta del 08 de noviembre de 1860 de José Anto...,"Quezada, José Antonio",,1860/11/08,Chile;Octava Región;Arauco,1,"Barbosa, Mauricio",Arauco
213,4,363,Letters,Carta del 10 de noviembre de 1860 de Vicente V...,"Villalón, Vicente",,1860/11/10,Chile;Octava Región;Los Ángeles,1,"Barbosa, Mauricio",


## Verification

In [7]:
import glob
from collections import defaultdict


def _leading_id(path):
    """
    Extract the document id that prefixes the file name of a path

    :path: String, path of an image file (e.g. "data/JPG/151_1_b.jpg")
    :return: Int, the leading run of digits of the file name, or None if there is none
    """
    name = os.path.basename(path)
    # Length of the leading digit run = total length - length once the digits are stripped
    digits = name[:len(name) - len(name.lstrip("0123456789"))]
    return int(digits) if digits else None


def _condition_label(found, expected):
    """
    Translate a number of files found into the label written in the 'Condition' column

    :found: Int, number of files found for the entity
    :expected: number read from the 'Nb' column for the entity
    :return: String label, or None when no rule applies
    """
    if found == expected:
        return "Verify !"
    if found == expected * 2:
        return "True"
    if found > expected:
        return "Contains extra files"
    if 0 < found < expected:
        return "Missing files"
    return None


def verification(csv):

    """
    Function to write a csv file (data/araucania_final.csv) with a 'Condition' column indicating
    if any files are missing in the inventory, and a 'files' column with the different paths
    for each entity (paths separated and terminated by ';')

    Labels of the 'Condition' column, for n files found and Nb read from the inventory:
    "Verify !" (n == Nb), "True" (n == 2 * Nb), "Contains extra files" (other n > Nb),
    "Missing files" (0 < n < Nb). The cell stays empty when no file was found.

    :csv: String, file path csv
    :return: None
    """

    df = pd.read_csv(csv, delimiter=",")

    # Ids whose files are known to be incomplete: always flagged as "Missing files"
    # (1677 ; 1643 is not listed because it doesn't exist in repo)
    forced_missing = {1677}

    # Expected number (column 'Nb') for each id
    dict_id = dict(zip(df['Id'].tolist(), df['Nb'].tolist()))

    # Group the image paths by id; the id is parsed from the file name itself,
    # so ids of any number of digits are supported
    dict_files = defaultdict(list)
    for path in sorted(glob.glob("data/JPG/*.jpg")):
        doc_id = _leading_id(path)
        if doc_id is None:
            continue
        if doc_id in dict_id or doc_id in forced_missing:
            dict_files[doc_id].append(path)

    # Verify condition: one label per id, computed once from the final count
    conditions = {}
    for doc_id, paths in dict_files.items():
        if doc_id in forced_missing:
            conditions[doc_id] = "Missing files"
            continue
        label = _condition_label(len(paths), dict_id[doc_id])
        if label is not None:
            conditions[doc_id] = label

    # All paths by id -> indexation (the trailing ';' is kept for the readers of this column)
    joined_paths = {doc_id: ";".join(paths) + ";" for doc_id, paths in dict_files.items()}

    # Creations columns: ids without any file get NaN, no cleaning step is needed afterwards
    df["Condition"] = df["Id"].map(conditions)
    df["files"] = df["Id"].map(joined_paths)

    #Test (sanity checks tied to the content of the inventory)
    assert df['Condition'].iloc[89] == 'True', "The result should be equivalent to True"
    assert df['Condition'].iloc[118] == 'Verify !', "The result should be equivalent to 'Verify !'"
    assert len(dict_files[305]) == 4, "The result should be equivalent to 4"
    assert dict_id[195] == 1, "The result should be equivalent to 1"

    #Write csv #Empty cells in 'Condition' means the file don't exist
    df.to_csv(r'data/araucania_final.csv')

In [8]:
# Run the verification on the prepared inventory (writes data/araucania_final.csv)
verification('data/araucania_prepared.csv')

## Files management

In [7]:
def make_list(column):
    """
    Function returning the different values (e.g. types of documents) found in a csv column

    The column is read from the module-level dataframe `df`.

    :column: Str, name of the column
    :return: set of the distinct values of the column (empty set for an empty column)
    """
    # Built in one pass; also works when the column has no row
    return set(df[column])

In [4]:
def list_files(column, select):
    """
    Build the lists of paths of the rows of data/araucania_final.csv matching a value

    :column: Str, name of the column used as filter
    :select: value the column must be equal to
    :return: List of lists, one list per matching row, made of the 'files' cell split on ';'
    """
    inventory = pd.read_csv('data/araucania_final.csv', delimiter=",")
    matching_rows = inventory[column] == select
    cells = inventory.loc[matching_rows, 'files'].tolist()
    # Empty cells are not strings and cannot be split: they are left out
    return [cell.split(';') for cell in cells if hasattr(cell, 'split')]

def manage_folder(theme):
    """
    Creation of directory by type, under data/JPG_order of the current folder

    Missing parent directories are created too. A message is printed only when
    the directory did not exist yet.

    :theme: name of the directory to create
    :return: None
    """
    target = os.path.join(current_folder, f"data/JPG_order/{theme}")
    if os.path.isdir(target):
        return None
    # makedirs also creates data/JPG_order on a first run
    os.makedirs(target, exist_ok=True)
    print(f'Creation directory : {theme}')
    return None

def manage_files(theme):
    """
    Files management in folder in according to the type of document

    Copies every file listed for the type into data/JPG_order/<theme>; files already
    present in the destination are reported and left untouched.

    :theme: Str, type of document (value of the 'Type' column)
    :return: None
    """
    # A type that is not a string (empty cell of the inventory) cannot name a folder
    if not isinstance(theme, str):
        print(theme)
        return None

    target_dir = os.path.join(current_folder, f"data/JPG_order/{theme}")
    manage_folder(theme)
    for files in list_files('Type', theme):
        for file in files:
            # The 'files' cells end with ';', which leaves an empty entry after the split
            if not file:
                continue
            basename = os.path.basename(file)
            if os.path.exists(os.path.join(target_dir, basename)):
                print(basename + " : the file already exists")
            else:
                shutil.copy(file, target_dir)

    #test (only meaningful once the "Notes" folder itself has been filled)
    if theme == "Notes":
        assert os.path.isfile(os.path.join(current_folder, "data/JPG_order/Notes/311_a.jpg")) == True, "File don't exist"
        assert len(os.listdir(os.path.join(current_folder, "data/JPG_order/Notes"))) == 48, "The number of files is incorrect"
    return None

In [9]:
def active_manage():
    """
    Run the files management once for every value of the 'Type' column

    :return: None
    """
    document_types = make_list('Type')
    for document_type in document_types:
        manage_files(document_type)

In [10]:
# Run the files management for every type of document
active_manage()

150_b.jpg : the file already exists
150_a.jpg : the file already exists
151_1_b.jpg : the file already exists
151_1_a.jpg : the file already exists
151_2_a.jpg : the file already exists
151_2_b.jpg : the file already exists
152_b.jpg : the file already exists
152_a.jpg : the file already exists
154_a.jpg : the file already exists
154_b.jpg : the file already exists
155_b.jpg : the file already exists
155_a.jpg : the file already exists
156_b.jpg : the file already exists
156_a.jpg : the file already exists
157_b.jpg : the file already exists
157_a.jpg : the file already exists
158_b.jpg : the file already exists
158_a.jpg : the file already exists
159_b.jpg : the file already exists
159_a.jpg : the file already exists
160_a.jpg : the file already exists
160_b.jpg : the file already exists
161_a.jpg : the file already exists
161_b.jpg : the file already exists
162_b.jpg : the file already exists
162_a.jpg : the file already exists
163_a.jpg : the file already exists
163_b.jpg : the file

## Select files (Model)

In [46]:
def list_author():
    """
    List the distinct values of the 'Author' column of data/araucania_final.csv

    :return: List built from a set (no duplicates, no guaranteed order)
    """
    inventory = pd.read_csv('data/araucania_final.csv', delimiter=",")

    authors = list(inventory['Author'])
    unique_authors = list(set(authors))

    # Sanity checks tied to the content of the inventory
    assert "Barbosa, Mauricio" in authors, "Missing authors, Barbosa, Mauricio !"
    assert len(unique_authors) == 46, "Missing authors !"
    return unique_authors

def model_files(name):

    """
    Collect the files of an author, create the author directory in data/JPG_order/data_model
    and copy the files into it.

    Nothing is done for an author with 15 files or fewer. With 16 to 20 files, all of them
    are copied. With more than 20 files, 20 distinct files are drawn at random.

    :name: Str -> name of author selected
    :return: True when the files were copied -> to may incrementation of parent function,
             None otherwise (not enough files, or a file is missing in the database)
    """
    #Cleaning: flatten the lists and drop the empty entry left by the trailing ';'
    list_choice = [file for files in list_files('Author', name) for file in files if file]

    #Selection
    if len(list_choice) <= 15:
        return None
    if len(list_choice) > 20:
        # sample draws without replacement, so the 20 files are all different
        list_choice = random.sample(list_choice, k=20)

    target_dir = os.path.join(current_folder, f"data/JPG_order/data_model/{name}")
    os.makedirs(target_dir, exist_ok=True)
    try:
        for filename in list_choice:
            shutil.copy(filename, target_dir)
    #Missing files in database
    except FileNotFoundError:
        print(f"{name} is missing")
        return None
    return True
    
def execute_model(nb_authors=7):
    """
    Execution of function model by random author selection

    Authors are visited in random order, each at most once, until `nb_authors` of them
    have had their files copied by model_files or every author has been tried.

    :nb_authors: Int, number of authors to select (7 by default)
    :return: List of the authors visited -> to do verification
    """
    manage_folder("data_model")
    # Empty 'Author' cells are not strings: they cannot be selected
    candidates = [name for name in list_author() if isinstance(name, str)]
    random.shuffle(candidates)

    selected = 0
    liste = []
    for name in candidates:
        if selected >= nb_authors:
            break
        liste.append(name)
        # An author who already has a directory was handled by a previous run
        if os.path.isdir(os.path.join(current_folder, f"data/JPG_order/data_model/{name}")):
            continue
        # Count the author only when files were really copied
        if model_files(name):
            selected += 1
    return liste

# try/except FileNotFoundError is needed because some files are missing from the database

In [49]:
# Run the random selection of authors; the returned list of names is displayed below
execute_model()

## BUG : Problem of return on the random. It seems that the selection of elements brings memory difficulties

['García Videla, Daniel',
 'Ramírez, Eleuterio',
 'Cordovez, Aniceto',
 'Capitán Gacitúa',
 'Díaz, José Del Carmen',
 'Williams, Juan',
 'García, Manuel']

In [43]:
#testing
# Manual run of the selection logic of model_files for a single author
name = "García, Manuel"
list_choice = []
for files in list_files('Author', name):
    for file in files:
        # The 'files' cells end with ';', which leaves an empty entry after the split
        if file:
            list_choice.append(file)
if len(list_choice) > 15 and len(list_choice) < 21:
    os.makedirs(os.path.join(current_folder, f"data/JPG_order/data_model/{name}"), exist_ok=True)
    for filename in list_choice:
        shutil.copy(filename, f"data/JPG_order/data_model/{name}/")
if len(list_choice) > 20:
    os.makedirs(os.path.join(current_folder, f"data/JPG_order/data_model/{name}"), exist_ok=True)
    # sample draws without replacement, so the 20 files are all different
    list_name = random.sample(list_choice, k=20)
    for filename in list_name:
        shutil.copy(filename, f"data/JPG_order/data_model/{name}/")
        