In [1]:
import pandas as pd
import numpy as np
import pdfplumber

import time
import os
import sys
from pathlib import Path
import csv
import re


import PyPDF2



In [2]:
# Print iterations progress
def printProgressBar(iteration, total, prefix='', suffix='', decimals=1,
                     length=100, fill='█', printEnd="\r"):
    """
    Draw a one-line text progress bar on stdout; call it once per loop iteration.

    code_source :https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console

    Args:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int); 0 is shown as a full bar
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character, e.g. carriage return (Str)

    Returns:
        None. The bar is written to stdout, followed by a newline once
        ``iteration == total``.
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # An empty workload has nothing left to do: show it as complete
        # instead of dividing by zero.
        fraction = 1.0
        filledLength = length
    # Keep the bar exactly `length` characters wide even when the caller
    # passes an iteration outside the 0..total range.
    filledLength = max(0, min(length, filledLength))

    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end=printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()

In [3]:
def get_file_name(file_path):
    '''Collect the PDF files found anywhere below a directory.

    Args:
        file_path (str): root directory to search; sub-directories are included.

    Returns:
        dict: file name -> full path of every file whose extension is ``.pdf``
        (compared case-insensitively, so ``.PDF`` is accepted too). When the
        same file name occurs in several directories, only the path visited
        last is kept because the file name is the dictionary key.

    Note:
        No exception is raised for a missing directory: os.walk ignores
        listing errors by default, so the result is simply an empty dict.
    '''
    file_dir = {}
    # topdown=False walks the deepest directories first.
    for dirpath, _dirnames, files in os.walk(file_path, topdown=False):
        for file_name in files:
            if file_name.lower().endswith('.pdf'):
                file_dir[file_name] = os.path.join(dirpath, file_name)

    return file_dir

In [4]:
# Map every PDF file name found under ./well_logs (sub-folders included) to its full path.
file_dir = get_file_name('./well_logs')

In [5]:
def get_survey_path(file_dir):
    '''Select the files whose name mentions "survey" or "Survey".

    Args:
        file_dir (dict): file name -> file path.

    Returns:
        dict: matched text -> file path. The key is the part of the file name
        from its start up to and including the last "survey"/"Survey", not the
        complete file name.
    '''
    survey_regex = re.compile(r'.*[Ss]urvey')
    matched = {}
    for name, location in file_dir.items():
        hit = survey_regex.search(name)
        if hit is None:
            # Not a survey file.
            continue
        matched[hit.group(0)] = location

    return matched
            

In [6]:
# Keep only the entries whose file name contains "survey" or "Survey".
survey_path = get_survey_path(file_dir)

In [7]:
def _merge_split_rows(pdf_table):
    '''Rebuild logical records from one extracted table whose rows may be split.

    A row with every cell filled is a complete record. Partially filled rows
    are merged cell by cell into a pending record, which is closed by the next
    completely empty row or the next complete row.
    '''
    table = []
    cells = []                                   # pending, partially filled record
    for row in pdf_table:
        if not any(row):
            # A completely empty row ends the pending record.
            if any(cells):
                table.append(cells)
                cells = []
        elif all(row):
            # A completely filled row is a new record; the pending one ends first.
            if any(cells):
                table.append(cells)
                cells = []
            table.append(row)
        else:
            if len(cells) == 0:
                cells = list(row)                # copy so the source row is not mutated
            else:
                for i in range(len(row)):
                    if row[i] is not None:
                        cells[i] = row[i] if cells[i] is None else cells[i] + row[i]
    # NOTE(review): a partial record still pending when the table ends is not
    # appended - confirm whether dropping it is intended.
    return table


def get_survey_content(survey_path):
    '''Extract the tables of every survey PDF, one file at a time.

    code source:https://cloud.tencent.com/developer/article/1386517

    Args:
        survey_path (dict): name -> path of a PDF file; only the paths are used.

    Yields:
        dict: ``{path: tables}`` for the file just read, where ``tables`` holds
        one list of rows per table found on any page of that file. A new dict
        is produced for every file, so earlier files are not repeated.

    Raises:
        Whatever pdfplumber raises when a path cannot be opened or parsed.
    '''
    for file, path in survey_path.items():
        tables = []                              # every table of this file, in page order
        # The context manager closes the PDF even if extraction fails.
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                for pdf_table in page.extract_tables():
                    tables.append(_merge_split_rows(pdf_table))

        yield {path: tables}


In [8]:

def initial_preprocess(file_table, row_counts=(8, 9)):
    '''Turn the rows of the candidate tables of each file into text.

    Args:
        file_table (dict): file key -> list of tables, each table a list of rows.
        row_counts (tuple): a table is kept only when its number of rows is one
            of these values (presumably the size of the survey/projection
            tables - confirm against the PDFs).

    Returns:
        dict: the same keys -> one list per table. A kept table becomes the
        list of its rows rendered with ``np.array2string``; every other table
        becomes an empty list. An empty ``file_table`` gives an empty dict.
    '''
    preprocessed_file = {}

    try:
        for file, tables in file_table.items():
            file_tables = []
            for table in tables:
                table_rows = []
                if len(table) in row_counts:
                    for row in table:
                        # squeeze() drops length-1 axes before the row is rendered as text
                        table_rows.append(np.array2string(np.array(row).squeeze()))

                file_tables.append(table_rows)
            # Record inside the loop so every file is kept, not only the last one.
            preprocessed_file[file] = file_tables

    except ValueError as e:
        # Best effort: report the problem and return what was processed so far.
        print(e)

    return preprocessed_file

In [9]:
def second_preprocess(survey_return):
    '''Flatten the table rows of each file into one list of text lines.

    Args:
        survey_return (dict): key -> list of tables, each table a list of row
            strings. Empty tables are allowed and contribute nothing.

    Returns:
        dict: the same keys -> flat list of strings. Every row string is split
        on the two-character sequence backslash + "n" (presumably newlines that
        were escaped when the row was rendered as text - confirm) and the
        pieces are appended in order.
    '''
    file_splited_rows = {}
    for key, tables in survey_return.items():
        value_table = []
        # Walk the nested lists directly: converting them to a NumPy array
        # fails on tables of different lengths and collapses a single table.
        for table in tables:
            for row_text in table:
                value_table.extend(row_text.split('\\n'))

        file_splited_rows[key] = value_table

    return file_splited_rows

In [42]:
def get_cleaned_df(file_splited_rows):
    '''Build one DataFrame of survey rows per file that contains data.

    Args:
        file_splited_rows (dict): key -> list of text lines for that file.

    Returns:
        dict: key -> DataFrame with one record per line that starts with any
        single character followed by a digit. Single quotes are removed and the
        line is split on spaces. Files with fewer than two lines are skipped,
        because the second line is needed as the header line.

    Note:
        The header line (second line, split on spaces) is inserted as record 0
        and dropped again, so it is not used as column labels and the remaining
        index starts at 1.
    '''
    file_df = {}
    for key, value in file_splited_rows.items():
        if len(value) < 2:
            # Empty list, or no second line to serve as header.
            continue

        clean_data = [value[1].split(' ')]
        for text in value:
            if re.match(r'.\d+', text):
                clean_data.append(text.replace("'", "").split(" "))

        file_df[key] = pd.DataFrame.from_records(clean_data).drop(0, axis=0)

    return file_df

In [None]:
file_dfs = []                                                   # dicts of cleaned DataFrames keyed by PDF path

file_folder = './well_logs'                                     # folder holding the PDFs to process
pdf_files = get_file_name(file_folder)                          # every PDF below the folder
pdf_survey_files = get_survey_path(pdf_files)                   # only the survey PDFs
tot_length_app = len(pdf_survey_files)                          # progress total: one step per survey PDF
file_table = get_survey_content(pdf_survey_files)               # generator, yields once per survey PDF
for i, table in enumerate(file_table):
    preprocessed_file = initial_preprocess(table)               # keep the candidate tables as text rows
    file_splited_rows = second_preprocess(preprocessed_file)    # flatten them into text lines
    file_df = get_cleaned_df(file_splited_rows)                 # build the DataFrames
    if file_df:
        file_dfs.append(file_df)                                # keep only files that produced data
    # i is zero-based, so i + 1 files are done; the bar reaches 100% on the last file.
    printProgressBar(i + 1, tot_length_app, prefix='', suffix='', decimals=1,
                     length=100, fill='█', printEnd="\r")


 |█---------------------------------------------------------------------------------------------------| 1.8% 

In [None]:
# Display the collected results: the list of dicts holding the cleaned DataFrames.
file_dfs