### Install Libraries

In [1]:
!pip install -Uqq pypdfium2
!pip install -Uqq tabula-py
!pip install -Uqq pdfplumber

### Load Libraries

In [2]:
import warnings
warnings.filterwarnings('ignore')

import pypdfium2 as pdfium
import pandas as pd
import numpy as np
import os.path
import tabula
from tabula.io import read_pdf
import pdfplumber


### Create functions

In [3]:
# Takes a PDF report (folder + file name) as input,
# extracts the text of every page,
# and returns the required header fields, one list per page.
    
def TeraNovaText(folder,file):
    """Extract the report header fields from every page of one PDF.

    The file at ``folder + '/' + file`` is opened with both pypdfium2 and
    pdfplumber. For each page the text is pulled from both libraries, split
    into lines, and parsed by ``_parse_report_page``. Both PDF handles are
    closed before returning, even when parsing fails.

    Parameters
    ----------
    folder : str
        Directory holding the PDF; joined to ``file`` with '/'.
    file : str
        PDF file name. It is also stored as the first field of every row.

    Returns
    -------
    list of list
        One 19-item list per page, in this order: file, Ingestion_datetime,
        City, Date_Molded, Date_Issued, Lab_Number, Location_of_Placement,
        Concrete_Supplier, Mix_ID_No, Weather, Time_Molded, Air_Content,
        Slump, Size, Required_PSI, Water_Added, Unit_Weight, Ambient_Temp,
        Concrete_Temp.

    Raises
    ------
    IndexError
        When an expected label, ':' separator or token is missing from the
        page text.
    """
    pdf_path = folder+'/'+file
    out = []

    pdf_fium = pdfium.PdfDocument(pdf_path)  # pdfium
    try:
        with pdfplumber.open(pdf_path) as pdf_plumb:  # PDF Plumber
            for i in range(len(pdf_fium)):
                # pdfium: load the page, read its whole text, then release
                # the text page and page handles right away.
                page_fium = pdf_fium[i]
                textpage_fium = page_fium.get_textpage()
                text_fium = textpage_fium.get_text_range()
                textpage_fium.close()
                page_fium.close()

                # PDF Plumber: text of the same page.
                text_plumb = pdf_plumb.pages[i].extract_text()

                # Split each text into lines using the delimiter each library emits.
                list_fium = text_fium.split('\r\n')
                list_plumb = text_plumb.split('\n')

                out.append(_parse_report_page(file, list_fium, list_plumb))
    finally:
        # Always release the pdfium document, even if a page failed to parse.
        pdf_fium.close()

    return out


def _joined_matches(lines, marker):
    """Concatenate, in order, every entry of ``lines`` that contains ``marker``."""
    return ''.join(line for line in lines if marker in line)


def _value_after_colon(lines, marker):
    """Return the stripped text between the first and second ':' of the line(s) containing ``marker``."""
    return _joined_matches(lines, marker).split(':')[1].strip()


def _short_or_second_token(segment):
    """Return the stripped segment, or its second space-separated token when it is longer than 5 characters."""
    if len(segment) > 5:
        return segment.split(' ')[1].strip()
    return segment.strip()


def _parse_report_page(file, list_fium, list_plumb):
    """Build the 19-field row for one page from its pdfium and pdfplumber text lines."""
    # Data ingestion timestamp.
    # NOTE(review): %I is the 12-hour clock and no AM/PM marker is written,
    # so morning and afternoon runs are indistinguishable - confirm whether
    # %H was intended before changing the stored format.
    Ingestion_datetime = pd.to_datetime('today').strftime("%d/%m/%Y %I:%M:%S")

    # City: last word before the first comma on the 4th pdfplumber line.
    City = list_plumb[3].split(',')[0].split(' ')[-1]

    # The "DATE MOLDED" line supplies the molded date (3rd token) and the
    # lab number (6th token).
    date_molded_tokens = _joined_matches(list_fium, "DATE MOLDED").split(' ')
    Date_Molded = date_molded_tokens[2]
    Date_Issued = _value_after_colon(list_fium, "DATE ISSUED")
    Lab_Number = date_molded_tokens[5]

    # LOCATION OF PLACEMENT: when the ':' is missing, the value is
    # everything after the three label words.
    loc = _joined_matches(list_fium, "LOCATION OF PLACEMENT")
    if ':' in loc:
        Location_of_Placement = loc.split(':')[1].strip()
    else:
        Location_of_Placement = ' '.join(loc.split(' ')[3:]).strip()

    Concrete_Supplier = _value_after_colon(list_fium, "CONCRETE SUPPLIER")
    Mix_ID_No = _value_after_colon(list_fium, "MIX ID NO")

    # WEATHER: fall back to the label variant with a stray space ("W EATHER").
    if any("WEATHER" in line for line in list_fium):
        weather_marker = "WEATHER"
    else:
        weather_marker = "W EATHER"
    Weather = _value_after_colon(list_fium, weather_marker)

    # TIME MOLDED is read from the pdfplumber text: 3rd and 4th tokens.
    time_tokens = _joined_matches(list_plumb, "TIME MOLDED").split(' ')
    Time_Molded = time_tokens[2] + ' ' + time_tokens[3]

    Air_Content = _value_after_colon(list_fium, "AIR CONTENT")

    # SLUMP(IN)
    Slump = _short_or_second_token(_joined_matches(list_fium, "SLUMP(IN)").split(':')[1])

    # SIZE & REQUIRED PSI, both taken from the "NO. SUBMITTED" line.
    # The length thresholds and slice offsets below are carried over
    # unchanged; presumably they match the report layouts seen so far.
    size_psi_parts = _joined_matches(list_fium, "NO. SUBMITTED").split(':')
    size_psi = size_psi_parts[2].strip()
    if len(size_psi_parts[1]) < 8:
        Size = size_psi[0:5]
        if len(size_psi) > 18:
            Required_PSI = size_psi[25:].strip()
        else:
            Required_PSI = size_psi_parts[3].strip()
    else:
        Size = size_psi_parts[1].strip()[6:12].strip()
        Required_PSI = size_psi

    # WATER ADDED(GALS)
    Water_Added = _short_or_second_token(_joined_matches(list_fium, "GALS").split(':')[1])

    Unit_Weight = _value_after_colon(list_fium, "PCF")
    Ambient_Temp = _value_after_colon(list_fium, "AMBIENT TEMP")
    Concrete_Temp = _value_after_colon(list_fium, "CONCRETE TEMP")

    return [file,Ingestion_datetime,City,Date_Molded,Date_Issued,Lab_Number,Location_of_Placement
            ,Concrete_Supplier,Mix_ID_No,Weather,Time_Molded,Air_Content,Slump,Size
            ,Required_PSI,Water_Added,Unit_Weight,Ambient_Temp,Concrete_Temp]



In [4]:
# Takes a PDF report and the per-page text results as input,
# reads the PDF's tables into a DataFrame,
# and returns the test results as a DataFrame tagged with lab number and issue date.

def TeraNovaPdf2Table(folder,file,result):
    """Extract the compression-test result rows of one PDF report.

    All tables that tabula finds in ``folder + '/' + file`` are stacked, the
    result text is taken from the 4th column, caption rows are removed, and
    each remaining row is split into four fields. The rows are then divided
    as evenly as possible, in order, among the pages described by ``result``,
    and every row is tagged with its page's lab number and issue date.

    Parameters
    ----------
    folder : str
        Directory holding the PDF; joined to ``file`` with '/'.
    file : str
        PDF file name.
    result : list of list
        Per-page rows as returned by ``TeraNovaText``. Only item 4
        (Date_Issued) and item 5 (Lab_Number) of each row are read.

    Returns
    -------
    pandas.DataFrame
        Columns ``Lab_Number, Date_Issued, Age_Days, Date_Tested, Unit_load``
        with a fresh 0..n-1 index. An empty DataFrame when ``result`` is empty.
    """
    readpdf2df = tabula.io.read_pdf(folder+'/'+file
                                    , pages='all'
                                    , multiple_tables=True)

    df = pd.concat(readpdf2df)

    # The result text sits in the 4th column; the first row of the stacked
    # tables and the last non-empty entry are discarded.
    age_load_read = df.iloc[1:,3].dropna().reset_index().drop(columns=['index']).iloc[:-1,]

    # Remove caption/header rows (any row mentioning one of these words).
    is_caption = age_load_read.iloc[:,0].str.contains('diameter|compression|age', case=False, na=False)
    age_load_read = age_load_read[~is_caption]

    # Extra-space check: a row with more than four tokens has its 2nd and
    # 3rd tokens glued back together until exactly four fields remain.
    rows = []
    for tokens in age_load_read.iloc[:,0].str.split(' ').tolist():
        tokens = list(tokens)
        while len(tokens) > 4:
            tokens[1] = tokens[1] + tokens[2]
            tokens.pop(2)
        rows.append(tokens)
    age_load_read = pd.DataFrame(rows, columns =['Age_Days','Date_Tested','Total_load', 'Unit_load'])

    if not result:
        return pd.DataFrame()

    # Divide the rows among the pages the way np.array_split would:
    # the first `extra` pages receive one row more than the others.
    base, extra = divmod(len(age_load_read), len(result))

    page_frames = []
    start = 0
    for i, page in enumerate(result):
        stop = start + base + (1 if i < extra else 0)

        # Use this page's own slice of rows. Previously every page was
        # joined against the first rows of the whole table.
        page_df = (age_load_read.iloc[start:stop]
                   .drop(columns=['Total_load'])
                   .reset_index(drop=True))
        page_df.insert(0, 'Lab_Number', page[5])
        page_df.insert(1, 'Date_Issued', page[4])

        page_frames.append(page_df)
        start = stop

    # A single concat replaces repeated DataFrame.append (removed in pandas 2.0).
    return pd.concat(page_frames, ignore_index=True)
    
    
    

### Main Function. Make sure to change the path of the test folder before running it.

In [5]:
# Driver: parse every PDF in the test folder and assemble three tables:
#   df_log  - one row per PDF read (directory, folder, file, read time)
#   df_page - one row per PDF page with the header fields from TeraNovaText
#   pdf_df  - the test-result rows from TeraNovaPdf2Table
log = []
df_text = []
pdf_frames = []  # per-file result frames, concatenated once after the loop

# The working directory is set to the parent of the PDF folder: the parsing
# functions below are given only the folder's base name, so they resolve the
# PDFs relative to it.
os.chdir('/Users/prateek/Documents/Captsone Project/Test Folder')
path = '/Users/prateek/Documents/Captsone Project/Test Folder/sample tera nova test'

# These do not change per file, so compute them once.
folder = os.path.basename(path)
dir_path = os.path.dirname(path)

# Sorted so the output row order does not depend on the OS listing order.
for file in sorted(os.listdir(path+'/')):
    if not file.endswith('.pdf'):
        continue

    read_datetime = pd.to_datetime('today').strftime("%d/%m/%Y %I:%M:%S")
    log.append((dir_path,folder,file,read_datetime))  # Row for the log DF

    result = TeraNovaText(folder,file)  # Header fields, one list per page
    df_text.extend(result)

    df_out = TeraNovaPdf2Table(folder,file,result)  # Test results of this file
    pdf_frames.append(df_out)

# A single concat replaces repeated DataFrame.append (removed in pandas 2.0).
pdf_df = pd.concat(pdf_frames, ignore_index=True) if pdf_frames else pd.DataFrame()

# Log DataFrame
df_log = pd.DataFrame(log, columns=['Directory','Folder','File', 'Read_DateTime'])

# DataFrame with the text fields
df_page = pd.DataFrame(df_text, columns=['File_Name',
                     'Ingestion_datetime',
                     'City',
                     'Date_Molded',
                     'Date_Issued',
                     'Lab_Number',
                     'Location_of_Placement',
                     'Concrete_Supplier',
                     'Mix_ID_No',
                     'Weather',
                     'Time_Molded',
                     'Air_Content(%)',
                     'Slump(in)',
                     'Size',
                     'Required_PSI',
                     'Water_Added(GALS)',
                     'Unit_Weight(PCF)',
                     'Ambient_Temp(F)',
                     'Concrete_Temp(F)'])




In [6]:
#log

In [7]:
#result

In [8]:
#df_text

In [9]:
#df_log

In [10]:
#df_page

In [12]:
#pdf_df

### Save the Data in Excel Files

In [13]:
# Write each table to its own workbook in the current working directory,
# without the index column.
excel_outputs = [
    ('COMPRESSION_TEST_SPECIMENS_REPORT.xlsx', df_page),
    ('COMPRESSION_TEST_RESULTS.xlsx', pdf_df),
    ('INGESTION_LOG.xlsx', df_log),
]
for workbook_name, frame in excel_outputs:
    frame.to_excel(workbook_name, index=False)

print("Success!!")

Success!!
