# Importing the Dependencies

In [91]:
import pdfplumber
import pandas as pd
import numpy as np
import scipy as sp
import camelot
import re
import spacy
import os
# Add the Ghostscript bin folder to PATH (presumably so Camelot can find the
# Ghostscript executable -- TODO confirm it is actually required).
# The entry is appended only when it is not already present, so re-running this
# cell does not keep growing PATH with duplicate entries.
_GS_BIN_DIR = r"C:\Program Files\gs\gs10.04.0\bin"
if _GS_BIN_DIR not in os.environ.get("PATH", ""):
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + _GS_BIN_DIR
import csv

## Processing Research Papers

In [93]:
# Phase 1 : Processing research papers
# Helper function to process research papers (pdf formats) from a folder
def process_papers(folder_path):
    """Extract text and tables from every PDF directly inside ``folder_path``.

    Files are visited in sorted filename order so repeated runs give the same
    result, and the ``.pdf`` extension is matched case-insensitively so files
    such as ``Paper.PDF`` are not skipped. Sub-folders are not searched.

    Args:
        folder_path: Directory containing the PDF files.

    Returns:
        tuple: ``(all_text, all_tables)``. ``all_text`` is the text of all
        papers joined by a newline; ``all_tables`` is one flat list holding
        the tables of all papers in processing order.
    """
    paper_texts = []
    all_tables = []

    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue
        # Getting the full path of the file
        pdf_path = os.path.join(folder_path, filename)
        # Extract text and tables
        text, tables = extract_text_and_tables(pdf_path)
        paper_texts.append(text)
        all_tables.extend(tables)  # Append tables to the global list

    # Join with a newline so the last word of one paper does not fuse with the
    # first word of the next one.
    return "\n".join(paper_texts), all_tables

## Information Extraction

In [95]:
# Function to extract the text and the tables of one PDF file
def extract_text_and_tables(pdf_path):
    """Extract the text and the tables of a single PDF.

    Text is read page by page with pdfplumber; tables are read by Camelot over
    all pages at once using the ``stream`` flavor.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        tuple: ``(text, tables)``. ``text`` is the page texts joined by a
        newline (a page with no extractable text contributes an empty
        string). ``tables`` is a list of the table objects Camelot returned.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can return None for a page; substitute "".
        page_texts = [page.extract_text() or '' for page in pdf.pages]

    # Camelot is given the path rather than the open pdfplumber object, so the
    # pdfplumber handle does not need to stay open while it runs.
    # pages='all' extracts from all pages at once.
    extracted_tables = camelot.read_pdf(pdf_path, pages='all', flavor='stream')

    # Join pages with a newline so text does not fuse across page boundaries.
    return "\n".join(page_texts), list(extracted_tables)

## Inspecting Extracted Tables

In [97]:
# Function to print all tables
def get_tablesnrows(tables):
    """Print every table in ``tables`` under a numbered heading.

    Args:
        tables: Iterable of objects exposing a ``df`` DataFrame attribute.

    Returns:
        None. Each table is written to standard output as a ``Table N:``
        heading (numbered from 1), the frame rendered without its index, and
        a trailing blank line.
    """
    for number, table in enumerate(tables, start=1):
        print(f"\nTable {number}:")
        rendered = table.df.to_string(index=False)
        # The doubled newline yields the blank separator line after each table.
        print(rendered, end="\n\n")

## Table Pattern Matcher

In [99]:
# Regex filter to identify valid table headers
def filter_actual_tables(text):
    """Tell whether ``text`` starts with a table caption such as "Table 1".

    The caption may be preceded by whitespace. The word ``Table`` is
    case-sensitive and must be followed by whitespace and a positive integer
    with no leading zero. Anything after the number is ignored.

    Args:
        text: String to test.

    Returns:
        bool: True when the start of ``text`` matches, False otherwise.
    """
    header_pattern = re.compile(r'^\s*Table\s+([1-9]\d*)\s*')
    return header_pattern.match(text) is not None

## True Tables

In [101]:
# Function to print only true tables with valid headers
def get_truetables(tables):
    """Print and collect the tables whose first row starts with a table caption.

    The first row of each table is rendered as text and passed to
    ``filter_actual_tables``; tables that pass are printed and their
    DataFrame is collected. Tables with an empty DataFrame are skipped.

    Args:
        tables: Iterable of objects exposing a ``df`` DataFrame attribute.

    Returns:
        list: The DataFrames of the accepted tables, in input order.
    """
    true_tables = []
    for i, table in enumerate(tables):
        df = table.df
        # An empty frame has no first row; .iloc[0] would raise IndexError.
        if df.empty:
            continue
        first_row_str = df.iloc[0].to_string(index=False)
        if filter_actual_tables(first_row_str):
            # The printed number is the table's position in the full input
            # list, not its position among the accepted tables.
            print(f"\nTrue Table {i + 1}:")
            print(df.to_string(index=False))
            true_tables.append(df)  # Collect the valid tables
    return true_tables  # Return the list of true tables

## Saving To Excel

In [103]:
def save_tables_to_excel(tables, output_excel_path):
    """Write each table to its own sheet of a single Excel workbook.

    Args:
        tables: Sequence of DataFrames; sheet ``Table_N`` receives the N-th
            one (numbered from 1), written without the index column.
        output_excel_path: Destination path of the workbook.

    Returns:
        None. A status message is printed in every case. When ``tables`` is
        empty no file is created. Any exception raised while writing is
        caught and reported with a printed message instead of propagating.
    """
    # Nothing to write: report it and leave without creating a file.
    if not tables:
        print("No tables to save. The Excel file will not be created.")
        return

    try:
        with pd.ExcelWriter(output_excel_path) as workbook:
            for number, frame in enumerate(tables, start=1):
                # One sheet per table: Table_1, Table_2, ...
                frame.to_excel(workbook, sheet_name=f"Table_{number}", index=False)
        print(f"\nTables saved to Excel file at: {output_excel_path}")
    except Exception as error:
        print(f"Error while saving tables to Excel: {error}")


### MAIN():

In [79]:
# Assigning the folder paths
# These must be defined here: later cells read folder_path and
# output_excel_path, and fail with NameError on a fresh kernel otherwise.
# NOTE: the values are machine-specific absolute paths; edit them to point at
# your own papers folder and output workbook before running the notebook.
folder_path = "D:\\3 SEM UWIN\\INTERNSHIP SHAFAQ\\Project\\Papers"
output_excel_path = "D:\\3 SEM UWIN\\INTERNSHIP SHAFAQ\\Project\\Extracted_Tables.xlsx"  # Path for Excel output

In [105]:
# Process papers and extract data
# `text` receives the combined text and `tables` the combined list of tables
# that process_papers returns for the PDFs found in folder_path.
# NOTE(review): folder_path must already be defined by an earlier cell.
text, tables = process_papers(folder_path)

In [83]:
# Display all the text
# NOTE(review): this prints the whole combined text of every processed paper;
# consider printing only a slice (e.g. text[:2000]) to keep the output readable.
print(text)

Food Chemistry 240 (2018) 588–593
ContentslistsavailableatScienceDirect
Food Chemistry
journal homepage: www.elsevier.com/locate/foodchem
ff
E ect of processing on the in vitro and in vivo protein quality of red and
T
green lentils (Lens culinaris)
Matthew G. Nosworthya, Gerardo Medinaa, Adam J. Franczyka, Jason Neufelda, Paulyn Appahb,
Alphonsus Utiohb, Peter Frohlichc, James D. Housea,d,e,f,g,⁎
aDepartmentofFoodandHumanNutritionalSciences,UniversityofManitoba,Winnipeg,MBR3T2N2,Canada
bFoodDevelopmentCentre,PortagelaPrairie,MBR1N3J9,Canada
cCanadianInternationalGrainsInstitute,Winnipeg,MBR3C3G7,Canada
dRichardsonCentreforFunctionalFoodsandNutraceuticals,UniversityofManitoba,Winnipeg,MBR3T2N2,Canada
eDepartmentofFoodScience,UniversityofManitoba,Winnipeg,MBR3T2N2,Canada
fCanadianCentreforAgri-FoodResearchinHealthandMedicine,UniversityofManitoba,Winnipeg,MBR3T2N2,Canada
gDepartmentofAnimalScience,UniversityofManitoba,Winnipeg,MBR3T2N2,Canada
A R T I C L E I N F O A B S T R A C T
Keywords

In [85]:
# Display all tables
# Prints every extracted table, unfiltered, each under a "Table N:" heading.
get_tablesnrows(tables)


Table 1:
                             0                                                                                                                      1                                                                                                                   2
                                                                                                                                                                                                                       journal homepage: www.elsevier.com/locate/foodchem
                                                                           Eﬀect of processing on the in vitro and in vivo protein quality of red and                                                                                                                    
                                                                                                                                                                                                

In [87]:
# Displaying the True Tabular Data
# Prints a banner, then keeps only the tables whose first row starts with a
# "Table N" caption; their DataFrames are collected in `true_tables`.
print("\n\n..................GETTING TRUE TABLES..................\n")
true_tables = get_truetables(tables)



..................GETTING TRUE TABLES..................


True Table 5:
                                                                                                                                           0     1    2     3    4    5    6     7    8    9    10   11   12    13   14   15   16   17   18   19   20   21
                                                                                                                                      Table 1                                                                                                             
Proximate analysis and amino acid composition of untreated, extruded, cooked and baked red and green lentil ﬂour presented on an as-is basis.                                                                                                             
                                                                                                                                               %DMa %CFc  %CPb  ASP  THR  SER

In [89]:
# Save the true tables to an Excel file
# Each DataFrame in `true_tables` goes to its own sheet (Table_1, Table_2, ...).
# NOTE(review): output_excel_path must already be defined by an earlier cell.
save_tables_to_excel(true_tables, output_excel_path)


Tables saved to Excel file at: D:\3 SEM UWIN\INTERNSHIP SHAFAQ\Project\Extracted_Tables.xlsx
