In [1]:
import pdfplumber
import pandas as pd

def extract_table_from_pdf(pdf_path, has_header):
    data = []
    header = None
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # Extract table settings: Look for lines
            tables = page.extract_tables({
                "vertical_strategy": "lines",
                "horizontal_strategy": "lines"
            })
            
            for table in tables:
                if has_header and header is None:
                    # Only set header once, assuming the first table on the first page contains the header
                    header = table[0] + ["page"]
                    # Skip the header row after saving it
                    table_rows = table[1:]
                else:
                    if header is None:
                        header = [f"column_{i}" for i in range(len(table[0]))] + ["page"]
                    table_rows = table

                for row in table_rows:
                    row_with_page = row + [page_num]
                    data.append(row_with_page)
                
    return header, data

has_header_mapping = {1: True, 2: False, 3: False}


number = 3
has_header = has_header_mapping[number]
pdf_path = f"sample_pdf{number}.pdf"




In [2]:
column_mappings = {
    1: { "REG": "vrm", "VEHICLE TYPE": "make"},
    2: {"column_1": "vrm", "column_2": "make"},
    3: {"column_0": "vrm",
        "column_1": "vrm",
        "column_3": "make",
        "column_4": "make",
        "column_6": "model",
        "column_7": "model"
        }
}

In [3]:
header, table_data = extract_table_from_pdf(pdf_path, has_header=has_header)

df = pd.DataFrame(table_data, columns=header)

df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,page
0,,GF16 HJE,,,Vauxhall,,,Octavia S TDi,,1.0
1,DL15 XXT,,,Vauxhall,,,Astra,,,1.0
2,,CP17 YXJ,,,FORD,,,TRANSIT,,1.0
3,MM70 JBV,,,Vauxhall,,,Corsa,,,1.0
4,,GD66 OMC,,,Skoda,,,Octavia Estate,,1.0
...,...,...,...,...,...,...,...,...,...,...
234,SD17 OUF,,,Peugeot,,,Premier RS Blue HDI S/S,,,6.0
235,,SK62 UPV,,,Renault,,,Trafic,,6.0
236,MF68 UYR,,,Ford,,,Tourneo,,,6.0
237,,LM68 LXF,,,Vauxhall,,,Vivaro,,6.0


In [4]:
def update_headers(column_mappings, df, number):
    column_map = column_mappings[number]
    # Rename the columns using the provided column mapping
    df.rename(columns=column_map, inplace=True)
    
    return df

In [5]:
df = update_headers(column_mappings, df, number)
df.to_csv(f"output{number}.csv", index=False)

In [6]:
def clean_dataframe(df, column_mappings, number):

    clean_headers = list(set(column_mappings[number].values()))

    print(df)
    print(clean_headers)

    has_null = df[clean_headers].isnull().values.any()
    if has_null:
    
        # Initialize a list to hold the cleaned data
        cleaned_data = []

        # Iterate over the DataFrame rows
        for index, row in df.iterrows():
            # Initialize a dictionary to hold the non-null values for the current row
            non_null_values = {header: None for header in clean_headers}
            
            # Iterate over each header and collect the last non-null value if available
            for header in clean_headers:

                # Get all the values from the columns that were mapped to the current header
                values = row[header].dropna().tolist()
                if values:  # If there are any non-null values
                    # Assign the last non-null value to the corresponding header in the dictionary
                    non_null_values[header] = values[-1]
            
            # Append the dictionary with non-null values to the cleaned data list
            cleaned_data.append(non_null_values)

        # Create a new DataFrame using the cleaned data
        cleaned_df = pd.DataFrame(cleaned_data, columns=clean_headers)

        return cleaned_df
    
    return df

In [7]:
df_clean = clean_dataframe(df, column_mappings, number)

          vrm           vrm      column_2      make      make column_5  \
0                  GF16 HJE                          Vauxhall            
1    DL15 XXT          None          None  Vauxhall      None     None   
2                  CP17 YXJ                              FORD            
3    MM70 JBV          None          None  Vauxhall      None     None   
4                  GD66 OMC                             Skoda            
..        ...           ...           ...       ...       ...      ...   
234  SD17 OUF          None          None   Peugeot      None     None   
235                SK62 UPV                           Renault            
236  MF68 UYR          None          None      Ford      None     None   
237                LM68 LXF                          Vauxhall            
238  SF68 KLO  Ford Pro Cab  Ford Pro Cab         7      None     None   

                       model           model column_8  page  
0                              Octavia S TDi     

In [8]:
df_clean

Unnamed: 0,model,vrm,make
0,Octavia S TDi,GF16 HJE,Vauxhall
1,Astra,DL15 XXT,Vauxhall
2,TRANSIT,CP17 YXJ,FORD
3,Corsa,MM70 JBV,Vauxhall
4,Octavia Estate,GD66 OMC,Skoda
...,...,...,...
234,Premier RS Blue HDI S/S,SD17 OUF,Peugeot
235,Trafic,SK62 UPV,Renault
236,Tourneo,MF68 UYR,Ford
237,Vivaro,LM68 LXF,Vauxhall


In [9]:
df_clean.to_csv(f"output_clean{number}.csv", index=False)