# Extract table from PDF ( if it's digital pdf file )

## <span style="color:red">Camelot n'arrive pas à extrire toutes les lignes</span>


In [1]:
import camelot
import pandas as pd

# Lecture avec Camelot
tables = camelot.read_pdf("C:/Users/Mamie/Documents/Python/DCS_Devices_tools/AMPT_Tool_01/AMPTconfig/Ampt Gen3CU.pdf", pages="all", flavor="stream")

# Fusionner toutes les tables
dfs = [t.df for t in tables]
df = pd.concat(dfs, ignore_index=True)

# Nettoyer les retours à la ligne
df = df.replace(r"\n", " ", regex=True)

# Supprimer les lignes d'en-tête "Start Offset Size ..."
df = df[~df[0].str.contains("Start", na=False)]
df = df.dropna(how="all")

# Renommer les colonnes correctement
df.columns = ["Start/Offset", "Size", "Name", "Type", "R/W", "Description"]

# Réinitialiser l’index
df = df.reset_index(drop=True)

# Affichage
print("Aperçu du tableau extrait :")
(df.head(20))


Aperçu du tableau extrait :


Unnamed: 0,Start/Offset,Size,Name,Type,R/W,Description
0,5,16,Manufacturer,string,R,A well-known value registered with SunSpec for
1,,,,,,compliance: Ampt
2,21,16,Model,string,R,Manufacturer specific value Communication Unit
3,37,8,Reserved,-,-,
4,45,8,Version,string,R,Software Version
5,53,16,Serial Number,string,R,Manufacturer specific value
6,69,1,Device Address,int16,R/W,Modbus Device ID
7,71,1,ID,uint16,R,Ampt SunSpec Vendor Code 64050
8,72,1,L,uint16,R,Variable number of 16-bit registers to follow:...
9,73,1,DCA_SF,int16,R,Current scale factor


In [2]:
# Ajouter les 3 premières lignes manuellement si manquantes
missing_rows = [
    ["1", "2", "SID", "uint32", "R", "A well-known value 0x53756e53, uniquely identifies this as a SunSpec Modbus Map"],
    ["3", "1", "ID", "uint16", "R", "A well-known value 1, uniquely identifies this as a SunSpec Common Model"],
    ["4", "1", "L", "uint16", "R", "Well-known # of 16-bit registers to follow: 66"]
]
if not df.iloc[0].astype(str).str.contains("SID").any():
    print("⚠️ Ligne SID absente → ajout manuel des 3 premières lignes")
    for row in reversed(missing_rows):
        df.loc[-1] = row  # insérer avant
        df.index = df.index + 1
    df = df.sort_index().reset_index(drop=True)
else:
    print("✅ La ligne SID est déjà présente")

⚠️ Ligne SID absente → ajout manuel des 3 premières lignes


In [7]:
# Affichage
print("Aperçu du tableau extrait :")
df

Aperçu du tableau extrait :


Unnamed: 0,Start/Offset,Size,Name,Type,R/W,Description
0,1,2,SID,uint32,R,"A well-known value 0x53756e53, uniquely identi..."
1,3,1,ID,uint16,R,"A well-known value 1, uniquely identifies this..."
2,4,1,L,uint16,R,Well-known # of 16-bit registers to follow: 66
3,5,16,Manufacturer,string,R,A well-known value registered with SunSpec for
4,,,,,,compliance: Ampt
5,21,16,Model,string,R,Manufacturer specific value Communication Unit
6,37,8,Reserved,-,-,
7,45,8,Version,string,R,Software Version
8,53,16,Serial Number,string,R,Manufacturer specific value
9,69,1,Device Address,int16,R/W,Modbus Device ID


In [4]:
# Sauvegarde
df.to_csv("Ampt_Modbus_Map.csv", index=False, encoding="utf-8")
df.to_excel("Ampt_Modbus_Map.xlsx", index=False)