In [49]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [50]:
BASE_DIR = Path().resolve().parent
DATA_PATH = BASE_DIR / "data" / "labelled" / "features_labeled.csv"
PROCESSED_DIR = BASE_DIR / "data" / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

In [51]:
# Load Data

df = pd.read_csv(DATA_PATH)

if "Id" in df.columns:
    df = df.drop(columns=["Id"])

print("Initial Shape:", df.shape)
print("Sample view:")
print(df.head)

Initial Shape: (728, 16)
Sample view:
<bound method NDFrame.head of       type  page_number                                            content  \
0     Text            1                                    General Summary   
1    Image            1                                    page1_img1.jpeg   
2     Text            1                          Nier Building Inspections   
3     Text            1          Master Code Professional - # 5169207- MCP   
4     Text            1                    ICC Fire Inspector (Commercial)   
..     ...          ...                                                ...   
723   Text           34  specific benefit of the customer(s), secondary...   
724   Text           34  inspection to meet their specific needs and to...   
725   Text           34  Prepared Using HomeGauge http://www.HomeGauge....   
726   Text           34                                   176 Ormond St SE   
727   Text           34                                      Page 34 of 34

In [52]:
# Drop Noise Columns

drop_cols = ["page_number", "content", "type", "contains_numbering"]
df = df.drop(columns=[col for col in drop_cols if col in df.columns])
print("Shape after dropping noise cols:", df.shape)
print("Sample view:")
print(df.head)

Shape after dropping noise cols: (728, 12)
Sample view:
<bound method NDFrame.head of      font_size  font_size_relative  is_bold  uppercase_ratio  text_length  \
0         18.0            1.709285        1         0.142857           15   
1          0.0            0.000000        0         0.000000            0   
2         12.0            1.139524        1         0.130435           25   
3         12.0            1.139524        1         0.240000           41   
4         12.0            1.139524        1         0.230769           31   
..         ...                 ...      ...              ...          ...   
723       10.0            0.949603        0         0.000000          121   
724       10.0            0.949603        0         0.000000           99   
725        9.0            0.854643        0         0.174603           79   
726       12.0            1.139524        1         0.400000           16   
727       12.0            1.139524        0         0.166667       

In [53]:
# Fill numeric NaN with median

num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for col in num_cols:
    df[col] = df[col].fillna(0)

In [54]:
# ---------------------------------------
# Min-Max Scaling
# ---------------------------------------

scaler = MinMaxScaler()

df[num_cols] = scaler.fit_transform(df[num_cols])

print("\nAfter Scaling:")
print(df.head())


After Scaling:
   font_size  font_size_relative  is_bold  uppercase_ratio  text_length  \
0   1.000000            1.000000      1.0         0.142857     0.116279   
1   0.000000            0.000000      0.0         0.000000     0.000000   
2   0.666667            0.666667      1.0         0.130435     0.193798   
3   0.666667            0.666667      1.0         0.240000     0.317829   
4   0.666667            0.666667      1.0         0.230769     0.240310   

   y_position  block_width  image_area  table_rows  table_columns       label  \
0    0.060040     0.274438       0.000         0.0            0.0     Heading   
1    0.111360     0.192120       0.225         0.0            0.0       Image   
2    0.234927     0.257510       0.000         0.0            0.0  Subheading   
3    0.271704     0.443922       0.000         0.0            0.0  Subheading   
4    0.290726     0.781008       0.000         0.0            0.0  Subheading   

   image_aspect_ratio  
0                 0.0 

In [55]:
# Save

OUTPUT_PATH = PROCESSED_DIR / "pdf_featured_processed_dataset.csv"
df.to_csv(OUTPUT_PATH, index=False)

print(f"\n✔ Saved to: {OUTPUT_PATH}")
print(f"   Final Shape: {df.shape}")



✔ Saved to: /home/ocs-22/Documents/pdf-structure-analyze-model/data/processed/pdf_featured_processed_dataset.csv
   Final Shape: (728, 12)
