In [2]:
!pip install pdfminer.six
!pip install fitz
!pip install tools


Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20250506
Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting configobj (from fitz)
  Downloading configobj-5.0.9-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting configparser (from fitz)
  Downloading configparser-7.2.0-py3-none-any.whl.metadata (5.5 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.3-py3-none-any.whl.metadata (5.4 kB)
Collecting prov>=1.5.2 (from nipype->fitz)
  Downloading prov-2.1.1-py3-none-any.whl.metadata (3.7 kB)
Collecting rdflib>=5.0.0 (from ni

In [3]:
from pdfminer.high_level import extract_text
from pathlib import Path

def extract_text_from_pdf(pdf_path):
    try:
        text = extract_text(pdf_path)
        return text.strip()
    except Exception as e:
        print(f"❌ Error reading {pdf_path}: {e}")
        return ""


In [None]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

def extract_first_page(pdf_path):
    try:
        pages = extract_pages(pdf_path)
        texts = []
        for i, page_layout in enumerate(pages):
            if i >= 1: break  
            for element in page_layout:
                if isinstance(element, LTTextContainer):
                    texts.append(element.get_text())
        return ''.join(texts).strip()
    except Exception as e:
        print(f"❌ Failed to extract: {e}")
        return ""


In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from pdfminer.high_level import extract_text
import pandas as pd
from tqdm import tqdm

def extract_text_from_pdf(pdf_path):
    try:
        text = extract_text(pdf_path)
        return {"pdf_name": pdf_path.name, "text": text}
    except Exception as e:
        return {"pdf_name": pdf_path.name, "text": "", "error": str(e)}

pdf_dir = Path("/kaggle/input/mitr-phol-gen-ai-hackathon/train_docs")
pdf_files = list(pdf_dir.glob("*.pdf"))

results = []
with ThreadPoolExecutor(max_workers=8) as executor: 
    futures = [executor.submit(extract_text_from_pdf, path) for path in pdf_files]
    for future in tqdm(as_completed(futures), total=len(futures)):
        results.append(future.result())

train_df = pd.DataFrame(results)


100%|██████████| 1509/1509 [05:01<00:00,  5.00it/s]


In [None]:

label_df = pd.read_csv("/kaggle/input/mitr-phol-gen-ai-hackathon/train.csv")


train_df["pdf_name"] = train_df["pdf_name"].str.strip().str.lower()
label_df["pdf_name"] = label_df["pdf_name"].str.strip().str.lower()


df = train_df.merge(label_df[["pdf_name", "result"]], on="pdf_name", how="left")

print(df.head())
print("จำนวน record ทั้งหมด:", len(df))
print("จำนวนที่มี label หายไป (NaN):", df["result"].isna().sum())




   pdf_name                                               text          result
0   207.pdf  ่เลม   ๑๓๖   ตอนพิเศษ   ๒๐๑    ง\n\n้หนา   ๑๔๒...  GENERAL_ACTION
1  1515.pdf  ่เลม   ๑๔๐   ตอนพิเศษ   ๑๑๐    ง\n\n้หนา   ๑๗\...     GENERAL_NON
2   785.pdf  ่เลม   ๑๓๘   ตอนพิเศษ   ๑๕๓    ง\n\n้หนา   ๖๔\...      UST_ACTION
3  2059.pdf  เลม   ๑๔๑ ตอนพิเศษ   ๒๔๘    ง\n\nหนา  ๓๑\nรา...        SSHE_NON
4  1217.pdf  ่เลม   ๑๓๙   ตอนพิเศษ   ๑๔๕    ง\n\n้หนา   ๑๐\...      TIS_ACTION
จำนวน record ทั้งหมด: 1509
จำนวนที่มี label หายไป (NaN): 0


In [None]:

import pandas as pd
import numpy as np
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score, accuracy_score
from pdfminer.high_level import extract_text


df["text"] = df["text"].fillna("")
df["result"] = df["result"].astype(str)


print(df["result"].value_counts().head())

result
GENERAL_ACTION    368
HR_ACTION         278
FINANCE_ACTION    189
GENERAL_NON       116
SSHE_ACTION       115
Name: count, dtype: int64


In [8]:
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        analyzer="char_wb",
        ngram_range=(3,5),
        min_df=2,
        max_features=200_000,
        lowercase=False,
    )),
    ("clf", LinearSVC(C=1.0))
])

In [None]:

X = df["text"].values
y = df["result"].values

import pandas as pd
from sklearn.model_selection import train_test_split


df["text"] = df["text"].fillna("")
df["result"] = df["result"].astype(str)


vc = df["result"].value_counts()
rare_labels = vc[vc < 2].index.tolist()

df_rare = df[df["result"].isin(rare_labels)]
df_main = df[~df["result"].isin(rare_labels)]

print("rare labels:", rare_labels)
print("df_rare:", len(df_rare), "rows | df_main:", len(df_main), "rows")


if len(df_main) > 0 and (df_main["result"].value_counts().min() >= 2):
    X_tr, X_va, y_tr, y_va = train_test_split(
        df_main["text"].values, df_main["result"].values,
        test_size=0.15, random_state=42, stratify=df_main["result"].values
    )
else:

    X_tr, X_va, y_tr, y_va = train_test_split(
        df["text"].values, df["result"].values,
        test_size=0.15, random_state=42, shuffle=True, stratify=None
    )
    df_rare = pd.DataFrame() 


import numpy as np
if len(df_rare) > 0:
    X_tr = np.concatenate([X_tr, df_rare["text"].values])
    y_tr = np.concatenate([y_tr, df_rare["result"].values])

print("Train size:", len(X_tr), "| Valid size:", len(X_va))


pipe.fit(X_tr, y_tr)
pred_va = pipe.predict(X_va)

print("\nAccuracy:", accuracy_score(y_va, pred_va))
print("Macro F1:", f1_score(y_va, pred_va, average="macro"))
print(classification_report(y_va, pred_va)[:1500])  


rare labels: ['IT_ACTION', 'ETHANOL_NON']
df_rare: 2 rows | df_main: 1507 rows
Train size: 1282 | Valid size: 227

Accuracy: 0.7136563876651982
Macro F1: 0.5443832842986224
                 precision    recall  f1-score   support

        AAW_NON       0.00      0.00      0.00         1
 BIOCHEM_ACTION       0.50      0.50      0.50         2
CONSUMER_ACTION       0.67      1.00      0.80         2
   CONSUMER_NON       0.00      0.00      0.00         1
 ETHANOL_ACTION       0.00      0.00      0.00         1
 FINANCE_ACTION       0.69      0.83      0.75        29
    FINANCE_NON       0.50      0.50      0.50         2
 GENERAL_ACTION       0.65      0.62      0.64        55
    GENERAL_NON       0.69      0.61      0.65        18
      HR_ACTION       0.84      0.98      0.90        42
         HR_NON       1.00      0.25      0.40         4
     MKT_ACTION       0.00      0.00      0.00         1
     PNP_ACTION       0.00      0.00      0.00         2
   POWER_ACTION       1.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
pipe.fit(X, y)

In [11]:
test_pdf_dir = Path("/kaggle/input/mitr-phol-gen-ai-hackathon/test_docs")
test_files = list(test_pdf_dir.glob("*.pdf"))

def read_pdf_text(p):
    try:
        return {"pdf_name": p.name, "text": extract_text(p)}
    except Exception as e:
        return {"pdf_name": p.name, "text": "", "error": str(e)}

test_rows = []
with ThreadPoolExecutor(max_workers=8) as ex:
    futs = [ex.submit(read_pdf_text, p) for p in test_files]
    for f in tqdm(as_completed(futs), total=len(futs)):
        test_rows.append(f.result())

test_df = pd.DataFrame(test_rows)
test_df["pdf_name"] = test_df["pdf_name"].str.strip().str.lower()
test_df["text"] = test_df["text"].fillna("")

100%|██████████| 1007/1007 [03:17<00:00,  5.10it/s]


In [None]:
test_df["result"] = pipe.predict(test_df["text"].values)

submission = test_df[["pdf_name", "result"]].copy()

submission = submission.sort_values("pdf_name")
submission.to_csv("submission.csv", index=False)
print("✅ Saved submission.csv with", len(submission), "rows")
submission.head

✅ Saved submission.csv with 1007 rows


<bound method NDFrame.head of      pdf_name          result
506   100.pdf  FINANCE_ACTION
230  1000.pdf  FINANCE_ACTION
668  1001.pdf  FINANCE_ACTION
959  1002.pdf     SSHE_ACTION
888  1005.pdf       HR_ACTION
..        ...             ...
710   987.pdf     GENERAL_NON
399   989.pdf     GENERAL_NON
837   991.pdf     GENERAL_NON
916   994.pdf  GENERAL_ACTION
693   999.pdf    POWER_ACTION

[1007 rows x 2 columns]>