In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install fpdf
!pip install pdfplumber

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=644f8000f1ee979fc57b18690a24f58cbef24b03cec23d08925d992cdf8ea996
  Stored in directory: /root/.cache/pip/wheels/65/4f/66/bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2
Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_1

In [3]:
import pandas as pd
import numpy as np
import re

### Creating a dataset

In [4]:
from fpdf import FPDF

pdf = FPDF()
pdf.add_page()
pdf.set_font("Arial", size=12)

data = [
    # Sports
    ("123", "2025-04-10", "Sports", "The event featured a thrilling race with 5 participants, lasting 2 hours."),
    ("127", "2025-04-14", "Sports", "The soccer team secured a 3-1 victory in the championship match."),
    ("128", "2025-04-15", "Sports", "A marathon was held in New York with over 10,000 runners."),
    ("129", "2025-04-16", "Sports", "The player scored a hat-trick in the final 15 minutes."),
    ("130", "2025-04-17", "Sports", "The basketball game went into overtime with a buzzer-beater win."),

    # Technology
    ("124", "2025-04-11", "Technology", "A new software update improved performance by 20%."),
    ("131", "2025-04-18", "Technology", "AI startup launched a chatbot that answers customer queries in real-time."),
    ("132", "2025-04-19", "Technology", "Quantum computing breakthrough enables faster data processing."),
    ("133", "2025-04-20", "Technology", "A tech company revealed a foldable screen device at the expo."),
    ("134", "2025-04-21", "Technology", "Cybersecurity firm reported a 50% reduction in phishing attacks."),

    # Health
    ("125", "2025-04-12", "Health", "A study showed 30% improvement in patient recovery."),
    ("135", "2025-04-22", "Health", "The hospital launched a new mental health awareness campaign."),
    ("136", "2025-04-23", "Health", "Doctors discovered a new treatment for chronic back pain."),
    ("137", "2025-04-24", "Health", "Fitness experts recommend 30 minutes of daily walking for heart health."),
    ("138", "2025-04-25", "Health", "A new vaccine trial showed 90% effectiveness against the virus."),
]

for line in data:
    text = " ".join(str(item) for item in line)
    pdf.multi_cell(0, 10, text)

pdf.output("sample_document.pdf")

''

In [5]:
import pdfplumber

text_data = []
with pdfplumber.open("sample_document.pdf") as pdf:
    for page in pdf.pages:
        text_data.append(page.extract_text())

full_text = "\n".join(text_data)
print(full_text)



123 2025-04-10 Sports The event featured a thrilling race with 5 participants, lasting 2 hours.
127 2025-04-14 Sports The soccer team secured a 3-1 victory in the championship match.
128 2025-04-15 Sports A marathon was held in New York with over 10,000 runners.
129 2025-04-16 Sports The player scored a hat-trick in the final 15 minutes.
130 2025-04-17 Sports The basketball game went into overtime with a buzzer-beater win.
124 2025-04-11 Technology A new software update improved performance by 20%.
131 2025-04-18 Technology AI startup launched a chatbot that answers customer queries in
real-time.
132 2025-04-19 Technology Quantum computing breakthrough enables faster data processing.
133 2025-04-20 Technology A tech company revealed a foldable screen device at the expo.
134 2025-04-21 Technology Cybersecurity firm reported a 50% reduction in phishing attacks.
125 2025-04-12 Health A study showed 30% improvement in patient recovery.
135 2025-04-22 Health The hospital launched a new ment

In [6]:
# Clean the text: remove extra newlines in the middle of sentences
cleaned_text = re.sub(r'\n(?=\w)', ' ', full_text)

# Updated regex pattern for this format
pattern = r"(\d{3}) (\d{4}-\d{2}-\d{2}) (Sports|Technology|Health) (.+?)(?=\d{3} \d{4}-\d{2}-\d{2}|$)"

# Extract matches
matches = re.findall(pattern, cleaned_text, flags=re.DOTALL)

# Convert to DataFrame
df = pd.DataFrame(matches, columns=["Report ID", "Date", "Category", "Content"])
df["Report ID"] = df["Report ID"].astype(int)

In [7]:
df.head()

Unnamed: 0,Report ID,Date,Category,Content
0,123,2025-04-10,Sports,The event featured a thrilling race with 5 par...
1,127,2025-04-14,Sports,The soccer team secured a 3-1 victory in the c...
2,128,2025-04-15,Sports,"A marathon was held in New York with over 10,0..."
3,129,2025-04-16,Sports,The player scored a hat-trick in the final 15 ...
4,130,2025-04-17,Sports,The basketball game went into overtime with a ...


In [8]:
import spacy
nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

df["Entities"] = df["Content"].apply(extract_entities)


In [9]:
df.head()

Unnamed: 0,Report ID,Date,Category,Content,Entities
0,123,2025-04-10,Sports,The event featured a thrilling race with 5 par...,"[(5, CARDINAL), (2 hours, TIME)]"
1,127,2025-04-14,Sports,The soccer team secured a 3-1 victory in the c...,"[(3, CARDINAL)]"
2,128,2025-04-15,Sports,"A marathon was held in New York with over 10,0...","[(New York, GPE), (over 10,000, CARDINAL)]"
3,129,2025-04-16,Sports,The player scored a hat-trick in the final 15 ...,"[(15 minutes, TIME)]"
4,130,2025-04-17,Sports,The basketball game went into overtime with a ...,[]


In [10]:
from transformers import pipeline

summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cpu


In [11]:
def generate_summary(text):
    summary = summarizer("summarize: " + text, max_length=10, min_length=5, do_sample=False)
    return summary[0]['summary_text']

df["Summary"] = df["Content"].apply(generate_summary)

In [12]:
df.head()

Unnamed: 0,Report ID,Date,Category,Content,Entities,Summary
0,123,2025-04-10,Sports,The event featured a thrilling race with 5 par...,"[(5, CARDINAL), (2 hours, TIME)]",the event featured a thrilling race with 5
1,127,2025-04-14,Sports,The soccer team secured a 3-1 victory in the c...,"[(3, CARDINAL)]",the soccer team secured a 3-1 victory
2,128,2025-04-15,Sports,"A marathon was held in New York with over 10,0...","[(New York, GPE), (over 10,000, CARDINAL)]",a marathon was held in new york with
3,129,2025-04-16,Sports,The player scored a hat-trick in the final 15 ...,"[(15 minutes, TIME)]",hat-trick in final 15 minutes
4,130,2025-04-17,Sports,The basketball game went into overtime with a ...,[],the game went into overtime with a buzz


In [13]:
df['Summary'][0]

'the event featured a thrilling race with 5'

In [14]:
df['Content'][0]

'The event featured a thrilling race with 5 participants, lasting 2 hours. '

# Using Naivey Bayes for NLP

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [16]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["Content"])

In [17]:
le = LabelEncoder()
y = le.fit_transform(df["Category"])

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [38]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [39]:
y_pred = model.predict(X_test)

In [40]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.6666666666666666


In [41]:
new_text = "A new fitness app tracked 1000 steps."
X_new = vectorizer.transform([new_text])
predicted_label = le.inverse_transform(model.predict(X_new))[0]
print("Predicted Category:", predicted_label)

Predicted Category: Health


# Better Accuracy (May possible OverFitting)

In [42]:
from sklearn.svm import LinearSVC

svm = LinearSVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))


SVM Accuracy: 1.0


In [43]:
new_text = "A new fitness app tracked 1000 steps."
X_new = vectorizer.transform([new_text])
predicted_label_svm = le.inverse_transform(svm.predict(X_new))[0]
print("Predicted Category (SVM):", predicted_label_svm)

Predicted Category (SVM): Health


In [44]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))


Logistic Regression Accuracy: 1.0


In [45]:
new_text = "A new fitness app tracked 1000 steps."
X_new = vectorizer.transform([new_text])
predicted_label_svm = le.inverse_transform(logreg.predict(X_new))[0]
print("Predicted Category (logistic):", predicted_label_svm)

Predicted Category (logistic): Health


# Models Saving

In [None]:
import joblib
import os
save_path = "/content/drive/MyDrive/PDF-DataAnalytics"

In [53]:
joblib.dump(vectorizer, os.path.join(save_path, "tfidf_vectorizer.joblib"))

['/content/drive/MyDrive/kompliancenowAI/tfidf_vectorizer.joblib']

In [47]:
joblib.dump(model, os.path.join(save_path, "naive_bayes_model.joblib"))

['/content/drive/MyDrive/kompliancenowAI/naive_bayes_model.joblib']

In [48]:
joblib.dump(svm, os.path.join(save_path, "svm.joblib"))

['/content/drive/MyDrive/kompliancenowAI/svm.joblib']

In [49]:
joblib.dump(logreg, os.path.join(save_path, "logreg.joblib"))

['/content/drive/MyDrive/kompliancenowAI/logreg.joblib']

In [58]:
joblib.dump(le, os.path.join(save_path, "label_encoder.joblib"))

['/content/drive/MyDrive/kompliancenowAI/label_encoder.joblib']

# USing pDF

In [50]:
import os
import re
import pandas as pd
import joblib
import pdfplumber


In [None]:
def extract_and_predict(pdf_path, model_dir="/content/drive/MyDrive/PDF-DataAnalytics"):
    # Step 1: Extract text from PDF
    text_data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text_data.append(page.extract_text())
    full_text = "\n".join(text_data)

    # Step 2: Clean text
    cleaned_text = re.sub(r'\n(?=\w)', ' ', full_text)

    # Step 3: Extract data using regex
    pattern = r"(\d{3}) (\d{4}-\d{2}-\d{2}) (Sports|Technology|Health) (.+?)(?=\d{3} \d{4}-\d{2}-\d{2}|$)"
    matches = re.findall(pattern, cleaned_text, flags=re.DOTALL)

    # Step 4: Create DataFrame
    df = pd.DataFrame(matches, columns=["Report ID", "Date", "Category", "Content"])
    df["Report ID"] = df["Report ID"].astype(int)

    # Step 5: Load vectorizer and transform content
    vectorizer = joblib.load(os.path.join(model_dir, "tfidf_vectorizer.joblib"))
    X = vectorizer.transform(df["Content"])

    # Step 6: Load models
    model_nb = joblib.load(os.path.join(model_dir, "naive_bayes_model.joblib"))
    model_svm = joblib.load(os.path.join(model_dir, "svm.joblib"))
    model_logreg = joblib.load(os.path.join(model_dir, "logreg.joblib"))

    # Step 7: Load LabelEncoder
    label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))

    # Step 8: Predict
    df["NaiveBayes_Pred"] = label_encoder.inverse_transform(model_nb.predict(X))
    df["SVM_Pred"] = label_encoder.inverse_transform(model_svm.predict(X))
    df["LogReg_Pred"] = label_encoder.inverse_transform(model_logreg.predict(X))

    return df


In [None]:
df_result = extract_and_predict("/content/drive/MyDrive/PDF-DataAnalytics/sample_document.pdf")
df_result



Unnamed: 0,Report ID,Date,Category,Content,NaiveBayes_Pred,SVM_Pred,LogReg_Pred
0,123,2025-04-10,Sports,The event featured a thrilling race with 5 par...,Sports,Sports,Sports
1,127,2025-04-14,Sports,The soccer team secured a 3-1 victory in the c...,Sports,Sports,Sports
2,128,2025-04-15,Sports,"A marathon was held in New York with over 10,0...",Sports,Sports,Sports
3,129,2025-04-16,Sports,The player scored a hat-trick in the final 15 ...,Sports,Sports,Sports
4,130,2025-04-17,Sports,The basketball game went into overtime with a ...,Sports,Sports,Sports
5,124,2025-04-11,Technology,A new software update improved performance by ...,Technology,Technology,Technology
6,131,2025-04-18,Technology,AI startup launched a chatbot that answers cus...,Technology,Technology,Technology
7,132,2025-04-19,Technology,Quantum computing breakthrough enables faster ...,Health,Technology,Technology
8,133,2025-04-20,Technology,A tech company revealed a foldable screen devi...,Technology,Technology,Technology
9,134,2025-04-21,Technology,Cybersecurity firm reported a 50% reduction in...,Technology,Technology,Technology
