# Parspec Data Science Interview Assignment (Part-2)

In [3]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.23.7-cp310-none-manylinux2014_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.23.7 (from PyMuPDF)
  Downloading PyMuPDFb-1.23.7-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.23.7 PyMuPDFb-1.23.7


In [3]:
import pandas as pd
import requests
import fitz  # PyMuPDF
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from flask import Flask, request, jsonify

### Data preprocessing functions

In [4]:
def extract_text_from_pdf_url(pdf_url, timeout=30, verify=False):

    """
    Download a PDF from a URL and return the text of all its pages.

    pdf_url: string (URL handle)
    timeout: seconds to wait for the server before giving up (default 30)
    verify: forwarded to requests.get; the default False keeps the original
            behaviour of skipping TLS certificate verification
    output: string (extracted text / 'Error')

    1. Requests the URL
    2. Returns 'Error' if the request fails or the status code is not 200
    3. Returns 'Error' if the payload cannot be opened as a PDF
    4. Otherwise returns the concatenated text of every page
    """

    # SECURITY NOTE(review): verify=False disables certificate checks, so the
    # downloaded bytes are not authenticated. Pass verify=True whenever the
    # host serves a valid certificate chain.
    try:
        # A timeout keeps an unresponsive server from hanging the notebook.
        response = requests.get(pdf_url, verify=verify, timeout=timeout)
    except requests.RequestException:
        # Connection failures, timeouts and malformed URLs all count as a
        # failed fetch under the documented 'Error' contract.
        return "Error"

    # Anything other than HTTP 200 is treated as "no PDF available"
    if response.status_code != 200:
        return "Error"

    try:
        # The context manager closes the document even if a page fails to parse
        with fitz.open(stream=response.content, filetype="pdf") as pdf_document:
            return "".join(page.get_text() for page in pdf_document)
    except (RuntimeError, ValueError):
        # A 200 response can still carry a non-PDF body (HTML error page,
        # empty payload); report it like any other failure.
        return "Error"

In [5]:
def clean_and_tokenize(text):

    """
    Normalise raw PDF text into a space-separated string of unique lemmas.

    text: string (raw extracted text)
    output: string of lower-case lemmas longer than two characters, each
            appearing once, in order of first occurrence ('' if none survive)
    """

    # Remove email addresses first; stripping '@...' beforehand would leave
    # the local part of every address behind as a stray token.
    text = re.sub(r'\S+@\S+', '', text)
    # Remove handles (e.g., @username)
    text = re.sub(r'@\S+', '', text)
    # Remove website addresses (dot escaped so only a literal 'www.' matches)
    text = re.sub(r'www\.\S+', '', text)
    # Remove hashtags
    text = re.sub(r'#\S+', '', text)
    # Replace bullet points with a space so neighbouring words stay separated
    text = re.sub(r'\s*[\u2022\u2023\u25E6]\s*', ' ', text)
    # Drop everything that is not an ASCII letter or whitespace. This single
    # pass also covers punctuation and digits.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stop words, then lemmatize what is left
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(token) for token in tokens if token not in stop_words)

    # Remove single/double characters
    kept = [word for word in lemmas if len(word) > 2]

    # De-duplicate while preserving first-seen order. A plain set() yields a
    # different ordering per interpreter run (string hash randomisation), which
    # would make the model input, and so the prediction, non-reproducible.
    return ' '.join(dict.fromkeys(kept))

### Load the trained BERT model

In [6]:
def load_model(device, model_dir="/content/drive/MyDrive/Parspec/"):

  """
  Load the fine-tuned BERT classifier and its tokenizer from disk.

  device: str / torch.device -> 'cpu'/'cuda'; the model is moved onto it
  model_dir: str -> directory passed to `from_pretrained` for both the model
             and the tokenizer. Defaults to the Colab Drive path the notebook
             was written against; override it when running elsewhere.
  output: (model on `device`, tokenizer)
  """

  model = BertForSequenceClassification.from_pretrained(model_dir)
  tokenizer = BertTokenizer.from_pretrained(model_dir)

  return model.to(device), tokenizer

# Pick the GPU when CUDA is available, otherwise fall back to the CPU,
# then load the model (moved to that device) together with its tokenizer.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model, tokenizer = load_model(device)

### Create the inference pipeline

In [40]:
def inference(pdf_url, model, tokenizer):
    """
    Classify the PDF behind a URL as a lighting product or not.

    pdf_url: str -> Enter the pdf url to be checked
    model -> trained BERT sequence classifier
    tokenizer -> BERT tokenizer matching the model

    1. Access the pdf url to extract text
    2. Clean the extracted text
    3. Evaluate the text on the trained BERT to fetch classification results

    output: dict with the predicted label ('Yes'/'No') and its confidence in
            percent, or a plain message string when no usable text was found
    """

    # Run the preprocessing steps: 1. Extract text from url & 2. Clean the text
    extracted_text = extract_text_from_pdf_url(pdf_url)
    if extracted_text in ("Error", ""):
        return 'Invalid URL'
    clean_text = clean_and_tokenize(extracted_text)
    if clean_text == '':
        return 'Cannot extract sufficient text for classification!'

    # Tokenize input text. Truncate to the model's positional limit: without
    # it a long document yields more tokens than the model has position
    # embeddings for, and the forward pass fails.
    max_length = getattr(getattr(model, "config", None), "max_position_embeddings", 512)
    inputs = tokenizer(clean_text, return_tensors="pt", truncation=True, max_length=max_length)

    # The tensors must be on the same device as the model (load_model may have
    # moved it to CUDA).
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Perform inference; no gradients are needed, so skip building the graph
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # Get predicted class (index 1 is mapped to 'Yes')
    predicted_class = 'Yes' if torch.argmax(logits, dim=1).item() == 1 else 'No'

    # Fetch the probability of the predicted class; softmax over the class
    # dimension, explicit dim to avoid the deprecated implicit choice
    prob = float(torch.softmax(logits, dim=1)[0].max())

    return {"Is lighting product?": predicted_class, "Confidence score(%)": round(prob * 100, 1)}

Check the inference pipeline on a user-provided URL

In [41]:
# Sample spec-sheet URL used to smoke-test the pipeline end to end;
# the cell's last expression is displayed as the result.
test_url = 'https://www.assets.signify.com/is/content/Signify/7f3dac4eae71462583e0ad41006cad73'
inference(pdf_url=test_url, model=model, tokenizer=tokenizer)

{'Is lighting product?': 'Yes', 'Confidence score(%)': 81.4}