In [1]:
pip install torch torchvision transformers easyocr

Collecting transformers
  Downloading transformers-4.41.1-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.8 kB ? eta -:--:--
     --------- ------------------------------ 10.2/43.8 kB ? eta -:--:--
     -------------------------- ----------- 30.7/43.8 kB 325.1 kB/s eta 0:00:01
     -------------------------------------- 43.8/43.8 kB 357.2 kB/s eta 0:00:00
Collecting easyocr
  Downloading easyocr-1.7.1-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.1-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp311-none-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp311-none-win_amd64.whl.metadata (3.9 kB)
Collecting opencv-python-headless (from easyocr)
  Downloading opencv_python_headless-4.9.0.80-cp37-abi3-win_amd64.whl.metadata (20 kB)
C

In [2]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.models import resnet50
import easyocr
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from PIL import Image

# Step 1: Define the CNN for image feature extraction
class CNNFeatureExtractor(nn.Module):
    """ResNet-50 backbone with its classification head removed.

    The network's final ``fc`` layer is replaced by ``nn.Identity`` so that
    ``forward`` returns what the backbone produces just before classification
    instead of class logits.
    """

    def __init__(self):
        super().__init__()
        # ``pretrained=True`` is deprecated in torchvision >= 0.13; ``weights``
        # names the same ImageNet checkpoint explicitly and avoids the warning.
        self.cnn = resnet50(weights="IMAGENET1K_V1")
        self.cnn.fc = nn.Identity()  # Remove the final classification layer

    def forward(self, x):
        """Return the headless backbone's output for the input batch ``x``."""
        return self.cnn(x)

# Step 2: OCR engine (EasyOCR), configured for English text
reader = easyocr.Reader(lang_list=['en'])

# Step 3: Token-classification (NER) pipeline built from a Hugging Face checkpoint
NER_CHECKPOINT = 'dslim/bert-base-NER'  # tokenizer and model come from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
ner_model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)
ner_pipeline = pipeline(task='ner', model=ner_model, tokenizer=tokenizer)

# Step 4: Define a function to process the image and extract subjects and grades

# Lazily-built CNN shared by every call, so the ResNet-50 weights are loaded only once.
_cnn_extractor = None


def _get_cnn_extractor():
    """Return the shared CNNFeatureExtractor, creating it in eval mode on first use."""
    global _cnn_extractor
    if _cnn_extractor is None:
        # eval() puts normalisation layers into inference behaviour, which is
        # what a single-image forward pass needs.
        _cnn_extractor = CNNFeatureExtractor().eval()
    return _cnn_extractor


def process_transcript(image_path, aggregation_strategy=None):
    """Run the CNN, OCR and NER steps over one transcript image.

    Args:
        image_path: Path of the image file. It is opened with PIL for the CNN
            step and also handed directly to the EasyOCR reader.
        aggregation_strategy: Optional strategy forwarded to the NER pipeline
            (for example ``"simple"``) so that word-piece fragments such as
            ``'##ade'`` are merged into whole entities. ``None`` (the default)
            calls the pipeline without extra arguments, as before.

    Returns:
        A list of ``(word, label)`` tuples, one per NER result. The list is
        empty when OCR finds no text in the image.
    """
    # Load and preprocess the image; the context manager closes the file handle
    # once the RGB copy has been made.
    with Image.open(image_path) as raw_image:
        rgb_image = raw_image.convert('RGB')
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    image_batch = transform(rgb_image).unsqueeze(0)  # add a leading batch dimension

    # Extract features using the CNN; gradients are not needed for inference.
    # NOTE(review): `features` is not consumed by the OCR/NER steps below.
    with torch.no_grad():
        features = _get_cnn_extractor()(image_batch)

    # Perform OCR to get text from the image (item[1] is the recognised string)
    ocr_result = reader.readtext(image_path)
    text = ' '.join(item[1] for item in ocr_result)
    if not text.strip():
        # Nothing was recognised, so there is nothing to tag.
        return []

    # Perform NER to extract subjects and grades
    ner_kwargs = {}
    if aggregation_strategy is not None:
        ner_kwargs['aggregation_strategy'] = aggregation_strategy
    ner_results = ner_pipeline(text, **ner_kwargs)

    # Aggregated results carry their label under 'entity_group'; per-token
    # results (the default) carry it under 'entity'.
    subjects_and_grades = [
        (result['word'], result['entity_group'] if 'entity_group' in result else result['entity'])
        for result in ner_results
    ]

    return subjects_and_grades

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Downloading detection model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Example usage: run process_transcript on a single image and print the
# (word, label) pairs it returns.
# NOTE(review): the path below is an absolute, machine-specific Windows
# location; point it at a local image before re-running this cell.
image_path = r'C:\Users\Schalk\OneDrive - Columbia Business School\Coding\000. data\transcripts\example2.jpg'  # Can be a PNG, JPG, etc.
subjects_and_grades = process_transcript(image_path)
print(subjects_and_grades)

[('Merlin', 'B-ORG'), ('Am', 'I-ORG'), ('##ade', 'I-ORG'), ('##us', 'I-ORG'), ('Arch', 'I-ORG'), ('##imed', 'I-ORG'), ('##es', 'I-ORG'), ('G', 'I-ORG'), ('##rad', 'I-ORG'), ('Big', 'B-LOC'), ('Brain', 'I-ORG'), ('Boulevard', 'I-ORG'), ('Al', 'I-ORG'), ('##ge', 'I-ORG'), ('Col', 'B-ORG'), ('Cal', 'B-MISC'), ('AJ', 'B-ORG'), ('Am', 'B-ORG'), ('##C', 'I-ORG'), ('Federal', 'B-ORG'), ('Government', 'I-ORG'), ('AJ', 'I-ORG'), ('##C', 'I-ORG'), ('E', 'B-MISC'), ('AJ', 'B-ORG'), ('E', 'B-MISC'), ('E', 'B-MISC')]
