# Coreference Resolution for Textbook Contents
> A notebook for getting textbooks from official sources, unzipping them to machine-readable formats, and resolving coreferences in each chapter

- toc: true 
- badges: false
- comments: true
- categories: [jupyter]
- author: Nirant Kasliwal and Meghana Bhange
<!-- - image: images/chart-preview.png -->

In [None]:
# hide
# Uncomment the lines below to install third-party packages used by this notebook.
# NOTE(review): spacy and neuralcoref are imported further down but are not
# listed here — confirm how they are expected to be installed.
# !pip install requests
# !pip install pydantic
# !pip install tqdm
# !pip install pdfminer.six

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to imported .py files apply without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
# hide_input
import json
from io import StringIO
from pathlib import Path
from typing import List, Union

import requests
from pydantic import BaseModel
from tqdm.notebook import tqdm

import neuralcoref
import spacy
from textbook import Book, Chapter
from textbookutils import pdf_to_text

In [None]:
def _ls(path):
    """Return every entry of the directory ``path`` as a list."""
    return list(path.iterdir())


def _pdfls(path):
    """Return the entries of the directory ``path`` whose suffix is exactly ".pdf"."""
    return [entry for entry in path.iterdir() if entry.suffix == ".pdf"]


# Attach both helpers to Path so any path object can call .ls() / .pdfls().
Path.ls = _ls
Path.pdfls = _pdfls

### Get List of Books and Download Links

In [None]:
# collapse-hide
# Name of the sheet whose rows are requested from the Stein API storage.
sheet_name = "History"
books_list = (
    f"https://api.steinhq.com/v1/storages/5fd49704f62b6004b3eb63a3/{sheet_name}"
)
# A timeout keeps the cell from hanging indefinitely if the API is unreachable.
r = requests.get(books_list, timeout=30)
# Fail here on an HTTP error instead of later, when the body is parsed as JSON.
r.raise_for_status()

In [None]:
# collapse-hide
# Decode the JSON response body and build one Book from each returned record.
book_records = json.loads(r.text)
ncert_history_books = [Book(**record) for record in book_records]

## Download and Extract all Books

In [None]:
# collapse-show
# Directories passed to each book's download() and unzip() calls.
raw_dir = "../data/raw"
extract_dir = "../data/extract"

# Process every book in turn, with a progress bar over the list.
for book in tqdm(ncert_history_books):
    book.download(raw_dir)
    book.unzip(extract_dir)

In [None]:
# Pick the first book in the list; the PDF collection step that follows reads
# its extract_to_path.
single_book = ncert_history_books[0]

In [None]:
# Collect the PDFs found one level below the book's extraction directory.
pdf_files = []
for folder in single_book.extract_to_path.ls():
    # pdfls() iterates the entry as a directory, which raises on a plain file,
    # so only descend into entries that really are directories.
    if folder.is_dir():
        pdf_files.extend(folder.pdfls())
pdf_files.sort()
pdf_files = [
    file for file in pdf_files if file.stem[-2:].isdigit()
]  # keep the chapter files, nothing else
# Bare expression so the notebook displays the final list.
pdf_files

# Using NeuralCoref by Hugging Face and spaCy

In [None]:
# Load the spaCy pipeline registered under the 'en' name.
# NOTE(review): the bare 'en' shortcut presumably relies on spaCy 2.x model
# links — confirm the installed spaCy version and which model 'en' points to.
nlp = spacy.load('en')
# Add NeuralCoref to the pipeline; the coreference cell later reads
# doc._.coref_clusters from documents processed by `nlp`.
neuralcoref.add_to_pipe(nlp)

# Get the coreferences for each PDF file

In [None]:
# Run the coreference pipeline over every chapter PDF and store the results
# in a dict keyed by the PDF's path.
coreferce_mapping_for_each_pdf = {}
for file in pdf_files:
    # Fresh in-memory text buffer per file, passed to pdf_to_text with the path.
    output_io_wrapper = StringIO()
    plain_text = pdf_to_text(file, output_io_wrapper)
    doc = nlp(plain_text)
    coreferce_mapping_for_each_pdf[file] = {
        "coref_clusters": doc._.coref_clusters,
        # NeuralCoref registers `has_coref` on Doc; `is_coref` is a Span-level
        # extension, so reading it from the Doc raised AttributeError. The
        # "is_coref" key is kept so existing consumers of this dict still work.
        "is_coref": doc._.has_coref,
        "doc": doc,
    }

In [None]:
# ncert_history_books[1].download()
# ncert_history_books[1].unzip()