# Coreference Resolution for Textbook Contents
> A notebook for downloading textbooks from official sources, extracting them to machine-readable text, and resolving coreferences in the chapter text

- toc: true 
- badges: false
- comments: true
- categories: [jupyter]
- author: Nirant Kasliwal and Meghana Bhange
<!-- - image: images/chart-preview.png -->

In [None]:
# hide
!pip install requests
!pip install pydantic
!pip install tqdm
!pip install pdfminer.six
!pip uninstall spacy 
!pip uninstall neuralcoref
!pip install spacy==2.1.0 
!pip install neuralcoref --no-binary neuralcoref
!python -m spacy download en

In [2]:
# Enable IPython's autoreload extension so that edits to imported modules are
# picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# hide_input
import json
from io import StringIO
from pathlib import Path
from typing import List, Union

import requests
from pydantic import BaseModel
from tqdm.notebook import tqdm

import neuralcoref
import spacy
from textbook import Book, Chapter
from textbookutils import pdf_to_text

100%|██████████| 40155833/40155833 [00:00<00:00, 67653288.57B/s]


In [4]:
# Convenience helpers attached to pathlib.Path for the rest of the notebook:
#   Path.ls()    -> list of every entry directly inside the directory
#   Path.pdfls() -> list of the entries whose suffix is exactly ".pdf"
Path.ls = lambda directory: [*directory.iterdir()]
Path.pdfls = lambda directory: [
    entry for entry in directory.iterdir() if entry.suffix == ".pdf"
]

### Get List of Books and Download Links

In [5]:
# collapse-hide
# Fetch the list of books (one JSON record per book) for the chosen sheet.
sheet_name = "History"
books_list = (
    f"https://api.steinhq.com/v1/storages/5fd49704f62b6004b3eb63a3/{sheet_name}"
)
# A timeout keeps the cell from hanging forever on an unresponsive server.
r = requests.get(books_list, timeout=30)
# Fail here, with the HTTP status, instead of later while parsing an error body.
r.raise_for_status()

In [6]:
# collapse-hide
# Decode the response body as JSON and build one Book from each record,
# passing the record's fields as keyword arguments.
book_records = json.loads(r.text)
ncert_history_books = [Book(**record) for record in book_records]

## Download and Extract all Books

In [7]:
# collapse-show
# For every book: download it into the raw-data directory, then unzip it into
# the extraction directory. tqdm shows progress over the book list.
raw_dir, extract_dir = "../data/raw", "../data/extract"
for book in tqdm(ncert_history_books):
    book.download(raw_dir)
    book.unzip(extract_dir)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [8]:
# Work with only the first book in the list for the rest of the notebook.
single_book = ncert_history_books[0]

In [9]:
# Gather the PDFs from every folder directly inside the book's extraction
# directory, keep only those whose stem ends in digits (the chapter files,
# nothing else), and return them in sorted order.
pdf_files = sorted(
    pdf
    for folder in single_book.extract_to_path.ls()
    for pdf in folder.pdfls()
    if pdf.stem[-2:].isdigit()
)
pdf_files

[PosixPath('../data/extract/class_6_Civics/fess3dd/fess301.pdf'),
 PosixPath('../data/extract/class_6_Civics/fess3dd/fess302.pdf'),
 PosixPath('../data/extract/class_6_Civics/fess3dd/fess303.pdf'),
 PosixPath('../data/extract/class_6_Civics/fess3dd/fess304.pdf'),
 PosixPath('../data/extract/class_6_Civics/fess3dd/fess305.pdf'),
 PosixPath('../data/extract/class_6_Civics/fess3dd/fess306.pdf'),
 PosixPath('../data/extract/class_6_Civics/fess3dd/fess307.pdf'),
 PosixPath('../data/extract/class_6_Civics/fess3dd/fess308.pdf'),
 PosixPath('../data/extract/class_6_Civics/fess3dd/fess309.pdf')]

# Using NeuralCoref by Hugging Face and spaCy

In [10]:
# Load the spaCy English model (installed earlier via `spacy download en`).
nlp = spacy.load('en')
# Register NeuralCoref on the pipeline; this is what provides the
# `doc._.coref_*` attributes read later. As the cell's last expression, the
# call's return value is also displayed as the cell output.
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7fc1aa46c9e8>

# Get the coreference for each PDF file

In [20]:
# Run the coreference pipeline over every chapter PDF and collect the results
# in a dict keyed by the PDF's path.
# NOTE(review): the saved run of this cell ended in a KeyboardInterrupt, so the
# mapping may have been only partially filled in that session.
coreferce_mapping_for_each_pdf = {}
for file in tqdm(pdf_files):
    # Fresh in-memory text buffer handed to pdf_to_text for each file.
    # NOTE(review): presumably pdf_to_text writes the extracted text into this
    # buffer and also returns it as a string — confirm in textbookutils.
    output_io_wrapper = StringIO()
    plain_text = pdf_to_text(file, output_io_wrapper)
    # Run the spaCy pipeline (with NeuralCoref attached) over the chapter text.
    doc = nlp(plain_text)
    # The entry is stored as soon as a file finishes, so results for completed
    # files survive if the loop is interrupted.
    # NOTE(review): the full `doc` object is retained for every file, which may
    # be memory-heavy — confirm it is needed downstream.
    coreferce_mapping_for_each_pdf[file] = {
        "plain_text" : plain_text,
        "doc" : doc,
        "resolved_text" : doc._.coref_resolved,
        "coreference_clusters" : doc._.coref_clusters
    }

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

KeyboardInterrupt: ignored