# Split Mueller by paragraph

## Import Files 

This notebook reads the Mueller report PDF, converts it to text, and splits that text into separate files, one per paragraph.

In [1]:
!pip install pdfminer



In [2]:
import os
import os.path as op
from tqdm import tqdm
import re 

import numpy as np
import pandas as pd

In [3]:
# Path of the input PDF file (relative to the notebook's working directory)
in_path = 'Mueller-report.pdf'

# Directory where the per-paragraph text files will be saved
out_path = 'input_data/paragraphs'


In [4]:
# Create the output directory (and any missing parents). exist_ok=True makes
# this a no-op when the directory already exists, so the cell can be re-run
# safely and there is no gap between checking and creating.
os.makedirs(out_path, exist_ok=True)

## Read in file and convert to text string using PDFMiner package


In [5]:
# Function for converting pdf to a string. It does this by reading in each page of the pdf 
# and converting to string. The final output is a string of all pages. 
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

def convert_pdf_to_txt(path):
    """Extract the text of a PDF file into a single string.

    Each page is run through a pdfminer ``TextConverter`` that writes into one
    shared in-memory buffer, so the result is the text of the processed pages
    in the order they were read.

    Parameters
    ----------
    path : str
        Path of the PDF file to read.

    Returns
    -------
    str
        The accumulated text of the processed pages.

    Raises
    ------
    OSError
        If ``path`` cannot be opened for reading.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The context manager guarantees the PDF is closed even if a page
        # fails to parse part-way through the document.
        with open(path, 'rb') as fp:
            pages = PDFPage.get_pages(
                fp,
                set(),        # empty page-number set: no page filter
                maxpages=0,   # 0: no upper limit on the number of pages
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the converter and buffer are closed below.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()




In [6]:
%%time
# Extract the text of the whole PDF into one string; %%time reports how long the cell took.
processed = convert_pdf_to_txt(in_path)

CPU times: user 1min 50s, sys: 1.64 s, total: 1min 51s
Wall time: 1min 53s


In [7]:
# Preview only the beginning of the extracted text; displaying the whole
# string would flood the notebook output.
processed[:1000]



## Split processed text by new lines to get list of paragraphs

In [8]:
# Split on the literal separator "\n\n \n\n \n\n"; each resulting chunk is treated as one paragraph.
# NOTE(review): presumably this run of blank / space-only lines is how the extracted
# text marks paragraph breaks — confirm against the extracted text.
paragraphs = processed.split("\n\n \n\n \n\n")

In [9]:
# Show only the first few chunks; the full list has thousands of entries.
paragraphs[:5]

['Report On The Investigation Into \n\nRussian Interference In The \n2016 Presidential Election \n\nSpecial Counsel Robert S. Mueller, III \n\nSubmitted Pursuant to 28 C.F.R. § 600.8(c) \n \n\n \n \n \n \n \n \n ',
 ' \n \n\n \n\n \n \n \n \n\nVolume I of II \n\nWashington, D.C. \n\nMarch 2019 \n\nU.S. Department of Justice Attorney Work Product // May Contain Material Protected Under Fed. R. Crim. P. 6(e)\x0cU.S. Department of Justice Attorney Work Product // May Contain Material Protected Under Fed. R. Crim. P. 6(e)\x0cTABLE OF CONTENTS – VOLUME I ',
 ' ',
 ' \n\n  ',
 '\xa0\n\xa0\n\xa0\n\xa0\n\nINTRODUCTION TO VOLUME I .......................................................................................................... 1 \n\xa0\nEXECUTIVE SUMMARY TO VOLUME I............................................................................................... 4 \n\xa0\nI.  THE SPECIAL COUNSEL’S INVESTIGATION ..............................................................................

In [10]:
# Total number of chunks produced by the split
len(paragraphs)

4291

## Iterate through list of paragraphs and write into text files in paragraphs directory 

In [11]:
# Write each sufficiently long paragraph to its own sequentially numbered text file.

# Chunks shorter than this many characters are skipped.
# NOTE(review): the length is measured without stripping whitespace, so chunks
# made only of spaces/newlines that reach this length are still written —
# confirm that this is intended.
min_chars = 10

# Counts written files only, so file names stay consecutive when chunks are skipped.
cnt = 0

for paragraph in paragraphs:
    if len(paragraph) < min_chars:
        continue
    out_file = op.join(out_path, 'paragraph_{:04d}.txt'.format(cnt))
    # The context manager closes the file even if the write fails. The encoding
    # is explicit because the text contains non-ASCII characters and the
    # platform default encoding may not be able to represent them.
    with open(out_file, 'w', encoding='utf-8') as out:
        out.write(paragraph)
    cnt += 1
    
