In [9]:
import json
import pdfplumber

try:
    with open('data/web_data.json', 'r') as f:
        project_data = json.load(f)
except FileNotFoundError:
    raise SystemExit('data/web_data.json not found, run the scraping notebook first!')

test_project = project_data['walking-and-cycling-changes-oval-elephant-and-castle']
document_text_pdf = []
for document in test_project['documents']:
    with pdfplumber.open(document) as pdf:
        for page in pdf.pages:
            text = page.extract_text_simple()
            document_text_pdf.append(text)

document_text_pdf[:5]


['D\nA\nO\nR\nE\nG\nD\nRI\nB\nT\nARK TREE\nW S\nSOUTH GH\nHI\nH\nBAYLISROADWATERLOOROAD BLACKFRIARSROAD GREATSUFFOLK STREET BOROUG LONG\nLANE\nSRitver ThHames.Ca oTshnhMpBoETiHdmt a PAaLaA lCnEsK  R’OgAeDenAsrnc hoPibnainsrkhgo Ntp’osenw  PRiOnaADrgkt oRno aBdRuOADKENtNINGTtONs WLAEMSTBMETIHN S   ILT W oERnROM dA BoDRnSCSIDtoSa.GTut .GhE t Gh ee ERwdoODrrRaagrGlekE’s’S  ROADLONDON  ROADBOROUGH    NN EE WWRIINNOGGTTAOONND    CCAAUUSSEEWWAAYY HARTPRERIN  RIOTAYD  ST GREATDOVERSTREET\nLA LAMBETH  BROOK  DRIVE The CaOSWsIN  STtle ElephCaanstt l&e NEW  KENT  ROAD\nALBERTEMBANKMENT DAOR  NOTGNINNEK LeiDsANuTrEe  RC OCSAhteDu.n rMtcrhaeryyaN’ErsWdINGTON  BUSTT WALWORTH  R HEYGATE  ST RROOADDNEY\nO\nBLACK  PRINCE  ROAD KENNINGTONO  LCTALHNOEESLLEO PEN AD\nHXUAV Poaf rSisth.  MCahruyTrOcNh  PLACE\nHARLEYFOSTRDUDRHA M ROAKDENALLN  STRIEENTGTVOAUXNHALL   S TLKAENNNEINGTON NNIENKGTON ROAPDostRACSVKETiENtARNNoyINEr fSG&tE TDL OT SNoGO c PnNuAhRdPiKol Ao KdRoOEnRsAKlNDKE N NPWINLNAGAICTNYEOGNS BRAGA

In [10]:
from docx import Document

test_project = project_data['30-205-bus-proposals']
document_text_docx = []
for document in test_project['documents']:
    if document.split('.')[-1] == 'docx':
        doc = Document(document)
        for paragraph in doc.paragraphs:
            document_text_docx.append(paragraph.text)
        
document_text_docx[:5]

['Proposed changes to bus routes 30 and 205',
 '',
 'Tell us your views',
 'We are holding a six week public consultation to hear what you think about these proposals. We want to know if you agree with them, if there is anything you do not agree with, and to understand the reasons why you feel this way.',
 '']

In [12]:
try:
    with open('data/text_data.json', 'r') as f:
            project_data = json.load(f)
except:
    print('no text_data.json found, searching for web_data.json instead...')
    try:
        with open('data/web_data.json', 'r') as f:
            project_data = json.load(f)
    except:
        print('no web_data.json found, run the scraping notebook first!')
        pass



for pname, pdata in project_data.items():
    if 'doc_texts' in pdata and pdata['doc_texts']:
        print(f'doc_texts found for {pname}, skipping...')
        continue
    project_data[pname]['doc_texts'] = []
    for document in pdata['documents']:
        ext = document.split('.')[-1]
        try:
            if ext == 'pdf':
                with pdfplumber.open(document) as pdf:
                    doc_text_pdf = ''
                    for page in pdf.pages:
                        text = page.extract_text_simple()
                        doc_text_pdf += text
                project_data[pname]['doc_texts'].append(doc_text_pdf)
            elif ext == 'docx':
                doc = Document(document)
                text = '\n'.join(p.text for p in doc.paragraphs if p.text.strip())
                project_data[pname]['doc_texts'].append(text)
            print(f'converted {ext} text for {document}')
        except:
            continue

    with open('data/text_data.json','w') as f:
        json.dump(project_data,f)
    
with open('data/text_data.json','w') as f:
    json.dump(project_data,f)
print('text data saved under data/text_data.json')
             


doc_texts found for lowering-speed-limits, skipping...
doc_texts found for bus-routes-287-687, skipping...
doc_texts found for merantun-way, skipping...
doc_texts found for a232-healthy-streets, skipping...
doc_texts found for earls-court-road-works, skipping...
doc_texts found for lsa, skipping...
doc_texts found for congestion-charge-proposals, skipping...
doc_texts found for bilton-way, skipping...
doc_texts found for a21-hastings-road, skipping...
doc_texts found for superloop-expansion, skipping...
doc_texts found for cycleway-c62, skipping...
doc_texts found for bl1-superloop, skipping...
doc_texts found for 654-bus-route, skipping...
doc_texts found for east-putney-improvements, skipping...
doc_texts found for 319-weekend-night-service, skipping...
doc_texts found for shoreditch, skipping...
doc_texts found for bus-routes-346-347-497, skipping...
doc_texts found for crossing-stonecot-hill, skipping...
doc_texts found for a24-morden-road, skipping...
doc_texts found for a21-broml

In [13]:
import pandas as pd
import matplotlib.pyplot as plt

pnames = []
doc_names = []
doc_lengths = []
doc_texts = []
for pname, pdata in project_data.items():
    pnames.append(pname)
    doc_names.append(f'{pname}-text')
    doc_texts.append(pdata['project_text'])
    doc_length = len(pdata['project_text'])
    doc_lengths.append(doc_length)
    for i, document in enumerate(pdata['doc_texts']):    
        doc_name = pdata['documents'][i].split('/')[-1]
        pnames.append(pname)
        doc_names.append(doc_name)
        doc_length = len(document.split())
        doc_lengths.append(doc_length)
        doc_texts.append(pdata['doc_texts'][i])
    

doc_df = pd.DataFrame({'project':pnames,'document':doc_names,'doc_length':doc_lengths, 'doc_text':doc_texts})
print(doc_df.head(20))

                         project                           document  \
0          lowering-speed-limits         lowering-speed-limits-text   
1          lowering-speed-limits       lowering-speed-limits_0.html   
2          lowering-speed-limits        lowering-speed-limits_1.pdf   
3                   capital-call                  capital-call-text   
4        youth-panel-application       youth-panel-application-text   
5             bus-routes-287-687            bus-routes-287-687-text   
6             bus-routes-287-687           bus-routes-287-687_0.pdf   
7             bus-routes-287-687          bus-routes-287-687_1.docx   
8             bus-routes-287-687           bus-routes-287-687_2.pdf   
9             bus-routes-287-687          bus-routes-287-687_3.docx   
10                 673-bus-route                 673-bus-route-text   
11               a1-archway-road               a1-archway-road-text   
12     a10-lincoln-road-junction     a10-lincoln-road-junction-text   
13    

In [14]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=10)

doc_df['doc_chunks'] = doc_df['doc_text'].apply(lambda x: text_splitter.split_text(x))
doc_df['num_chunks'] = doc_df['doc_chunks'].apply(len)

In [15]:
doc_df = doc_df[doc_df['num_chunks'] > 0]
doc_df['num_chunks'].describe()

count    1453.000000
mean       50.734343
std       108.860475
min         1.000000
25%         6.000000
50%        19.000000
75%        50.000000
max      1438.000000
Name: num_chunks, dtype: float64

In [16]:
chunks_df = doc_df.explode('doc_chunks')
chunks_df.head()


Unnamed: 0,project,document,doc_length,doc_text,doc_chunks,num_chunks
0,lowering-speed-limits,lowering-speed-limits-text,10432,\n\n\n\n\n\n\nCustom\n\n\n\n\n\n\n\nType of co...,Custom\n\n\n\n\n\n\n\nType of conversation:\nI...,28
0,lowering-speed-limits,lowering-speed-limits-text,10432,\n\n\n\n\n\n\nCustom\n\n\n\n\n\n\n\nType of co...,Safe Speeds\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nF...,28
0,lowering-speed-limits,lowering-speed-limits-text,10432,\n\n\n\n\n\n\nCustom\n\n\n\n\n\n\n\nType of co...,Chloe Rodgers\n\n\n\n\n\n\n\n\nLocal Communiti...,28
0,lowering-speed-limits,lowering-speed-limits-text,10432,\n\n\n\n\n\n\nCustom\n\n\n\n\n\n\n\nType of co...,Share Lowering Speed Limits Programme on Faceb...,28
0,lowering-speed-limits,lowering-speed-limits-text,10432,\n\n\n\n\n\n\nCustom\n\n\n\n\n\n\n\nType of co...,The Mayor and Transport for London (TfL) are c...,28


In [17]:
from sentence_transformers import SentenceTransformer
import os
import uuid

if os.path.exists('data/embedded_chunks_df.pkl'):
    embedded_chunks_df = pd.read_pickle('data/embedded_chunks_df.pkl')
else:
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(chunks_df['doc_chunks'].tolist(), show_progress_bar=True)
    embedded_chunks_df = chunks_df.copy()
    embedded_chunks_df['embeddings'] = list(embeddings)
    embedded_chunks_df = embedded_chunks_df.drop_duplicates(subset=['doc_chunks'], keep='first')
    embedded_chunks_df['uuid'] = embedded_chunks_df.apply(lambda _: str(uuid.uuid1()), axis=1)

    embedded_chunks_df.to_pickle('data/embedded_chunks_df.pkl')

Batches:   0%|          | 0/2304 [00:00<?, ?it/s]