In [1]:
import os
import glob
import pandas as pd
import pdfplumber

INPUT_FOLDER = r"data"
CSV_OUTPUT = r"outbreak_master.csv"

all_records = []

pdf_files = glob.glob(os.path.join(INPUT_FOLDER, "*.pdf"))
print(f"Found {len(pdf_files)} PDFs")

for pdf in pdf_files:
    try:
        with pdfplumber.open(pdf) as doc:
            # skip first 2 pages
            for page_num, page in enumerate(doc.pages[2:], start=3):
                text = page.extract_text()
                if not text:
                    continue

                for line in text.split("\n"):
                    parts = line.strip().split()
                    if len(parts) < 10:
                        continue  # too short to be a valid row

                    # Identify fields
                    unique_id = parts[0]
                    # find where numbers (Cases/Deaths) start
                    num_idx = [i for i, p in enumerate(parts) if p.isdigit()]
                    if len(num_idx) < 2:
                        continue

                    cases = parts[num_idx[0]]
                    deaths = parts[num_idx[1]]

                    # Dates are right after deaths
                    try:
                        start_date = parts[num_idx[1] + 1]
                        report_date = parts[num_idx[1] + 2]
                    except:
                        continue

                    # Status is next two words "Under ..." or "Under Control"
                    status = " ".join(parts[num_idx[1] + 3:num_idx[1] + 5])

                    # State, District, Disease are the words between ID and Cases
                    meta = parts[1:num_idx[0]]
                    if len(meta) >= 3:
                        state = meta[0]
                        district = meta[1]
                        disease = " ".join(meta[2:])
                    else:
                        continue

                    # Comments = everything after status
                    status_end = num_idx[1] + 5
                    comments = " ".join(parts[status_end:])

                    all_records.append([
                        unique_id, state, district, disease,
                        cases, deaths, start_date, report_date,
                        status, comments
                    ])

        print(f"‚úÖ Processed: {pdf}")

    except Exception as e:
        print(f"‚ùå Error processing {pdf}: {e}")

# Save CSV
df = pd.DataFrame(all_records, columns=[
    "Unique_ID", "State", "District", "Disease",
    "Cases", "Deaths", "Start_Date", "Report_Date",
    "Status", "Comments"
])
df.to_csv(CSV_OUTPUT, index=False)

print(f"üéâ Master CSV saved at: {CSV_OUTPUT}")


Found 259 PDFs
‚úÖ Processed: data\012019.pdf
‚úÖ Processed: data\022019.pdf
‚úÖ Processed: data\032019.pdf
‚úÖ Processed: data\042019.pdf
‚úÖ Processed: data\052019.pdf
‚úÖ Processed: data\062019.pdf
‚úÖ Processed: data\072019.pdf
‚úÖ Processed: data\1036560991697701673.pdf
‚úÖ Processed: data\10534460671663568873.pdf
‚úÖ Processed: data\10751456901629710010.pdf
‚úÖ Processed: data\11273184721641384072.pdf
‚úÖ Processed: data\11468971641556005847.pdf
‚úÖ Processed: data\11684956991704779982.pdf
‚úÖ Processed: data\11984820951671616638.pdf
‚úÖ Processed: data\12385261621641816039.pdf
‚úÖ Processed: data\12542165061598342727.pdf
‚úÖ Processed: data\12628276741640844117.pdf
‚úÖ Processed: data\12830469691632485260.pdf
‚úÖ Processed: data\128660161699611747.pdf
‚úÖ Processed: data\1392278531651211316.pdf
‚úÖ Processed: data\1409615621661929367.pdf


Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value
Cannot set gray non-stroke color because /'Pattern11' is an invalid float value
Cannot set gray non-stroke color because /'Pattern12' is an invalid float value
Cannot set gray non-stroke color because /'Patter

‚úÖ Processed: data\1430537991672296570.pdf


Cannot set gray non-stroke color because /'Pattern1' is an invalid float value


‚úÖ Processed: data\14887303501610440276.pdf


Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value
Cannot set gray non-stroke color because /'Pattern11' is an invalid float value
Cannot set gray non-stroke color because /'Pattern12' is an invalid float value
Cannot set gray non-stroke color because /'Patter

‚úÖ Processed: data\15879821711692006852.pdf


Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value
Cannot set gray non-stroke color because /'Pattern11' is an invalid float value
Cannot set gray non-stroke color because /'Pattern12' is an invalid float value
Cannot set gray non-stroke color because /'Patter

‚úÖ Processed: data\16441864901581914600.pdf
‚úÖ Processed: data\16602315821676010289.pdf
‚úÖ Processed: data\1679188201658218211.pdf
‚úÖ Processed: data\17113853891702917780.pdf
‚úÖ Processed: data\17460277921659429678.pdf
‚úÖ Processed: data\17603578201556280184.pdf
‚úÖ Processed: data\17724297961576151118.pdf
‚úÖ Processed: data\17777132131651643473.pdf
‚úÖ Processed: data\17889938321705308095.pdf
‚úÖ Processed: data\19237813091684214647.pdf
‚úÖ Processed: data\20155875121604654822.pdf


Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value
Cannot set gray non-stroke color because /'Pattern11' is an invalid float value
Cannot set gray non-stroke color because /'Pattern12' is an invalid float value
Cannot set gray non-stroke color because /'Patter

‚úÖ Processed: data\21323410851557989924.pdf


Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value
Cannot set gray non-stroke color because /'Pattern1' is an invalid float value


‚úÖ Processed: data\21592462931612246615.pdf
‚úÖ Processed: data\21st wk 2020.pdf
‚úÖ Processed: data\2211220741579085810.pdf
‚úÖ Processed: data\22483332931678770810.pdf
‚úÖ Processed: data\23869537881693295660.pdf
‚úÖ Processed: data\24421597211682506215.pdf
‚úÖ Processed: data\25286157331631579632.pdf
‚úÖ Processed: data\25411693011672911468.pdf
‚úÖ Processed: data\2548782891644495164.pdf
‚úÖ Processed: data\25631695521654237028.pdf
‚úÖ Processed: data\25756315101563359659.pdf
‚úÖ Processed: data\25831535421658994103.pdf
‚úÖ Processed: data\26128241981597124552.pdf
‚úÖ Processed: data\26841813001598859859.pdf
‚úÖ Processed: data\26th wk 2023-1.pdf
‚úÖ Processed: data\27385427641583741592.pdf
‚úÖ Processed: data\27714369291652936295.pdf
‚úÖ Processed: data\27th wk 2023-1.pdf
‚úÖ Processed: data\28346714001569322122.pdf
‚úÖ Processed: data\2882454301676010303.pdf
‚úÖ Processed: data\28th wk 2023.pdf
‚úÖ Processed: data\29559503671572417006.pdf
‚úÖ Processed: data\29th wk 2023-1.pdf
‚ú

Cannot set gray non-stroke color because /'Pattern1' is an invalid float value


‚úÖ Processed: data\52195839271608540270.pdf
‚úÖ Processed: data\52349419351625728884.pdf
‚úÖ Processed: data\52nd wk 2023-1.pdf
‚úÖ Processed: data\5334491831681890297.pdf
‚úÖ Processed: data\53945400761634203762.pdf
‚úÖ Processed: data\54136961551628675774.pdf
‚úÖ Processed: data\54588936701614162099.pdf
‚úÖ Processed: data\54855544991676459270.pdf
‚úÖ Processed: data\55569857191561115369.pdf
‚úÖ Processed: data\55850372501688721705.pdf
‚úÖ Processed: data\55963304211589526100.pdf
‚úÖ Processed: data\5598982481677737839.pdf
‚úÖ Processed: data\56953886141677569586.pdf
‚úÖ Processed: data\57192284951607940109.pdf
‚úÖ Processed: data\57227886241578632023.pdf
‚úÖ Processed: data\57241746311700469239.pdf
‚úÖ Processed: data\57351711651607418821.pdf
‚úÖ Processed: data\57704139231597648582.pdf
‚úÖ Processed: data\58051976421592202723.pdf
‚úÖ Processed: data\58135622001555071060.pdf
‚úÖ Processed: data\58269660581685423781.pdf
‚úÖ Processed: data\5840814251577708329.pdf


Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value
Cannot set gray non-stroke color because /'Pattern11' is an invalid float value
Cannot set gray non-stroke color because /'Pattern12' is an invalid float value
Cannot set gray non-stroke color because /'Patter

‚úÖ Processed: data\58496833651705032329.pdf


Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value
Cannot set gray non-stroke color because /'Pattern11' is an invalid float value
Cannot set gray non-stroke color because /'Pattern12' is an invalid float value
Cannot set gray non-stroke color because /'Pattern13' is an invalid float value
Cannot set gray non-stroke color because /'Pattern14' is an invalid float value
Cannot set gray non-stroke color because /'Pattern15' is an invalid float value
Cannot set gray non-stroke color because /'Pattern16' is an invalid float value
Cannot set gray non-stroke color because /'Pa

‚úÖ Processed: data\59318143211581914621.pdf
‚úÖ Processed: data\59542223571689673947.pdf
‚úÖ Processed: data\59609550111678770735.pdf
‚úÖ Processed: data\5964152161596712701.pdf
‚úÖ Processed: data\59880574731623130107.pdf


Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value
Cannot set gray non-stroke color because /'Pattern11' is an invalid float value
Cannot set gray non-stroke color because /'Pattern12' is an invalid float value
Cannot set gray non-stroke color because /'Patter

‚úÖ Processed: data\60008466191674111579.pdf
‚úÖ Processed: data\6061046811606368261.pdf
‚úÖ Processed: data\60690508801635853024.pdf
‚úÖ Processed: data\61242793991568706218.pdf
‚úÖ Processed: data\6229883631601365884.pdf
‚úÖ Processed: data\62747270011649328179.pdf
‚úÖ Processed: data\63008264611592915163.pdf
‚úÖ Processed: data\6334518031698667764.pdf
‚úÖ Processed: data\63739555761617691316.pdf
‚úÖ Processed: data\63934703921642751757.pdf
‚úÖ Processed: data\64657729271690964415.pdf
‚úÖ Processed: data\64995502961606195427.pdf
‚úÖ Processed: data\65498445311614162134.pdf
‚úÖ Processed: data\6589945201554276858.pdf
‚úÖ Processed: data\6590096481557832331.pdf
‚úÖ Processed: data\6601227421612941709.pdf
‚úÖ Processed: data\66557506361675679880.pdf
‚úÖ Processed: data\67310881641556708881.pdf
‚úÖ Processed: data\67323893291638446578.pdf
‚úÖ Processed: data\67474475281643264551.pdf
‚úÖ Processed: data\67492866371669284742.pdf
‚úÖ Processed: data\68142848861638769472.pdf
‚úÖ Processed: d

Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value
Cannot set gray non-stroke color because /'Pattern11' is an invalid float value
Cannot set gray non-stroke color because /'Pattern12' is an invalid float value
Cannot set gray non-stroke color because /'Patter

‚úÖ Processed: data\74187654281664790779.pdf


Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value
Cannot set gray non-stroke color because /'Pattern11' is an invalid float value
Cannot set gray non-stroke color because /'Pattern12' is an invalid float value
Cannot set gray non-stroke color because /'Patter

‚úÖ Processed: data\74204773541614676896.pdf
‚úÖ Processed: data\74743699461571052888.pdf
‚úÖ Processed: data\7494517571658218152.pdf
‚úÖ Processed: data\75110479251574835379.pdf
‚úÖ Processed: data\75398645741622451261.pdf
‚úÖ Processed: data\75493226621600316927.pdf
‚úÖ Processed: data\75891972881670501228.pdf
‚úÖ Processed: data\7600145861611826717.pdf
‚úÖ Processed: data\76245669481599205071.pdf
‚úÖ Processed: data\76345468981602073147.pdf
‚úÖ Processed: data\76464278111586340109.pdf
‚úÖ Processed: data\7691730161563359625.pdf
‚úÖ Processed: data\77063109021572417436.pdf
‚úÖ Processed: data\7712669551559045817.pdf
‚úÖ Processed: data\77905767661628494849.pdf
‚úÖ Processed: data\7803926841638446619.pdf
‚úÖ Processed: data\78048457291640928892.pdf
‚úÖ Processed: data\78391596911671087086.pdf
‚úÖ Processed: data\79268315211637580936.pdf
‚úÖ Processed: data\79431194251666685561.pdf
‚úÖ Processed: data\79615858811603084941.pdf
‚úÖ Processed: data\7996614701600316906.pdf
‚úÖ Processed: d

Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value
Cannot set gray non-stroke color because /'Pattern11' is an invalid float value
Cannot set gray non-stroke color because /'Pattern12' is an invalid float value
Cannot set gray non-stroke color because /'Patter

‚úÖ Processed: data\81792301181601365912.pdf


Cannot set gray non-stroke color because /'Pattern20' is an invalid float value
Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value
Cannot set gray non-stroke color because /'Pattern11' is an invalid float value


‚úÖ Processed: data\8181197221610615473.pdf
‚úÖ Processed: data\82640712431561703101.pdf
‚úÖ Processed: data\82652121441599205047.pdf
‚úÖ Processed: data\82656745161572606554.pdf
‚úÖ Processed: data\82748353081596538437.pdf
‚úÖ Processed: data\83459743991665057580.pdf
‚úÖ Processed: data\84412656471574839783.pdf
‚úÖ Processed: data\8511094351673004835.pdf
‚úÖ Processed: data\85360816881563359501.pdf
‚úÖ Processed: data\87432761971580215568.pdf
‚úÖ Processed: data\8769878311655962403.pdf


Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value


‚úÖ Processed: data\87996730991684214595.pdf


Cannot set gray non-stroke color because /'Pattern11' is an invalid float value
Cannot set gray non-stroke color because /'Pattern12' is an invalid float value
Cannot set gray non-stroke color because /'Pattern13' is an invalid float value
Cannot set gray non-stroke color because /'Pattern14' is an invalid float value
Cannot set gray non-stroke color because /'Pattern15' is an invalid float value
Cannot set gray non-stroke color because /'Pattern16' is an invalid float value
Cannot set gray non-stroke color because /'Pattern17' is an invalid float value
Cannot set gray non-stroke color because /'Pattern18' is an invalid float value
Cannot set gray non-stroke color because /'Pattern19' is an invalid float value
Cannot set gray non-stroke color because /'Pattern20' is an invalid float value
Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /

‚úÖ Processed: data\88490139401610000902.pdf
‚úÖ Processed: data\88599375341559045751.pdf
‚úÖ Processed: data\89343299521646644982.pdf
‚úÖ Processed: data\8960781311589282489.pdf
‚úÖ Processed: data\8984621471688550207.pdf
‚úÖ Processed: data\89997464371627471359.pdf
‚úÖ Processed: data\90035950711578899621.pdf
‚úÖ Processed: data\9050581631691147080.pdf


Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value
Cannot set gray non-stroke color because /'Pattern11' is an invalid float value
Cannot set gray non-stroke color because /'Pattern12' is an invalid float value
Cannot set gray non-stroke color because /'Patter

‚úÖ Processed: data\907026181642679041.pdf


Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value
Cannot set gray non-stroke color because /'Pattern11' is an invalid float value
Cannot set gray non-stroke color because /'Pattern12' is an invalid float value
Cannot set gray non-stroke color because /'Patter

‚úÖ Processed: data\90708520581607945512.pdf
‚úÖ Processed: data\90829141301611039090.pdf
‚úÖ Processed: data\90932843861683201733.pdf
‚úÖ Processed: data\90987822861641383977.pdf
‚úÖ Processed: data\91997163121586426717.pdf
‚úÖ Processed: data\92677823771621846952.pdf
‚úÖ Processed: data\92837229251680676582.pdf
‚úÖ Processed: data\93858438291632994388.pdf
‚úÖ Processed: data\94317400911604655031.pdf
‚úÖ Processed: data\94446467611669198494.pdf
‚úÖ Processed: data\94994875221624427606.pdf
‚úÖ Processed: data\95714695921576827903.pdf
‚úÖ Processed: data\959182831681282940.pdf
‚úÖ Processed: data\97322230781664263759.pdf
‚úÖ Processed: data\97509421901623653932.pdf
‚úÖ Processed: data\99070210871642594974.pdf
‚úÖ Processed: data\99151176221556005805.pdf
‚úÖ Processed: data\99595557981606195263.pdf
üéâ Master CSV saved at: outbreak_master.csv


In [2]:
import pandas as pd
df = pd.read_csv("outbreak_master.csv")
df

Unnamed: 0,Unique_ID,State,District,Disease,Cases,Deaths,Start_Date,Report_Date,Status,Comments
0,BH/SUP/2019/01/01,Bihar,Supaul,Measles,5,0,02-01-19,06-01-19,outbreak. House,to house survey done. All
1,GJ/MHS/2019/01/05,Gujarat,Mahesana,Diarrheal,12,0,01-01-19,02-01-19,vomiting might,be due to consumption of
2,JH/GDA/2019/01/06,Jharkhand,Godda,Chickenpox,36,0,08-01-19,08-01-19,confirmed as,Chicken Pox by the RRT team
3,KN/HAV/2019/01/09,Karnataka,Haveri,Diarrheal,25,0,31-12-18,01-01-18,survey done.,2 Blood Samples sent to DPHL
4,KN/HAV/2019/01/10,Karnataka,Haveri,Diarrheal,13,0,03-01-19,04-01-19,to house,survey done. 2 Blood samples and 1
...,...,...,...,...,...,...,...,...,...,...
2268,BH/SHK/2019/10/0236,Bihar,Shekhpura,Measles?,9,0,05-03-19,08-03-19,done. Affected,children were in age group
2269,JH/WSI/2019/10/0241,Jharkhand,West,Singhbhum,9,0,06-03-19,07-03-19,presented symptoms,with vomiting and loose
2270,TN/RMN/2019/10/0247,Tami,Nadu,Chikungunya,19,0,05-03-19,10-03-19,DPHL Ramanthpurum;,result awaited. All cases
2271,TN/ERO/2019/10/0248,Tami,Nadu,Erode Chikungunya,31,0,03-03-19,10-03-19,DPHL Erode,2 samples were positive for


In [26]:
import pandas as pd
from geopy.geocoders import Nominatim
import time


# Initialize geolocator
geolocator = Nominatim(user_agent="geoapi")

# Function to get lat/lon
def get_lat_lon(place):
    try:
        location = geolocator.geocode(place + ", India")  # restrict to India if needed
        if location:
            return pd.Series([location.latitude, location.longitude])
    except:
        return pd.Series([None, None])

# Apply with small delay (to avoid API blocking)
df[["Latitude", "Longitude"]] = df["District"].apply(lambda x: get_lat_lon(x))
time.sleep(1)

print(df)


               Unique_ID      State   District            Disease  Cases  \
0      BH/SUP/2019/01/01      Bihar     Supaul            Measles      5   
1      GJ/MHS/2019/01/05    Gujarat   Mahesana          Diarrheal     12   
2      JH/GDA/2019/01/06  Jharkhand      Godda         Chickenpox     36   
3      KN/HAV/2019/01/09  Karnataka     Haveri          Diarrheal     25   
4      KN/HAV/2019/01/10  Karnataka     Haveri          Diarrheal     13   
..                   ...        ...        ...                ...    ...   
565  UP/MUZ/2019/42/1382   Muzaffar      Nagar             Dengue      8   
566  BH/SHK/2019/10/0236      Bihar  Shekhpura           Measles?      9   
567  JH/WSI/2019/10/0241  Jharkhand       West          Singhbhum      9   
568  TN/RMN/2019/10/0247       Tami       Nadu        Chikungunya     19   
569  TN/ERO/2019/10/0248       Tami       Nadu  Erode Chikungunya     31   

     Deaths Start_Date Report_Date              Status  \
0         0   02-01-19    06-

In [27]:
df.head()

Unnamed: 0,Unique_ID,State,District,Disease,Cases,Deaths,Start_Date,Report_Date,Status,Comments,Latitude,Longitude
0,BH/SUP/2019/01/01,Bihar,Supaul,Measles,5,0,02-01-19,06-01-19,outbreak. House,to house survey done. All,,
1,GJ/MHS/2019/01/05,Gujarat,Mahesana,Diarrheal,12,0,01-01-19,02-01-19,vomiting might,be due to consumption of,,
2,JH/GDA/2019/01/06,Jharkhand,Godda,Chickenpox,36,0,08-01-19,08-01-19,confirmed as,Chicken Pox by the RRT team,,
3,KN/HAV/2019/01/09,Karnataka,Haveri,Diarrheal,25,0,31-12-18,01-01-18,survey done.,2 Blood Samples sent to DPHL,,
4,KN/HAV/2019/01/10,Karnataka,Haveri,Diarrheal,13,0,03-01-19,04-01-19,to house,survey done. 2 Blood samples and 1,,


In [None]:
# pdfs_to_water_csv.py
import os
import glob
import re
import pandas as pd
import pdfplumber
from collections import OrderedDict

INPUT_FOLDER = "data_water"         # change to your folder with PDFs
CSV_OUTPUT = "water_quality_master.csv"

# Candidate tail headers (choose by matching length)
TAILS = {
    17: ["pH","EC","CO3","HCO3","Cl","SO4","NO3","PO4","TH","Ca","Mg","Na","K","F","SiO2","TDS","U(ppb)"],
    16: ["pH","EC","CO3","HCO3","Cl","SO4","NO3","PO4","TH","Ca","Mg","Na","K","F","SiO2","TDS"],
    15: ["pH","EC","CO3","HCO3","Cl","SO4","NO3","PO4","TH","Ca","Mg","Na","K","F","TDS"],  # fallback
    18: ["pH","EC","CO3","HCO3","Cl","SO4","NO3","PO4","TH","Ca","Mg","Na","K","F","SiO2","TDS","U(ppb)","EXTRA"]
}

def find_lat_lon_year(line):
    """
    find a pattern: <lat> <lon> <4-digit-year>
    returns (lat, lon, year, match_start, match_end) or None
    """
    # floats or ints; allow optional degree symbols etc
    pattern = re.compile(r'(-?\d+\.\d+|-?\d+)\s+(-?\d+\.\d+|-?\d+)\s+((19|20)\d{2})')
    m = pattern.search(line)
    if not m:
        return None
    return (m.group(1), m.group(2), m.group(3), m.start(), m.end())

def parse_pdf_to_records(pdf_path):
    records = []
    with pdfplumber.open(pdf_path) as doc:
        for page in doc.pages:
            text = page.extract_text()
            if not text:
                continue
            for raw_line in text.splitlines():
                line = raw_line.strip()
                if not line:
                    continue
                # skip header lines
                low = line.lower()
                if low.startswith("well") or ("latitude" in low and "longitude" in low):
                    continue

                res = find_lat_lon_year(line)
                if not res:
                    # sometimes page extraction breaks spaces; skip if no anchor
                    continue
                lat, lon, year, mstart, mend = res

                left = line[:mstart].strip()
                right = line[mend:].strip()

                left_tokens = left.split()
                if len(left_tokens) < 2:
                    continue

                # primary textual fields
                well_id = left_tokens[0]
                s_no = left_tokens[1]
                rest = left_tokens[2:]

                # attempt to assign State, District, Block, Location
                state = rest[0] if len(rest) >= 1 else ""
                district = rest[1] if len(rest) >= 2 else ""
                block = rest[2] if len(rest) >= 3 else ""
                location = " ".join(rest[3:]) if len(rest) >= 4 else (rest[3] if len(rest) == 4 else "")

                # right side: split by whitespace - keep tokens like '<0.1' or 'BDL'
                right_vals = right.split()

                # choose a tail header list based on length
                tail_headers = TAILS.get(len(right_vals))
                if tail_headers is None:
                    # try best-fit: choose the tail with closest length
                    possible_lengths = sorted(TAILS.keys(), key=lambda k: abs(k - len(right_vals)))
                    tail_headers = TAILS[possible_lengths[0]]

                # ensure we don't crash if values fewer/greater: pad or trim
                vals = right_vals[:len(tail_headers)] + [""] * max(0, len(tail_headers) - len(right_vals))

                # Build ordered record
                rec = OrderedDict()
                rec["Well_ID"] = well_id
                rec["S.No"] = s_no
                rec["State"] = state
                rec["District"] = district
                rec["Block"] = block
                rec["Location"] = location
                rec["Latitude"] = lat
                rec["Longitude"] = lon
                rec["Year"] = year

                for h, v in zip(tail_headers, vals):
                    rec[h] = v

                # attach raw_line optionally for troubleshooting
                rec["_raw_line"] = raw_line
                records.append(rec)
    return records

def main():
    pdf_files = glob.glob(os.path.join(INPUT_FOLDER, "*.pdf"))
    print(f"Found {len(pdf_files)} PDFs in {INPUT_FOLDER}")
    all_records = []
    for pdf in pdf_files:
        try:
            recs = parse_pdf_to_records(pdf)
            print(f"  -> {pdf}: {len(recs)} rows parsed")
            all_records.extend(recs)
        except Exception as e:
            print(f"Error processing {pdf}: {e}")

    if not all_records:
        print("No records found. Check extraction heuristics or run with verbose logging.")
        return

    # Build final columns (merge all possible tail headers seen)
    # gather all keys
    cols = list(all_records[0].keys())
    # ensure deterministic ordering: move _raw_line last
    if "_raw_line" in cols:
        cols.remove("_raw_line")
        cols.append("_raw_line")

    df = pd.DataFrame(all_records, columns=cols)
    df.to_csv(CSV_OUTPUT, index=False)
    print(f"Saved {len(df)} rows to {CSV_OUTPUT}")

if __name__ == "__main__":
    main()


Found 5 PDFs in data_water
  -> data_water\2019 water quality data.pdf: 12270 rows parsed
  -> data_water\2020 water quality data.pdf: 5825 rows parsed
  -> data_water\2021 water quality data.pdf: 8009 rows parsed
  -> data_water\2022 water quality data.pdf: 13564 rows parsed
  -> data_water\2023 water quality data.pdf: 16770 rows parsed
Saved 56438 rows to water_quality_master.csv


In [19]:
df1 = pd.read_csv("water_quality_master.csv")
df1.head()

  df1 = pd.read_csv("water_quality_master.csv")


Unnamed: 0,Well_ID,S.No,State,District,Block,Location,Latitude,Longitude,Year,pH,...,NO3,PO4,TH,Ca,Mg,Na,K,F,TDS,_raw_line
0,W254029084355301,1,Himachal,Pradesh,Solan,Nallagarh JAGATPUR,31.1594,76.6785,2019,8.44,...,2.7,<0.1,84,17.0,10.0,39,2.4,0.2,11,W254029084355301 1 Himachal Pradesh Solan Nall...
1,W251908084361501,2,Himachal,Pradesh,Solan,Nallagarh BARUNA,31.154,76.6384,2019,8.4,...,10.0,<0.1,116,10.0,18.0,23,1.0,0.12,12,W251908084361501 2 Himachal Pradesh Solan Nall...
2,W310955076364001,3,Himachal,Pradesh,Solan,Nallagarh BHATOLI,31.1651,76.6082,2019,8.4,...,13.0,<0.1,116,10.0,18.0,28,2.5,0.15,13,W310955076364001 3 Himachal Pradesh Solan Nall...
3,W310143076392701,4,Himachal,Pradesh,Solan,Nallagarh MAGANPURA,31.02,76.65,2019,8.12,...,52.0,<0.1,200,38.0,26.0,68,4.0,0.14,16,W310143076392701 4 Himachal Pradesh Solan Nall...
4,W321645075471501,5,Himachal,Pradesh,Kangra,Nurpur PANJPIR,32.28,75.7914,2019,8.12,...,10.0,<0.1,110,28.0,9.7,26,2.01,0.22,30,W321645075471501 5 Himachal Pradesh Kangra Nur...
