In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path 
import dask.dataframe as dd
from helper.text_preprocessor import preprocess_doc 

pd.set_option('max_columns', 30)


## Scan a directory of envelope txt files, create a dict of env file name: text as string pairs. Remove new line characters and then convert to pandas df

In [2]:
#scan a file directory and return a list of pdf files to iterate over

env_path = Path(r'D:\Python ML\Envelope-key-words\Data\Raw\Envelopes')


env_dict = {}
with os.scandir(env_path) as it:
    for entry in it:
        if entry.name.endswith(".txt") and entry.is_file():
            with open(entry, 'r') as file:
                data = file.read().replace('\n', '')
                env_dict[entry.name] = data

envs_df = pd.DataFrame(list(env_dict.items()), columns=['ID', 'envelopeOCR'])

In [3]:
envs_df['ID'] = envs_df['ID'].str.split('.').str[0].str.strip()
envs_df['ID'] = envs_df['ID'].astype('string')
#envs_df.set_index('ID', inplace=True)

In [4]:
envs_df.sample(2)

Unnamed: 0,ID,envelopeOCR
3636,ENV06946,"Open File EnvelopeNo. 6946ML 47, ML 3?7 AND ML..."
3958,ENV07700002,South Australia Business UnitSantos LtdA.C.N. ...


In [5]:
envs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7839 entries, 0 to 7838
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           7839 non-null   string
 1   envelopeOCR  7839 non-null   object
dtypes: object(1), string(1)
memory usage: 122.6+ KB


## Open mer-env bucket index as pandas df

In [6]:
# first need to convert encoding of the index csv file, then open
#env_load = pd.read_csv(r'D:\Python ML\Envelope-key-words\Data\Raw\mer-env AWS Bucket content index_20200227.csv', encoding ="ISO-8859-1")
#env_load.to_csv(r'D:\Python ML\Envelope-key-words\Data\Interim\mer-env AWS Bucket content index_20200227.csv',encoding='utf-8', index=False)
env_index = pd.read_csv(r'D:\Python ML\Envelope-key-words\Data\Interim\mer-env AWS Bucket content index_20200227.csv')

In [7]:
env_index.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10133 entries, 0 to 10132
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Reference               10133 non-null  object 
 1   Category                10133 non-null  object 
 2   Title                   10133 non-null  object 
 3   Publication Date        9128 non-null   object 
 4   Author                  7618 non-null   object 
 5   Prepared by             5647 non-null   object 
 6   Source                  10133 non-null  object 
 7   Collation               8479 non-null   object 
 8   Format                  10028 non-null  object 
 9   Client                  2705 non-null   object 
 10  Broad Subject           8776 non-null   object 
 11  Subject Terms           8809 non-null   object 
 12  Abstract                6601 non-null   object 
 13  Notes                   4006 non-null   object 
 14  Tenement                7642 non-null 

In [8]:
env_index.head(2)

Unnamed: 0,Reference,Category,Title,Publication Date,Author,Prepared by,Source,Collation,Format,Client,Broad Subject,Subject Terms,Abstract,Notes,Tenement,Licensee,Mine Name,Map Sheet,Locality,Geol. Province,Drillhole,Assays,Stratigraphy,Date Added,Download Document Size
0,Env 00001,Company petroleum exploration licence reports,Wilkatana borehole grid surface plan and cross...,14-Aug-56,"King, D.",South Australia. Department of Mines;Geologica...,South Australia. Department of Mines. Open fil...,"Fiche range: 2, 1 plans,",Hard Copy Digital,Santos Ltd;Director of Mines,Petroleum exploration;Drilling,Sedimentary basins;Petroleum migration;Hydroca...,No text. Data (prepared by the author) consist...,,OEL00007,Santos Ltd,,PORT AUGUSTA;6433I,Wilkatana,Pirie Torrens Basin;Arrowie Basin,,,,20/08/1986,1567.0
1,Env 00002,Company petroleum exploration licence reports,Interim report of the geological investigation...,30-Jun-56,"Scott, D.C.;Wopfner, H.;Grasso, R.",Geosurveys of Australia Ltd,South Australia. Department of Mines. Open fil...,"Total fiche: 2, Total pages: 4, 1 plans, 1 reps",Digital Hard Copy,,Petroleum exploration;Structural geology,Photointerpretation;Geological mapping,Survey aimed to obtain broad structural pictur...,,,Santos Ltd,,PORT AUGUSTA;6433I;ORROROO;6533;PARACHILNA;653...,Flinders Ranges,Arrowie Basin;Adelaide Geosyncline,,,,20/08/1986,186.0


In [9]:
# correct the reference col to match the formatting of the 'ID' col in envs_df
temp_env1 = env_index['Reference'].str.split('p:').str[0].str.split(';').str[0].str.upper().str.split(' ').str[:2].str.join('').str.replace('/','')
temp_env2 = env_index['Reference'].str.split('p:').str[0].str.upper().str.split(' ').str[2:].str.join(' ').str.strip()
temp_env3 = temp_env1.str.cat(temp_env2, sep= ' ')
temp_env3 = temp_env3.astype('string')

In [10]:
env_index['Reference'] = temp_env3.str.strip()
#env_index.set_index('Reference', inplace= True)
env_index.head(2)
    

Unnamed: 0,Reference,Category,Title,Publication Date,Author,Prepared by,Source,Collation,Format,Client,Broad Subject,Subject Terms,Abstract,Notes,Tenement,Licensee,Mine Name,Map Sheet,Locality,Geol. Province,Drillhole,Assays,Stratigraphy,Date Added,Download Document Size
0,ENV00001,Company petroleum exploration licence reports,Wilkatana borehole grid surface plan and cross...,14-Aug-56,"King, D.",South Australia. Department of Mines;Geologica...,South Australia. Department of Mines. Open fil...,"Fiche range: 2, 1 plans,",Hard Copy Digital,Santos Ltd;Director of Mines,Petroleum exploration;Drilling,Sedimentary basins;Petroleum migration;Hydroca...,No text. Data (prepared by the author) consist...,,OEL00007,Santos Ltd,,PORT AUGUSTA;6433I,Wilkatana,Pirie Torrens Basin;Arrowie Basin,,,,20/08/1986,1567.0
1,ENV00002,Company petroleum exploration licence reports,Interim report of the geological investigation...,30-Jun-56,"Scott, D.C.;Wopfner, H.;Grasso, R.",Geosurveys of Australia Ltd,South Australia. Department of Mines. Open fil...,"Total fiche: 2, Total pages: 4, 1 plans, 1 reps",Digital Hard Copy,,Petroleum exploration;Structural geology,Photointerpretation;Geological mapping,Survey aimed to obtain broad structural pictur...,,,Santos Ltd,,PORT AUGUSTA;6433I;ORROROO;6533;PARACHILNA;653...,Flinders Ranges,Arrowie Basin;Adelaide Geosyncline,,,,20/08/1986,186.0


In [11]:
env_index.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10133 entries, 0 to 10132
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Reference               10133 non-null  string 
 1   Category                10133 non-null  object 
 2   Title                   10133 non-null  object 
 3   Publication Date        9128 non-null   object 
 4   Author                  7618 non-null   object 
 5   Prepared by             5647 non-null   object 
 6   Source                  10133 non-null  object 
 7   Collation               8479 non-null   object 
 8   Format                  10028 non-null  object 
 9   Client                  2705 non-null   object 
 10  Broad Subject           8776 non-null   object 
 11  Subject Terms           8809 non-null   object 
 12  Abstract                6601 non-null   object 
 13  Notes                   4006 non-null   object 
 14  Tenement                7642 non-null 

In [12]:
# Merge two data frames into single dataset

#envelopes_df = pd.merge(envs_df,env_index,how='inner',left_on='ID',right_on='Reference')

# save merged df as interim dataset

#envelopes_df.to_csv(Path(r'D:\Python ML\Envelope-key-words\Data\Interim\merged_indexed_envelopes_20201003.csv'))

envelopes_df = pd.read_csv(r'D:\Python ML\Envelope-key-words\Data\Interim\merged_indexed_envelopes_20201003.csv', dtype='string')

In [11]:
envelopes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8554 entries, 0 to 8553
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Unnamed: 0              8554 non-null   string
 1   ID                      8554 non-null   string
 2   envelopeOCR             8553 non-null   string
 3   Reference               8554 non-null   string
 4   Category                8554 non-null   string
 5   Title                   8554 non-null   string
 6   Publication Date        7699 non-null   string
 7   Author                  6820 non-null   string
 8   Prepared by             4519 non-null   string
 9   Source                  8554 non-null   string
 10  Collation               7375 non-null   string
 11  Format                  8496 non-null   string
 12  Client                  2386 non-null   string
 13  Broad Subject           7528 non-null   string
 14  Subject Terms           7648 non-null   string
 15  Abst

In [13]:
print(envelopes_df['Format'].value_counts())
print('\n')
print(envelopes_df['Category'].value_counts())


Hard Copy Digital               2883
Digital                         2642
Digital Hard Copy               2616
Hard Copy Microfiche Digital     202
Digital Hard Copy Hard Copy      143
Hard Copy                          5
Microfiche Hard Copy Digital       5
Name: Format, dtype: Int64


Company mineral exploration licence reports                5619
Company petroleum exploration licence reports              1840
DSD publications                                            451
Non-DSD publications, theses and miscellaneous reports      408
Company mining program                                      109
Geothermal exploration licence reports                       78
Departmental publications                                    33
External publications, theses and miscellaneous reports      14
Mineral Production Licence report                             2
Name: Category, dtype: Int64


In [14]:
envelopes_df['Tenement'].sample(10)

1965                                              EL00184
733                                     EPP00008;OEL00022
2357                                                 <NA>
7967                                             EPP00032
1661    EL00035;ML04051;ML04123;MC00182;MC00249;MC0651...
1785                                              EL00109
5188                                      EL01095;EL01114
7871                                              EL04840
7739                                                 <NA>
480                                              OEL00022
Name: Tenement, dtype: string

In [15]:
envelopes_df[envelopes_df['Tenement'].str.contains('EL03546', na=False)]

Unnamed: 0.1,Unnamed: 0,ID,envelopeOCR,Reference,Category,Title,Publication Date,Author,Prepared by,Source,Collation,Format,Client,Broad Subject,Subject Terms,Abstract,Notes,Tenement,Licensee,Mine Name,Map Sheet,Locality,Geol. Province,Drillhole,Assays,Stratigraphy,Date Added,Download Document Size
6675,6675,ENV10624,"Oren File EnvelopeNo. 10,624THE EYRE PENINSULA...",ENV10624,Company mineral exploration licence reports,Data release - as updated [made at SA Director...,,"Drown, C.G.;McAvaney, D.J.;Caon, J.;Barnes, J....",Geosolutions Pty Ltd,South Australia. Department of Primary Industr...,"Total pages: 283, 10 appx, figures, 8 ref, tab...",Digital,Quasar Resources Pty Ltd,Mineral exploration - SA;Drilling;Sedimentary ...,Gold exploration;Base metal exploration;Uraniu...,"During the sixth year of the project, licensee...",This release to the public of the subject mine...,EL03296;EL03501;EL03546;EL03564;EL03700;EL0370...,Peninsula Resources Limited;Quasar Resources P...,,STREAKY BAY;5832I;5832IV;5833;YARDEA;5932;5933...,West-central Eyre Peninsula;Kyancutta;Minippa;...,Gawler Craton;Eucla Basin,CBA001 - CBA023;(306226 - 306334);CBM0001 - CB...,eU3O8;Au;Al;Ca;Fe;K;Mg;Mn;Zn;Ag;As;Bi;Cu;Pb;Sc...,Hiltaba Suite;Sleaford Complex;Hutchison Group...,27/10/2016,
7406,7406,ENV11825,"Ozen File EnvelopeNo. 11,825EL 3546, EL 3833 A...",ENV11825,Company mineral exploration licence reports,"Mount Ive, Corrobinnie and Lake Acraman (part ...",Feb-11,"Badenhorst, T.",,South Australia. Department of Primary Industr...,"Total pages: 13, 1 appx, 6 fig, 1 table, 1 reps",Digital,,Mineral exploration - SA;Geochemistry,Gold exploration;Base metal exploration;Epithe...,The exploration strategy for the Joint Venture...,A 493 sq km combined area was relinquished in ...,EL03546;EL03833;EL04186,Eyre Energy Pty Ltd;Adelaide Resource Ltd;Quas...,,GAIRDNER;6034II;YARDEA;6033II;6033III;6133III;...,Northern Eyre Peninsula;Southern Gawler Ranges...,Gawler Craton;Coulta Domain;Eucla Basin,,Au;Ag;As;Ca;Cu;Fe;Mg;Mn;Mo;Ni;Pb;Zn,,3/08/2011,1155.0
7672,7672,ENV12134,"Oren File EnvelopeNo. 12,134EL 3546 AND EL 383...",ENV12134,Company mineral exploration licence reports,Mount Ive and Corrobinnie (part of the Eyre Pe...,28-Feb-11,"Hewett, A.H.;Caon, J.",,South Australia. Department of Primary Industr...,"Total pages: 27, 6 plans, appendices, 7 fig, t...",Digital,,Mineral exploration - SA;Geophysics;Drilling;S...,Uranium exploration;Sedimentary ores;Palaeocha...,Exploration of the Tertiary Thurlga Palaeochan...,,EL03546;EL03833,Eyre Energy Pty Ltd;Adelaide Resource Ltd;Quas...,,YARDEA;6033II;6033III;6133III;KIMBA;6131IV,Northern Eyre Peninsula;Southern Gawler Ranges...,Gawler Craton;Coulta Domain;Eucla Basin,MVA045;MVA046;MVA060 - MVA095,Au;Al;Ca;Fe;K;Mg;Mn;V;Zn;Ag;As;Bi;Cu;Pb;Sc;Se;...,,2/08/2011,7502.0
7735,7735,ENV12202,"Open File EnvelopeNo. 12.202EL 3546, EL 3705, ...",ENV12202,Company mineral exploration licence reports,"Mount Ive, Waddikee Rocks, Corrobinnie, Lake A...",Sep-11,"Badenhorst, T.",Adelaide Resources Ltd,South Australia. Department of Primary Industr...,"Total pages: 31, 1 appx, 16 fig, tables, 1 reps",Digital,,Mineral exploration - SA;Geochemistry,Gold exploration;Base metal exploration;Epithe...,A group of 13 exploration licences located on ...,See also the first and second partial relinqui...,EL03546;EL03705;EL03833;EL04186;EL04690,Eyre Energy Pty Ltd;Olliver Geological Service...,,GAIRDNER;STREAKY BAY;YARDEA;KIMBA,Northern Eyre Peninsula;Kimba;Minnipa;Kyancutt...,Gawler Craton,,Ag;As;Au;Ca;Cr;Cu;Fe;Mg;Mn;Ni;Pb;Zn,,22/11/2011,1512.0
8035,8035,ENV12534,"Open File EnvelopeNo. 12,534EL 3546 / 479?MOUN...",ENV12534,Company mineral exploration licence reports,Mount Ive (part of the Eyre Peninsula Uranium ...,Feb-14,"Manly, M.",Daishsat Geodetic Surveyors;GeoSolutions Pty Ltd,Government of South Australia. Department for ...,"Total pages: 20, 7 appx, 4 fig, 8 tables, 1 reps",Digital,Heathgate Resources Ltd,Mineral exploration - SA;Geophysics;Drilling;G...,Gold exploration;Base metal exploration;Epithe...,The stated exploration strategy for the Joint ...,"During the period 21/11/2006 until 19/2/2013, ...",EL03546;EL04792,Peninsula Resources Ltd;Adelaide Exploration L...,,YARDEA;6033II;6033III;6132IV,Northern Eyre Peninsula;Southern Gawler Ranges...,Gawler Craton;Coulta Domain;Eucla Basin,MVA001 - MVA059;(284625 - 284681),U;Th;Cu;Pb;Ag;As;Bi;Zn;Mn;Sc;Se;Al;K;Ca;Fe;Mg;...,Gawler Range Volcanics;Narlaby Formation,17/04/2014,1931.0


In [16]:
envelopes_dropna = envelopes_df.dropna(subset=['envelopeOCR'])
envelopes_dropna.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8553 entries, 0 to 8553
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Unnamed: 0              8553 non-null   string
 1   ID                      8553 non-null   string
 2   envelopeOCR             8553 non-null   string
 3   Reference               8553 non-null   string
 4   Category                8553 non-null   string
 5   Title                   8553 non-null   string
 6   Publication Date        7698 non-null   string
 7   Author                  6819 non-null   string
 8   Prepared by             4518 non-null   string
 9   Source                  8553 non-null   string
 10  Collation               7374 non-null   string
 11  Format                  8495 non-null   string
 12  Client                  2386 non-null   string
 13  Broad Subject           7527 non-null   string
 14  Subject Terms           7647 non-null   string
 15  Abst

In [36]:
envelopes_ddf = dd.from_pandas(envelopes_dropna, chunksize=1)

In [37]:
envelopes_ddf.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 28 entries, Unnamed: 0 to Download Document Size
dtypes: string(28)

In [38]:
envelopes_ddf['normalised_ENV'] = envelopes_ddf['envelopeOCR'].apply(preprocess_doc, check_spelling=False, meta=envelopes_ddf.envelopeOCR).compute(scheduler='processes')

In [39]:
envelopes_df_processed = envelopes_ddf.compute()

In [40]:
envelopes_df_processed.to_csv(r'D:\Python ML\Envelope-key-words\Data\Interim\merged_preprocessed_20201010.csv')

In [42]:
envelopes_df_processed.head()

Unnamed: 0.1,Unnamed: 0,ID,envelopeOCR,Reference,Category,Title,Publication Date,Author,Prepared by,Source,Collation,Format,Client,Broad Subject,Subject Terms,Abstract,Notes,Tenement,Licensee,Mine Name,Map Sheet,Locality,Geol. Province,Drillhole,Assays,Stratigraphy,Date Added,Download Document Size,normalised_ENV
0,0,ENV00002,2CONTENTS ENVELOPE 2Geosurveys of Australta Lt...,ENV00002,Company petroleum exploration licence reports,Interim report of the geological investigation...,30-Jun-56,"Scott, D.C.;Wopfner, H.;Grasso, R.",Geosurveys of Australia Ltd,South Australia. Department of Mines. Open fil...,"Total fiche: 2, Total pages: 4, 1 plans, 1 reps",Digital Hard Copy,,Petroleum exploration;Structural geology,Photointerpretation;Geological mapping,Survey aimed to obtain broad structural pictur...,,,Santos Ltd,,PORT AUGUSTA;6433I;ORROROO;6533;PARACHILNA;653...,Flinders Ranges,Arrowie Basin;Adelaide Geosyncline,,,,20/08/1986,186.0,contents envelope geosurveys australta ltd san...
1,1,ENV00003,2CONTENTS ENVELOPE 3Australian Pacific Oil Com...,ENV00003,Company petroleum exploration licence reports,Murray and Otway Basins and offshore extension...,1961,,Burmal Oil Co.,South Australia. Department of Mines. Open fil...,"Total fiche: 2, Total pages: 54, 4 plans, 8 fi...",Hard Copy Digital,,Petroleum exploration,Petroleum potential,,Contains a review of the hydrocarbon potential...,OEL00025,Australian Pacific Oil Co. Pty Ltd,,PENOLA;NARACOORTE;PINNAROO;CHOWILLA;OLARY;6932...,Loxton;Caroline;Mt Gambier,Otway Basin;Murray Basin,,,,20/08/1986,1994.0,contents envelope australian pacific oil compa...
2,2,ENV00003,2CONTENTS ENVELOPE 3Australian Pacific Oil Com...,ENV00003,Company petroleum exploration licence reports,Report on the geology and oil and gas prospect...,26-Sep-60,"Schneeberger, W.F.",Ball Associates,South Australia. Department of Mines. Open fil...,"Fiche range: 1-2, Page range: 3-52, 4 plans, 8...",Hard Copy Digital,,Petroleum exploration,Marine petroleum exploration;Literature review...,,,OEL00025,Australian Pacific Oil Co. Pty Ltd,,PENOLA;NARACOORTE;PINNAROO;CHOWILLA;OLARY;6932...,South Australia,Otway Basin;Murray Basin,,,,30/09/1998,1994.0,contents envelope australian pacific oil compa...
3,3,ENV00004,DEPARTMENT OFMINES AND ENERGYOPEN FILE ENVELOP...,ENV00004,Company petroleum exploration licence reports,Penola 1. Subsurface stratigraphy and micropal...,1961,"Ludbrook, N.H.",,South Australia. Department of Mines. Open fil...,"Total fiche: 2, Total pages: 20, 2 plans, 3 reps",Hard Copy Digital,,Petroleum exploration;Palaeontology;Drilling,Micropalaeontology;Biostratigraphy;Palynology;...,,"Contains RB 668, fiche: 1-2, p: 9-20. Also inc...",OEL00022,Oil Development NL;General Exploration Co. of ...,,PENOLA;7023II,"Penola;Hd Penola, secn 500",Otway Basin,Penola 1,,,29/09/1998,5249.0,department ofmine energyopen file envelope ser...
4,4,ENV00005,DEPARTMENT OFMINES AND ENERGYOPEN FILE ENVELOP...,ENV00005,Company petroleum exploration licence reports,Penola 1. Palaeontology.,1961,"Evans, P.R.;White, M.E.",,South Australia. Department of Mines. Open fil...,"Total fiche: 1, Total pages: 10, 0 plans, 2 reps",Hard Copy Digital,,Palaeontology;Petroleum exploration;Exploratio...,Biostratigraphy;Palynology;Drilling;Explorator...,,,OEL00022,Oil Development NL;General Exploration Co. of ...,,PENOLA;7023II,,Otway Basin,Penola 1,,,24/09/1998,1805.0,department ofmine energyopen file envelope ser...


In [44]:
envelopes_df_processed.normalised_ENV.iloc[0]

'contents envelope geosurveys australta ltd santos ltd report interim report geological investigation carriedout central flinder ranges geological map western margin centra flinderstenement tenement holderth june plan rangesnot related pgs otosurkeys ofe aust limit saatoscentral finderspengesfgeological map western margino centaal flinder rangesfor office use onlygeosurvey australia limitedgeological geophysical consultantsmanaging director sprigg sc aus interim report geological inestigation carry central flindersranges field observationsd scott dr wopfner photogeologyr grasso introduction area cover survey square mile thewestern margin central flinder range portion wilkatana willochra hawker yadlamalka cotahena military sheet include area object survey obtain broad structural picture partof flinder range adjacent pirie torren basin ja study structuralenviroment cambrian sediment carry may projectedwestward basin possibly correlate geophysical datathe geology interpret aerial photogra

In [12]:
# create a smaller subset dataset
#envelopes_sub = envelopes_df.iloc[5500:7500]
#envelopes_sub.to_csv(r'D:\Python ML\Envelope-key-words\Data\Interim\subset_preprocessed.csv')

## create dataset that only contains envelopes that have an abstract, and normalise the abstract for NLP

In [17]:
merged_processed = pd.read_csv(r'D:\Python ML\Envelope-key-words\Data\Interim\merged_preprocessed_20201010.csv')

In [18]:
merged_processed.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ID,envelopeOCR,Reference,Category,Title,Publication Date,Author,Prepared by,Source,Collation,Format,Client,Broad Subject,Subject Terms,Abstract,Notes,Tenement,Licensee,Mine Name,Map Sheet,Locality,Geol. Province,Drillhole,Assays,Stratigraphy,Date Added,Download Document Size,normalised_ENV
0,0,0,ENV00002,2CONTENTS ENVELOPE 2Geosurveys of Australta Lt...,ENV00002,Company petroleum exploration licence reports,Interim report of the geological investigation...,30-Jun-56,"Scott, D.C.;Wopfner, H.;Grasso, R.",Geosurveys of Australia Ltd,South Australia. Department of Mines. Open fil...,"Total fiche: 2, Total pages: 4, 1 plans, 1 reps",Digital Hard Copy,,Petroleum exploration;Structural geology,Photointerpretation;Geological mapping,Survey aimed to obtain broad structural pictur...,,,Santos Ltd,,PORT AUGUSTA;6433I;ORROROO;6533;PARACHILNA;653...,Flinders Ranges,Arrowie Basin;Adelaide Geosyncline,,,,20/08/1986,186.0,contents envelope geosurveys australta ltd san...
1,1,1,ENV00003,2CONTENTS ENVELOPE 3Australian Pacific Oil Com...,ENV00003,Company petroleum exploration licence reports,Murray and Otway Basins and offshore extension...,1961,,Burmal Oil Co.,South Australia. Department of Mines. Open fil...,"Total fiche: 2, Total pages: 54, 4 plans, 8 fi...",Hard Copy Digital,,Petroleum exploration,Petroleum potential,,Contains a review of the hydrocarbon potential...,OEL00025,Australian Pacific Oil Co. Pty Ltd,,PENOLA;NARACOORTE;PINNAROO;CHOWILLA;OLARY;6932...,Loxton;Caroline;Mt Gambier,Otway Basin;Murray Basin,,,,20/08/1986,1994.0,contents envelope australian pacific oil compa...


In [19]:
merged_processed = merged_processed.dropna(subset=['Abstract'])
merged_processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5894 entries, 0 to 8552
Data columns (total 30 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              5894 non-null   int64  
 1   Unnamed: 0.1            5894 non-null   int64  
 2   ID                      5894 non-null   object 
 3   envelopeOCR             5894 non-null   object 
 4   Reference               5894 non-null   object 
 5   Category                5894 non-null   object 
 6   Title                   5894 non-null   object 
 7   Publication Date        5630 non-null   object 
 8   Author                  5364 non-null   object 
 9   Prepared by             3294 non-null   object 
 10  Source                  5894 non-null   object 
 11  Collation               5732 non-null   object 
 12  Format                  5886 non-null   object 
 13  Client                  2137 non-null   object 
 14  Broad Subject           5892 non-null   

In [20]:
abstract_ddf = dd.from_pandas(merged_processed, chunksize=1)
abstract_ddf.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 30 entries, Unnamed: 0 to normalised_ENV
dtypes: object(27), float64(1), int64(2)

In [21]:
abstract_ddf['normalised_Abstract'] = abstract_ddf['Abstract'].apply(preprocess_doc, check_spelling=False, meta=abstract_ddf.Abstract).compute(scheduler='processes')
abstract_df_processed = abstract_ddf.compute()

In [22]:
abstract_df_processed.drop(['Unnamed: 0','Unnamed: 0.1'],axis=1).to_csv(r'D:\Python ML\Envelope-key-words\Data\Interim\processed_env_abstracts_20201125.csv')

In [23]:
abstract_df_processed.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ID,envelopeOCR,Reference,Category,Title,Publication Date,Author,Prepared by,Source,Collation,Format,Client,Broad Subject,...,Abstract,Notes,Tenement,Licensee,Mine Name,Map Sheet,Locality,Geol. Province,Drillhole,Assays,Stratigraphy,Date Added,Download Document Size,normalised_ENV,normalised_Abstract
0,0,0,ENV00002,2CONTENTS ENVELOPE 2Geosurveys of Australta Lt...,ENV00002,Company petroleum exploration licence reports,Interim report of the geological investigation...,30-Jun-56,"Scott, D.C.;Wopfner, H.;Grasso, R.",Geosurveys of Australia Ltd,South Australia. Department of Mines. Open fil...,"Total fiche: 2, Total pages: 4, 1 plans, 1 reps",Digital Hard Copy,,Petroleum exploration;Structural geology,...,Survey aimed to obtain broad structural pictur...,,,Santos Ltd,,PORT AUGUSTA;6433I;ORROROO;6533;PARACHILNA;653...,Flinders Ranges,Arrowie Basin;Adelaide Geosyncline,,,,20/08/1986,186.0,contents envelope geosurveys australta ltd san...,survey aim obtain broad structural picture sho...
7,7,7,ENV00007,CONTENTS ENVELOPE 7TENEMENT O.E.L. 24 - St. Vi...,ENV00007,Company petroleum exploration licence reports,Preliminary review of oil and gas possibilitie...,1961,"Sprigg, R.C.",,South Australia. Department of Mines. Open fil...,"Total fiche: 2, Total pages: 65, 0 plans, refe...",Hard Copy Digital,,Petroleum exploration;Geophysics,...,Includes geological logs of the following bore...,Earlier draft of report in Env 8. Differs by c...,OEL00024,Geosurveys of Australia Pty Ltd,,ADELAIDE;6528;6529;MAITLAND;6428;6429;BARKER;6...,Gulf Saint Vincent,Stansbury Basin;St Vincent Basin,Croydon Bore;Pethicks Bore;Inkerman Balaklava ...,,,31/12/1983,1973.0,content envelope tenement st vincents gulf gra...,include geological log follow bore croydon bor...


In [37]:
merged_processed['Abstract'].iloc[5893]

'The principle objective of this project is to determine the timing of the fracture-filling cements in the Warburton Basin utilizing fluid inclusions microthermometry for some representative samples. A secondary objective was to identify possible sources of fluids from which fracture-filling cement are generated. This report forms part of a collaborative project funded by Mine and Energy South Australia (MESA) represented by the National Centre for Petroleum Geology and Geophysics (NCPGG), the University of Adelaide.'

In [39]:
abstract_df_processed['normalised_Abstract'].iloc[5893]

'principle objective project determine timing fracture fill cement warburton basin utilize fluid inclusion microthermometry representative sample secondary objective identify possible source fluid fracture fill cement generate report form part collaborative project fund mine energy south australia mesa represent national centre petroleum geology geophysics ncpgg university adelaide'