# PoC to extract paragraphs from lines

In [13]:
# pip stuff needed
#%pip install openpyxl
#%pip install nltk

In [14]:
# Imports
import pandas as pd
import re
import nltk
# Fetch the Punkt sentence-tokenizer data; define_passages loads
# 'tokenizers/punkt/english.pickle' from it.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aleksander.jakobsen\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
# Load the Diskos report export into a DataFrame and list its columns.
# NOTE(review): the path is relative, so this presumably expects the kernel to be
# started from the notebook's own directory — confirm before moving the notebook.
path_to_file = "../data/Norway - Diskos reports_5.xlsx"
df = pd.read_excel(path_to_file)
columns = df.columns
columns

Index(['_id', 'filename', 'page', 'content', 'possible_language', 'langdetect',
       'content_could_be_natural_language', 'content_scrubbed_light',
       'content_scrubbed_light_could_be_natural_language', 'WELLBORE NAME',
       'WELL NAME', 'TYPE', 'INFO ITEM GROUP TYPE', 'INFO ITEM TYPE', 'TITLE',
       'DESCRIPTION', 'CREATOR BA NAME', 'Short Dataset Id',
       'Required Dataset', 'ROW CREATED DATE', 'PUBLIC', 'PUBLIC ACCESS',
       'RELEASE DATE', 'FILE FORMAT', 'Size', 'DATA ORGANIZATION',
       'DATA COLLECTION', 'CREATION PROCESS', 'DATA DOMAIN', 'REMARK',
       'UNLOAD FILE PATH', 'UNLOAD FILE NAME', 'INFORMATION ITEM ID',
       'Dataset Id', 'Last Modified', 'Update Time'],
      dtype='object')

In [16]:
# Check the column dtypes; the natural-language flag used as a filter in base_df is stored as bool.
df.dtypes

_id                                                  object
filename                                             object
page                                                  int64
content                                              object
possible_language                                    object
langdetect                                           object
content_could_be_natural_language                      bool
content_scrubbed_light                               object
content_scrubbed_light_could_be_natural_language       bool
WELLBORE NAME                                        object
WELL NAME                                            object
TYPE                                                 object
INFO ITEM GROUP TYPE                                 object
INFO ITEM TYPE                                       object
TITLE                                                object
DESCRIPTION                                          object
CREATOR BA NAME                         

Filters applied to the rows:
- "INFO ITEM TYPE" = "DISCOVERY_EVALUATION_REPORT" (selected from the result, after passages have been built for every item type)
- "content_could_be_natural_language" = True
- "possible_language" = "en"



In [24]:
def concatonate_docs(df):
    """
    Merge all 'content' values that share an '_id' into one string per document.

    Values are cast to str and glued together in row order with no separator
    between them.

    params: df (DataFrame) - frame holding at least the columns '_id' and 'content'
    returns: DataFrame with one row per '_id' and the columns '_id', 'concatenated_content'
    """
    def _glue(chunks):
        # Cast first so non-string cells do not break the join
        return ''.join(chunks.astype(str))

    per_document = df.groupby('_id')['content'].apply(_glue)
    # Turn the '_id' index back into a regular column and name the joined text
    return per_document.reset_index(name='concatenated_content')

def define_passages(df, global_passage_id, passage_length, tokenizer=None):
    """
    Splits each document into passages of consecutive sentences.

    Expects df on format "_id", "concatenated_content"

    Params:
    - df (DataFrame): DataFrame with the documents concatenated by '_id'
    - global_passage_id (int): Starting passage ID for the entire data processing
    - passage_length (int): The number of sentences in each passage (must be >= 1);
      the last passage of a document may hold fewer sentences
    - tokenizer (optional): Object exposing ``tokenize(text)`` that returns a list of
      sentences. When omitted, NLTK's English Punkt model is loaded.

    Returns:
    - DataFrame with the columns "_id", "Passage_id", "passage"
    - New global_passage_id after processing the given DataFrame

    Raises:
    - ValueError: if passage_length is smaller than 1
    """
    # A step of 0 makes range() fail with an unrelated message and a negative step
    # silently yields no passages, so reject both up front.
    if passage_length < 1:
        raise ValueError(f"passage_length must be a positive integer, got {passage_length!r}")

    # Initialize the tokenizer only when the caller did not supply one
    if tokenizer is None:
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    passages_data = []

    # Iterate through each document; only these two columns are needed
    for doc_id, content in zip(df['_id'], df['concatenated_content']):
        sentences = tokenizer.tokenize(content)

        # Consecutive, non-overlapping windows of passage_length sentences
        for start in range(0, len(sentences), passage_length):
            passage = ' '.join(sentences[start:start + passage_length])
            passages_data.append([doc_id, global_passage_id, passage])
            global_passage_id += 1

    passages_df = pd.DataFrame(passages_data, columns=['_id', 'Passage_id', 'passage'])
    return passages_df, global_passage_id


def base_df(df, passage_length):
    """
    Process the DataFrame to return passages for all unique "INFO ITEM TYPE"

    Only rows with content_could_be_natural_language == True and
    possible_language == 'en' contribute to the passages.

    Params:
    - df (DataFrame): Original DataFrame with different "INFO ITEM TYPE"s
    - passage_length (int): Number of sentences that make up a passage

    Returns:
    - DataFrame with "INFO_ITEM_TYPE", "_id", "Passage_id", "passage".
      Passage ids start at 0 and keep increasing across item types. The frame is
      empty (same columns) when no row passes the filters.
    """
    output_columns = ['INFO_ITEM_TYPE', '_id', 'Passage_id', 'passage']

    # The language / natural-language filter is identical for every item type,
    # so evaluate it once instead of once per type.
    eligible = df[df['content_could_be_natural_language'].eq(True) &
                  df['possible_language'].eq('en')]

    global_passage_id = 0
    result_dfs = []  # List to hold intermediate DataFrames

    # Walk the types in order of first appearance in the unfiltered frame so the
    # passage numbering order is unchanged; a missing type can never match a row.
    for item_type in df['INFO ITEM TYPE'].dropna().unique():
        df_filtered = eligible[eligible['INFO ITEM TYPE'] == item_type].reset_index(drop=True)
        if df_filtered.empty:
            continue

        concat_df = concatonate_docs(df_filtered)
        passage_df, global_passage_id = define_passages(concat_df, global_passage_id, passage_length)

        # Keep empty frames out of pd.concat: they add no rows, and leaving them
        # out keeps the result dtypes independent of them.
        if passage_df.empty:
            continue

        # Add back the 'INFO ITEM TYPE' to be part of the result DataFrame
        passage_df['INFO_ITEM_TYPE'] = item_type
        result_dfs.append(passage_df)

    # pd.concat raises on an empty list, so return an empty, well-formed frame instead
    if not result_dfs:
        return pd.DataFrame(columns=output_columns)

    final_df = pd.concat(result_dfs, ignore_index=True)

    # Reorder columns to match the desired format
    return final_df[output_columns]

In [21]:
# Build passages of 10 sentences each for every "INFO ITEM TYPE" in df.
# NOTE(review): this cell's execution count (21) is lower than that of the cell defining
# base_df (24) — re-run top to bottom on a fresh kernel to confirm the result is current.
desired_df = base_df(df,passage_length=10) 

In [25]:
# `desired_df` is the passage table returned by base_df; keep only the passages
# whose INFO_ITEM_TYPE is DISCOVERY_EVALUATION_REPORT and display them.
disc_eval_df  = desired_df[desired_df['INFO_ITEM_TYPE'] == 'DISCOVERY_EVALUATION_REPORT']
disc_eval_df

Unnamed: 0,INFO_ITEM_TYPE,_id,Passage_id,passage
130481,DISCOVERY_EVALUATION_REPORT,59d3b7de417e13085ff1dbee2ce826ff2fe23165,130481,V FORMATION TESTER S.P.E. PL N5 Ended O.O.H.F....
130482,DISCOVERY_EVALUATION_REPORT,59d3b7de417e13085ff1dbee2ce826ff2fe23165,130482,14 mn Sampling chamber pressure reading at sur...
130483,DISCOVERY_EVALUATION_REPORT,59d3b7de417e13085ff1dbee2ce826ff2fe23165,130483,trip ? Reason for testing: Oil sample for PVT ...
130484,DISCOVERY_EVALUATION_REPORT,601b0da72cc6a2ec5015e66636cd167a4d3802fb,130484,LIST OF CONTENT Abstract I. Introduction 2. Ar...
130485,DISCOVERY_EVALUATION_REPORT,601b0da72cc6a2ec5015e66636cd167a4d3802fb,130485,Appendix C: Core listing + core log. Appendix ...
...,...,...,...,...
130703,DISCOVERY_EVALUATION_REPORT,fc177ae8d5cf8ff470803445387f91e294ee0df7,130703,Analysts Name G. W. -COOPERasp lea alio Immo a...
130704,DISCOVERY_EVALUATION_REPORT,fc177ae8d5cf8ff470803445387f91e294ee0df7,130704,- ' - . - . . - . Average Range Uncorr for ...
130705,DISCOVERY_EVALUATION_REPORT,fc177ae8d5cf8ff470803445387f91e294ee0df7,130705,-... - . . . iIMO II= OM NM MI 11111 MI Mill M...
130706,DISCOVERY_EVALUATION_REPORT,fc177ae8d5cf8ff470803445387f91e294ee0df7,130706,2C 4814m 28.6.76 08.00-12.30 - - 10 litres mud...


In [None]:
# def concatonate_docs(df):
#     """
#     Concatonates lines with same _id
#     params:
#     returns:
#     """
#     grouped_content = df.groupby('_id')['content'].apply(lambda x: ''.join(x.astype(str)))
#     # Reset the index to turn the Series back into a DataFrame
#     result_df = grouped_content.reset_index(name='concatenated_content')
#     return result_df

# def define_passages(df, global_passage_id, passage_length):
#     """
#     Splits a document into passages of sentences.
    
#     Expects df on format "_id", "concatenated_content"
    
#     Params:
#     - df: DataFrame with the documents concatenated by '_id'
#     - passage_size: The number of sentences in each passage (default is 10)

#     Returns:
#     - DataFrame with the columns "_id", "concatenated_content", "Passage_id", "passage"
#     """
#     # Initialize the tokenizer
#     tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

#     # Prepare a list to store the passages data
#     passages_data = []


#     # Iterate through each document
#     for index, row in df.iterrows():
#         doc_id = row['_id']
#         content = row['concatenated_content']
        
#         # Tokenize the document into sentences
#         sentences = tokenizer.tokenize(content)

#         # Split sentences into passages
#         for i in range(0, len(sentences), passage_size):
#             passage = ' '.join(sentences[i:i+passage_size])
#             passages_data.append([doc_id, global_passage_id, passage])
#             global_passage_id += 1

#     # Create a new DataFrame with the passages data
#     passages_df = pd.DataFrame(passages_data, columns=['_id', 'Passage_id', 'passage'])

#     return passages_df, global_passage_id


# def  base_df(df,passage_length):

#     #filter input df
#     df_filtered =  df[(df['INFO ITEM TYPE'] == 'DISCOVERY_EVALUATION_REPORT') & (df['content_could_be_natural_language'] == True) & (df['possible_language'] == 'en')].reset_index()
#     concat_df = concatonate_docs(df_filtered)
#     global_passage_id = 0
#     passage_df, global_passage_id = define_passages(concat_df,global_passage_id=global_passage_id,passage_length=passage_length)
#     return passage_df
