In [13]:
import os

# Read the key from the OPENAI_API_KEY environment variable so a real secret
# never has to be written into the notebook; the original placeholder is kept
# as the fallback value when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'YOUR_OPENAI_API')

In [14]:
import os
from openai import OpenAI

In [26]:
import PyPDF2
import pandas as pd
import tiktoken

def count_tokens(text):
    """Return the number of ``cl100k_base`` tokens in ``text``.

    Args:
        text: The string to measure. A value that is not a string (for example
            ``None`` or a ``NaN`` cell) or an empty string is reported as
            0 tokens instead of being passed to the tokenizer.

    Returns:
        int: Length of the token sequence produced by the ``cl100k_base``
        encoding, or 0 for empty / non-string input.
    """
    # Guard first: the encoder only accepts strings, and an empty string has
    # nothing to encode.
    if not isinstance(text, str) or not text:
        return 0
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

def process_pdf_files(pdf_files, verbose=True):
    """Extract the text of every page of the given PDFs into a DataFrame.

    Each page contributes exactly two rows: its whitespace-normalised text is
    cut at the midpoint into a first and a second half. Pages for which no
    text could be extracted still contribute two rows, with empty content and
    a token count of 0.

    Args:
        pdf_files: Iterable of paths to PDF files.
        verbose: When true (the default), print the raw page text and each
            half as they are produced.

    Returns:
        pandas.DataFrame with the columns ``'file name'``, ``'page number'``
        (1-based), ``'page section'`` (1 or 2), ``'content'`` and ``'tokens'``.
    """
    records = []
    for pdf in pdf_files:
        with open(pdf, 'rb') as pdf_content:
            pdf_reader = PyPDF2.PdfReader(pdf_content)
            for page_number, page in enumerate(pdf_reader.pages, start=1):
                page_text = page.extract_text()
                if verbose:
                    print(page_text)
                if page_text:
                    # Collapse all runs of whitespace into single spaces.
                    normalized = ' '.join(page_text.split())
                    midpoint = len(normalized) // 2
                    # The second half runs to the END of the text. Slicing it
                    # to 2 * midpoint would drop the last character of every
                    # odd-length page.
                    halves = [normalized[:midpoint], normalized[midpoint:]]
                else:
                    # No extractable text on this page: keep placeholder rows.
                    halves = ['', '']
                for section, part in enumerate(halves, start=1):
                    records.append({
                        'file name': pdf,
                        'page number': page_number,
                        'page section': section,
                        'content': part,
                        'tokens': count_tokens(part) if page_text else 0
                    })
                    if verbose and page_text:
                        print(part)
    return pd.DataFrame(records)


In [27]:
# PDFs to process, and the OpenAI client that the helper functions look up
# as the global name `client`.
file_names = [
    './Document/c4nr06660j.pdf',
    './Document/c4nr06660j1.pdf',
]
client = OpenAI(api_key=OPENAI_API_KEY)

# Extract the page text of every PDF into a DataFrame.
df = process_pdf_files(file_names)
print("Initial Data Extraction Complete")

Nanoscale
COMMUNICATION
Cite this: Nanoscale , 2015, 7, 2260
Received 11th November 2014,
Accepted 22nd December 2014
DOI: 10.1039/c4nr06660j
www.rsc.org/nanoscaleMonodisperse Sr –La2O3hybrid nano ﬁbers for
oxidative coupling of methane to synthesize
C2hydrocarbons †
Jianjun Song,a,bYongnan Sun,a,cRongbin Ba,a,bShuangshuang Huang,a,b
Yonghui Zhao,aJun Zhang,aYuhan Sun*aand Yan Zhu*a
The synergistic e ﬀects from combinations of each component ’s
functionality in hybrid Sr –La2O3nanoﬁbers brought about an
improved catalytic behaviour for oxidative coupling of methane
carried out at high temperatures, which cannot be achieved over
the conventional Sr doped La 2O3spherical catalyst.
Hybrid nanocatalysts with di ﬀerent components arranged in a
controlled structure are widely used in industry since they can
provide an entirely novel activity and selectivity for manyimportant chemical reactions viathe coupling between com-
ponents within the hybrids.
1–3This class of hybrid nano-
materials co

In [16]:
def save_df_to_csv(df, filename):
    """Write ``df`` to ``filename`` in CSV format, omitting the index column."""
    df.to_csv(path_or_buf=filename, index=False)

def load_df_from_csv(filename):
    """Load a DataFrame from CSV and restore its ``'embedding'`` column.

    A CSV stores each embedding as the text form of a Python list, so string
    cells in the ``'embedding'`` column are parsed back with
    ``ast.literal_eval``.

    Args:
        filename: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame. If the file has no ``'embedding'`` column (e.g. it
        was saved before embeddings were added) the frame is returned as read.
    """
    # Imported here because no visible cell of the notebook imports `ast`.
    import ast

    df = pd.read_csv(filename)
    if 'embedding' in df.columns:
        # Only string cells are parsed; missing values (NaN) are left alone.
        df['embedding'] = df['embedding'].apply(
            lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
        )

    return df

def add_embedding(df):
    """Attach an ``'embedding'`` column computed from the ``'content'`` column.

    When ``df`` already has an ``'embedding'`` column it is returned untouched.
    Otherwise each ``'content'`` value is sent to the global ``client``'s
    embeddings endpoint (model ``text-embedding-ada-002``) and the result is
    stored on ``df`` itself, which is then returned.
    """
    if 'embedding' not in df.columns:

        def _embed(content_text):
            # One request per row; the first item of the response data holds
            # the embedding.
            result = client.embeddings.create(
                input=content_text,
                model="text-embedding-ada-002",
            )
            return result.data[0].embedding

        df['embedding'] = df['content'].apply(_embed)
    return df

In [17]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def add_similarity(df, given_embedding):
    """Add a ``'similarity'`` column scoring each row against ``given_embedding``.

    Every value of the ``'embedding'`` column is compared with
    ``given_embedding`` through ``cosine_similarity``; the scalar result is
    written to ``df['similarity']`` and the same ``df`` object is returned.
    """
    reference = [given_embedding]
    df['similarity'] = df['embedding'].apply(
        lambda candidate: cosine_similarity([candidate], reference)[0][0]
    )
    return df

def select_top_neighbors(df, top_n=20):
    """Return the rows of ``df`` with the highest ``'similarity'`` values.

    Args:
        df: DataFrame that has a ``'similarity'`` column.
        top_n: Maximum number of rows to return. Defaults to 20.

    Returns:
        A new DataFrame holding at most ``top_n`` rows, ordered by descending
        similarity. ``df`` itself is not reordered.
    """
    ranked = df.sort_values('similarity', ascending=False)
    return ranked.head(top_n)

In [19]:
def Model_OCM(df, max_tokens=3000, verbose=True):
    """Send each row's ``'content'`` to the chat model and store the answer.

    The ``'tokens'`` column is recomputed with ``count_tokens``. Content whose
    token count exceeds ``max_tokens`` is split into consecutive pieces of at
    most ``max_tokens`` tokens; each piece is sent as its own request and the
    answers for one row are joined with newlines, so the result always holds
    exactly one ``'summarized'`` value per row.

    Args:
        df: DataFrame with a ``'content'`` column; it is modified in place.
        max_tokens: Largest number of tokens sent in a single request.
        verbose: When true (the default), print each row's token count.

    Returns:
        The same ``df`` with ``'tokens'`` and ``'summarized'`` columns set.

    Raises:
        ValueError: If ``max_tokens`` is not positive.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")

    df.loc[:, 'tokens'] = df['content'].apply(count_tokens)
    encoding = None  # created only if some row actually needs splitting
    summarized = []
    for _, row in df.iterrows():
        content = row['content']
        token_count = row['tokens']
        if verbose:
            print(token_count)
        if token_count > max_tokens:
            # Split on token boundaries. Slicing the word list with token
            # offsets yields wrongly sized chunks and empty trailing ones.
            if encoding is None:
                encoding = tiktoken.get_encoding("cl100k_base")
            token_ids = encoding.encode(content)
            content_chunks = [
                encoding.decode(token_ids[start:start + max_tokens])
                for start in range(0, len(token_ids), max_tokens)
            ]
        else:
            content_chunks = [content]

        answers = []
        for chunk in content_chunks:
            prompt = f"Context: {chunk} Given tubulation prompt"
            response = client.chat.completions.create(model="gpt-4o", messages=[{"role": "user", "content": prompt}])
            answers.append(response.choices[0].message.content)

        # Exactly one entry per row keeps `summarized` aligned with df.index,
        # even when a row was answered in several chunks.
        if len(answers) == 1:
            summarized.append(answers[0])
        else:
            summarized.append("\n".join(answer or '' for answer in answers))
    df.loc[:, 'summarized'] = pd.Series(summarized, index=df.index)
    return df

In [20]:
def tabulate_ocm_conditions(df):
    """Parse pipe-delimited tables in ``df['summarized']`` into one DataFrame.

    For every string value containing ``"|"`` the first two lines (after
    stripping surrounding whitespace) are skipped as header lines, and each
    remaining line is split on ``"|"``. A line becomes a result row when it
    yields exactly one cell per output column. Rows written with leading and
    trailing pipes (``| a | b |``) are accepted as well. Separator rows
    (cells made only of ``-`` / ``:``) and repeated header rows are ignored
    wherever they appear. Values that are not strings are skipped.

    Args:
        df: DataFrame with a ``'summarized'`` column.

    Returns:
        pandas.DataFrame with the eight OCM condition columns; every cell is
        the stripped text taken from the table.
    """
    columns = [
        'catalyst name', 'catalyst mass', 'GHSV / mL gcat-1 hr-1', 'Flow / mL min-1',
        'Temperature / degree C', '% CH4', '% CH4 conversion', '% C2 selectivity'
    ]
    header_cells = [name.lower() for name in columns]
    separator_chars = set('-:')

    # Collect plain rows and build the frame once at the end; concatenating
    # inside the loop copies the whole result for every row.
    records = []
    for summarized in df['summarized']:
        if not isinstance(summarized, str) or "|" not in summarized:
            continue
        # Assuming the first two lines are headers
        for line in summarized.strip().split("\n")[2:]:
            cells = [cell.strip() for cell in line.split("|")]
            # "| a | b |" splits into an extra empty cell at each end.
            if len(cells) == len(columns) + 2 and not cells[0] and not cells[-1]:
                cells = cells[1:-1]
            if len(cells) != len(columns):
                continue
            if all(cell and set(cell) <= separator_chars for cell in cells):
                continue  # markdown separator row such as ---|---|---
            if [cell.lower() for cell in cells] == header_cells:
                continue  # header repeated further down the text
            records.append(cells)

    return pd.DataFrame(records, columns=columns)


In [21]:
import pandas as pd
# Sample data as it might appear in the DataFrame
sample_data = {
    'summarized': [
        """
        catalyst name | catalyst mass | GHSV / mL gcat-1 hr-1 | Flow / mL min-1 | Temperature / degree C | % CH4 | % CH4 conversion | % C2 selectivity
        -------------|---------------|-----------------------|-----------------|-----------------------|-------|------------------|-----------------
        Sr-La2O3     | 0.2 g         | 72000                 | 240             | 500                   | 75    | 35               | 47
        Mg-CaO       | 0.3 g         | 65000                 | 220             | 450                   | 70    | 30               | 50
        """
    ]
}

# Create a DataFrame
sample_df = pd.DataFrame(sample_data)

# Test the function
result = tabulate_ocm_conditions(sample_df)

# Check the parsed rows explicitly so a regression fails the cell instead of
# relying on someone reading the printed table.
assert result.shape == (2, 8)
assert result['catalyst name'].tolist() == ['Sr-La2O3', 'Mg-CaO']
assert result['% C2 selectivity'].tolist() == ['47', '50']

# Print the result to see if it worked correctly
print(result)


  catalyst name catalyst mass GHSV / mL gcat-1 hr-1 Flow / mL min-1  \
0      Sr-La2O3         0.2 g                 72000             240   
1        Mg-CaO         0.3 g                 65000             220   

  Temperature / degree C % CH4 % CH4 conversion % C2 selectivity  
0                    500    75               35               47  
1                    450    70               30               50  


In [22]:
# Inputs: the PDFs to process, and the OpenAI client that the helper
# functions look up as the global name `client`.
file_names = [
    './Document/c4nr06660j.pdf',
    './Document/c4nr06660j1.pdf',
]
client = OpenAI(api_key=OPENAI_API_KEY)

# Step 1: extract the page text of every PDF into a DataFrame.
df = process_pdf_files(file_names)
print("Initial Data Extraction Complete")

# Step 2: persist the raw extraction.
df.to_csv('extracted_data.csv', index=False)
print("Data saved to CSV.")

# Step 3: add an embedding for every row.
df = add_embedding(df)
print("Embeddings added.")

# Step 4: run every row's content through Model_OCM.
df = Model_OCM(df)
print("Data processed through Model_3.")


Initial Data Extraction Complete
Data saved to CSV.
Embeddings added.
130
86
80
92
153
132
133
121
87
82
37
45
37
49
30
35
18
30
9
8
18
23
Data processed through Model_3.


In [23]:
# Print the DataFrame returned by the pipeline cell for inspection.
print(df)

                     file name  page number  section number  \
0    ./Document/c4nr06660j.pdf            1               1   
1    ./Document/c4nr06660j.pdf            1               2   
2    ./Document/c4nr06660j.pdf            2               1   
3    ./Document/c4nr06660j.pdf            2               2   
4    ./Document/c4nr06660j.pdf            3               1   
5    ./Document/c4nr06660j.pdf            3               2   
6    ./Document/c4nr06660j.pdf            4               1   
7    ./Document/c4nr06660j.pdf            4               2   
8    ./Document/c4nr06660j.pdf            5               1   
9    ./Document/c4nr06660j.pdf            5               2   
10  ./Document/c4nr06660j1.pdf            1               1   
11  ./Document/c4nr06660j1.pdf            1               2   
12  ./Document/c4nr06660j1.pdf            2               1   
13  ./Document/c4nr06660j1.pdf            2               2   
14  ./Document/c4nr06660j1.pdf            3            

In [24]:
# Persist the processed DataFrame through the notebook's own CSV helper.
save_df_to_csv(df, 'filename.csv')
