# Data pre-processing

## Split all the papers

In [13]:
import re
import pandas as pd

In [22]:
with open("C:/Users/showr/OneDrive - Johns Hopkins/Documents/GitHub/lit_review_LLM/abstract.txt", 'r',encoding='utf-8') as file:
    text = file.read()

In [147]:
pd.set_option('display.max_colwidth', None)

In [109]:
text[:2000]

'1. Continuum (Minneap Minn). 2017 Feb;23(1, Cerebrovascular Disease):15-39. doi: \n10.1212/CON.0000000000000416.\n\nStroke Epidemiology and Risk Factor Management.\n\nGuzik A, Bushnell C.\n\nPURPOSE OF REVIEW: Death from stroke has decreased over the past decade, with \nstroke now the fifth leading cause of death in the United States. In addition, \nthe incidence of new and recurrent stroke is declining, likely because of the \nincreased use of specific prevention medications, such as statins and \nantihypertensives. Despite these positive trends in incidence and mortality, \nmany strokes remain preventable. The major modifiable risk factors are \nhypertension, diabetes mellitus, tobacco smoking, and hyperlipidemia, as well as \nlifestyle factors, such as obesity, poor diet/nutrition, and physical \ninactivity. This article reviews the current recommendations for the management \nof each of these modifiable risk factors.\nRECENT FINDINGS: It has been documented that some blood pressur

In [24]:
# Split the content based on double newlines followed by a number and a period
papers = re.split(r'\n\n\d+\. ', text)

# Paper1 will have an extra '1. '
papers[0] = papers[0][3:]

In [25]:
papers[1]

'J Physiol. 2020 Nov;598(22):5039-5062. doi: 10.1113/JP279754. Epub 2020 Oct 12.\n\nEffects of e-cigarettes and vaping devices on cardiac and pulmonary physiology.\n\nTsai M(1), Byun MK(2)(3)(4), Shin J(2)(3), Crotty Alexander LE(2)(3).\n\nAuthor information:\n(1)Division of Pulmonary, Critical Care and Sleep Medicine, Department of \nInternal Medicine, The Ohio State University, Columbus, OH, USA.\n(2)Pulmonary and Critical Care Section, VA San Diego Healthcare System, La \nJolla, CA, USA.\n(3)Division of Pulmonary, Critical Care and Sleep Medicine, Department of \nMedicine, University of California San Diego (UCSD), La Jolla, CA, USA.\n(4)Division of Pulmonology, Department of Internal Medicine, Gangnam Severance \nHospital, Yonsei University College of Medicine, Seoul, South Korea.\n\nE-cigarette aerosols are exceedingly different from conventional tobacco smoke, \ncontaining dozens of chemicals not found in cigarette smoke. It is highly likely \nthat chronic use of e-cigarettes wil

## Extraction


In [26]:
def extract_paper_info(text):
    # Initialize a list to store the extracted information
    extracted_info = []

     # Process each paper individually
    for paper in papers:
        # Find all occurrences of double newlines
        double_newlines = [match.start() for match in re.finditer(r'\n\n', paper)]

        # Extract title
        title = "Title not found"
        if len(double_newlines) > 1:
            title = paper[double_newlines[0]:double_newlines[1]].strip()

        # Extract authors
        authors = "Authors not found"
        if len(double_newlines) > 2:
            authors = paper[double_newlines[1]:double_newlines[2]].strip()

        # Extract DOI
        doi_match = re.search(r'\nDOI:\s*(10\.\d{4,9}/[-._;()/:A-Z0-9]+)', paper, re.IGNORECASE)
        doi = doi_match.group(1).strip() if doi_match else "DOI not found"

        # Extract abstract
        abstract = "Abstract not found"
        if '\n\nCopyright' in paper:
            # Find the position of 'Copyright'
            copyright_position = paper.find('\n\nCopyright')
            # Find the last \n\n before the 'Copyright'
            last_double_newline_before_copyright = paper.rfind('\n\n', 0, copyright_position)
            if last_double_newline_before_copyright != -1:
                abstract = paper[last_double_newline_before_copyright:copyright_position].strip()

        elif '\n\n©' in paper:
            # Find the position of '©'
            copyright_position = paper.find('\n\n©')
            # Find the last \n\n before the '©'
            last_double_newline_before_copyright = paper.rfind('\n\n', 0, copyright_position)
            if last_double_newline_before_copyright != -1:
                abstract = paper[last_double_newline_before_copyright:copyright_position].strip()

        elif '\n\nDOI:' in paper:
            # Find the position of 'DOI:'
            doi_position = paper.find('\n\nDOI: ')
            # Find the last \n\n before the 'DOI:'
            last_double_newline_before_doi = paper.rfind('\n\n', 0, doi_position)
            if last_double_newline_before_doi != -1:
                abstract = paper[last_double_newline_before_doi:doi_position].strip()

        # Remove any newline characters in the abstract
        abstract = re.sub(r'\n', ' ', abstract)

        if 'Author information: ' in abstract:
          abstract = 'Abstract not found'

        elif 'Comment ' in abstract:
          abstract = 'Abstract not found'

        elif len(abstract.split()) < 5:
          abstract = 'Abstract not found'

        # Append the extracted information to the list
        extracted_info.append({
            "Title": title,
            "Authors": authors,
            "DOI": doi,
            "Abstract": abstract
        })

    return extracted_info

In [40]:
# Extract information
extracted_info = extract_paper_info(text)

# Create a DataFrame from the list of extracted information
df = pd.DataFrame(extracted_info)

# Print the DataFrame
df.loc[400:500]

Unnamed: 0,Title,Authors,DOI,Abstract
400,Evaluation of the Tobacco Heating System 2.2. ...,"Oviedo A(1), Lebrun S(2), Kogel U(2), Ho J(1),...",10.1016/j.yrtph.2016.11.004,The toxicity of a mentholated version of the T...
401,Electronic Cigarettes as a Cause of Stuttering...,"Alzahrani AM(1), Basalelah JH(1), Alarifi MS(1...",10.12659/AJCR.935716,BACKGROUND Having painful intermittent penile ...
402,Association of 1 Vaping Session With Cellular ...,"Kelesidis T(1), Tran E(2), Nguyen R(2), Zhang ...",10.1001/jamapediatrics.2021.2351,This randomized clinical crossover trial evalu...
403,Cardiovascular injury induced by tobacco produ...,"Conklin DJ(1), Schick S(2), Blaha MJ(3), Carll...",10.1152/ajpheart.00591.2018,Although substantial evidence shows that smoki...
404,PBPK modeling characterization of potential ac...,"More SL(1), Thornton SA(2), Maskrey JR(2), Sha...",10.1080/08958378.2020.1720867,Objective: Ethanol is used as a solvent for fl...
...,...,...,...,...
496,Saliva cotinine levels in users of electronic ...,"Etter JF, Bullen C.",10.1183/09031936.00066011,Abstract not found
497,Human vasculature-on-a-chip with macrophage-me...,"Ohashi K(1), Hayashida A(2), Nozawa A(3), Mats...",10.1016/j.tiv.2023.105582,Heated tobacco products (HTPs) are expected to...
498,A new challenge: suicide attempt using nicotin...,"Schipper EM(1), de Graaff LC, Koch BC, Brkic Z...",10.1111/bcp.12495,Abstract not found
499,Case 38-2019: A 20-Year-Old Man with Dyspnea a...,"Hallowell RW(1), Feldman MB(1), Little BP(1), ...",10.1056/NEJMcpc1909628,Abstract not found


In [55]:
df.to_csv('extracted.csv',index=False)

In [170]:
df1 = df.loc[400:420]

# LLM

In [165]:
from openai import OpenAI
from key import my_sk

In [171]:
# Function to get GPT response
def get_gpt_response(abstract):
    client = OpenAI(api_key = my_sk)
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[{"role": "user", "content": f"Describe the study type with only!!!!!!! 5 words: \n\n{abstract}"}],
        max_tokens=6,
        stream=False
    )
    return response.choices[0].message.content

In [172]:
df1.loc[:, 'Study_Type'] = df1['Abstract'].apply(get_gpt_response)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.loc[:, 'Study_Type'] = df1['Abstract'].apply(get_gpt_response)


In [173]:
pd.DataFrame(df1.loc[:, 'Study_Type'])

Unnamed: 0,Study_Type
400,Inhalation toxicity study on
401,Case Report on Stuttering
402,Randomized clinical crossover trial.
403,Cardiovascular injury biomarkers
404,Pharmacokinetic modeling of
405,Cross-sectional observational study with regression
406,Short-term tobacco product-switching
407,Forensic case report with autopsy
408,Prospective observational study on biom
409,Workshop on biomarkers of


In [169]:
df1.to_csv('output.csv',index=False)