In [2]:
# Install required libraries
!pip install pandas tqdm

import pandas as pd
import os
import json
from tqdm import tqdm
from collections import defaultdict

base_path = "./"
#base_path = "/content/drive/MyDrive/Projektmunka Smoking and COVID19"
os.chdir(base_path)





In [3]:
# Load metadata.csv
metadata_path = "metadata.csv"
metadata = pd.read_csv(metadata_path, dtype=str)

# Define smoking-related keywords (expand as needed)
smoking_keywords = [
    "smoking", "smoker", "smoke", "ecigarett", "cigarett",  "tobacco", "cigarette", "nicotine",
    "vaping", "vape", "e-cigarette", "smoker", "cigar", "weed", "marijuana"
]

# Filter papers where title/abstract contains smoking-related terms
filtered_papers = metadata[
    metadata["title"].str.lower().str.contains('|'.join(smoking_keywords), na=False) |
    metadata["abstract"].str.lower().str.contains('|'.join(smoking_keywords), na=False)
].copy()

print(f"Found {len(filtered_papers)} smoking-related papers")


Found 11354 smoking-related papers


In [4]:
columns_to_keep = ['cord_uid', 'title', 'abstract', 'publish_time', 'source_x', 'authors', 'pdf_json_files', 'pmc_json_files']

filtered_papers = filtered_papers[columns_to_keep]

In [6]:
filtered_papers.head()
filtered_papers.to_csv("smoking_related_records_from_cord19_by_antal_svec.csv", index=False)

In [None]:
filtered_papers.info()

In [19]:
def extract_sections_from_json(json_path):
    """Extract sections from JSON files with error handling"""
    sections = defaultdict(list)
    try:
        with open(json_path, 'r') as f:
            data = json.load(f)
            for para in data['body_text']:
                section = para['section'].lower()
                text = para['text']
                if 'intro' in section:
                    sections['introduction'].append(text)
                elif 'method' in section:
                    sections['methods'].append(text)
                elif 'result' in section:
                    sections['results'].append(text)
            if 'back_matter' in data:
                sections['references'] = [item['text'] for item in data['back_matter']]
    except Exception as e:
        # Print only if it's not a "file not found" error
        if "No such file" not in str(e):
            print(f"Error parsing {json_path}: {e}")
    return sections

def get_full_text(row):
    """Handle both PDF and PMC JSON files with priority to PDF"""
    # Try PDF JSON first
    if pd.notna(row['pdf_json_files']):
        for json_path in row['pdf_json_files'].split('; '):
            full_path = os.path.join(pdf_json_dir, json_path.strip())
            if os.path.exists(full_path):
                return extract_sections_from_json(full_path)
    
    # Fallback to PMC JSON
    if pd.notna(row['pmc_json_files']):
        for json_path in row['pmc_json_files'].split('; '):
            full_path = json_path.strip()
            if os.path.exists(full_path):
                return extract_sections_from_json(full_path)
    
    return defaultdict(list)  # Return empty dict if no files found

# Apply with progress bar
tqdm.pandas(desc="Extracting full text sections")
filtered_papers['full_text'] = filtered_papers.progress_apply(get_full_text, axis=1)

# Convert defaultdict to regular dict for better serialization
filtered_papers['full_text'] = filtered_papers['full_text'].apply(dict)

print(f"Successfully extracted full text for {filtered_papers['full_text'].apply(bool).sum()} papers")

Extracting full text sections: 100%|█████████████████████████████████████████████| 11354/11354 [00:17<00:00, 630.82it/s]

Successfully extracted full text for 3961 papers





In [20]:
filtered_papers.head()

Unnamed: 0,cord_uid,title,abstract,publish_time,source_x,authors,pdf_json_files,pmc_json_files,full_text
8,8qnrcgnk,Heme oxygenase-1 and carbon monoxide in pulmon...,"Heme oxygenase-1 (HO-1), an inducible stress p...",2003-08-07,PMC,"Slebos, Dirk-Jan; Ryter, Stefan W; Choi, Augus...",document_parses/pdf_json/faaf1022ccfe93b032c56...,document_parses/pmc_json/PMC193681.xml.json,{'introduction': ['The heme oxygenase-1/carbon...
41,qva0jt86,Relevance of human metapneumovirus in exacerba...,BACKGROUND AND METHODS: Human metapneumovirus ...,2005-12-21,PMC,"Rohde, G; Borg, I; Arinir, U; Kronsbein, J; Ra...",document_parses/pdf_json/4ba79e54ecf81b30b5646...,document_parses/pmc_json/PMC1334186.xml.json,{'methods': ['Three different groups were stud...
43,bnnl700a,Public awareness of risk factors for cancer am...,BACKGROUND: The present study aimed to provide...,2006-01-10,PMC,"Inoue, Manami; Iwasaki, Motoki; Otani, Tetsuya...",document_parses/pdf_json/a78fd1b34372e1e54bf2a...,document_parses/pmc_json/PMC1351169.xml.json,{'methods': ['The study was conducted as a par...
473,ft5wl70x,Involvement of microRNAs in physiological and ...,"To date, at least 900 different microRNA (miRN...",2010-11-23,PMC,"Tomankova, Tereza; Petrek, Martin; Kriegova, Eva",document_parses/pdf_json/b97de55ba907c3b1f3048...,document_parses/pmc_json/PMC3001429.xml.json,{'references': []}
507,1h6jz1h5,Plant Plastid Engineering,Genetic material in plants is distributed into...,2010-11-03,PMC,"Wani, Shabir H.; Haider, Nadia; Kumar, Hitesh;...",document_parses/pdf_json/79979652a864cef3a4134...,document_parses/pmc_json/PMC3048312.xml.json,{'introduction': ['Genetic material in plants ...


In [21]:
# Save curated data to CSV/Parquet for later use
filtered_papers.to_csv("smoking_covid_curated.csv", index=False)