In [None]:
import os
import pandas as pd
import json

from tqdm import tqdm

import re
import unicodedata

from nltk.corpus import stopwords
import nltk

from llama_index.core import Settings
from llama_index.core import Document
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core import load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.memory import ChatMemoryBuffer

from matplotlib import pyplot as plt
import torch
print(torch.cuda.is_available())  # Should print True

In [None]:
# Pick the compute device: the GPU when torch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Device " + device)

# Dataset location; `base_path` becomes the working directory.
# On Colab, point it at the mounted Drive folder instead, e.g.
# "/content/drive/MyDrive/Projektmunka Smoking and COVID19".
base_path = "./"
pdf_json_dir = 'document_parses/pdf_json'
pmc_json_dir = 'document_parses/pmc_json'
os.chdir(base_path)

# Fetch the NLTK resources, then build the set of English stop words.
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [None]:
# Load the metadata table; every column is read as a string.
metadata_path = "metadata.csv"
metadata = pd.read_csv(metadata_path, dtype=str)

# Smoking-related keywords (expand as needed). They are matched as
# case-insensitive substrings, so a stem such as "cigarett" also covers
# longer forms of the word.
smoking_keywords = [
    "smoking", "smoker", "smoke", "ecigarett", "cigarett", "tobacco", "cigarette", "nicotine",
    "vaping", "vape", "e-cigarette", "cigar", "weed", "marijuana"
]

# Build the alternation once. Each keyword is escaped so that a future keyword
# containing regex metacharacters (".", "+", "(", ...) is still matched literally.
smoking_pattern = '|'.join(re.escape(keyword) for keyword in smoking_keywords)

# Filter papers where title/abstract contains smoking-related terms.
# Missing titles/abstracts count as "no match" (na=False).
title_matches = metadata["title"].str.lower().str.contains(smoking_pattern, na=False)
abstract_matches = metadata["abstract"].str.lower().str.contains(smoking_pattern, na=False)
filtered_papers = metadata[title_matches | abstract_matches].copy()

print(f"Found {len(filtered_papers)} smoking-related papers")

In [None]:
# Keep only the identifier, the bibliographic fields and the two columns that
# list the parsed JSON files.
columns_to_keep = [
    'cord_uid', 'title', 'abstract', 'publish_time',
    'source_x', 'authors', 'pdf_json_files', 'pmc_json_files',
]

filtered_papers = filtered_papers.loc[:, columns_to_keep]

In [None]:
def extract_body_text(json_path):
    """Extract and concatenate all 'text' fields from 'body_text' in a JSON file.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The paragraph texts joined by single spaces ('' when the file has no
        'body_text' entries), or None when the file cannot be read or parsed,
        or does not have the expected structure.
    """
    try:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale encoding, which would mangle or reject non-ASCII text.
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # Missing/unreadable file, undecodable bytes or invalid JSON
        # (UnicodeDecodeError and json.JSONDecodeError are both ValueErrors).
        return None

    try:
        return ' '.join(para['text'] for para in data.get('body_text', []))
    except (AttributeError, KeyError, TypeError):
        # Unexpected structure: top level is not an object, a paragraph has no
        # 'text' key, or a value is not a string.
        return None

def get_full_text(row):
    """Return the body text of the first usable parsed JSON file for a metadata row.

    PDF parses are tried first, then PMC parses. Each column may list several
    paths separated by '; '; every path is resolved against `base_path`.

    Args:
        row: A metadata row (e.g. a pandas Series) with optional
            'pdf_json_files' and 'pmc_json_files' entries.

    Returns:
        The text returned by `extract_body_text` for the first listed file that
        exists and can be parsed, or None when no such file is found.
    """
    for column in ('pdf_json_files', 'pmc_json_files'):
        listed_paths = row.get(column)
        # Missing values (NaN/None) carry no paths to try.
        if not isinstance(listed_paths, str):
            continue
        for json_path in listed_paths.split('; '):
            full_path = os.path.join(base_path, json_path.strip())
            if not os.path.exists(full_path):
                continue
            text = extract_body_text(full_path)
            # A file that exists but cannot be parsed must not end the search;
            # move on to the next candidate instead.
            if text is not None:
                return text
    return None  # No listed file could be found and parsed

In [None]:
# Register tqdm's `progress_apply` on pandas objects, then extract the body
# text of every filtered paper, one row at a time.
tqdm.pandas(desc="Extracting full text sections")
extracted_text = filtered_papers.progress_apply(get_full_text, axis=1)
filtered_papers['full_text'] = extracted_text

In [None]:
# Print a concise summary of the filtered papers (columns, non-null counts, dtypes).
filtered_papers.info()

In [None]:
# Preview the first rows of the filtered papers.
filtered_papers.head()

In [None]:
# Discard papers that lack a title, an abstract or extracted full text,
# then print a summary of what remains.
required_columns = ['title', 'abstract', 'full_text']
filtered_papers = filtered_papers.dropna(subset=required_columns)
filtered_papers.info()

In [None]:
# Show the first remaining paper as a plain dict so every field can be inspected.
first_paper = filtered_papers.iloc[0]
print(first_paper.to_dict())

In [None]:
# Build one text per paper: title, abstract and body joined by '. '
# (missing parts are treated as empty strings).
title_part = filtered_papers['title'].fillna('')
abstract_part = filtered_papers['abstract'].fillna('')
body_part = filtered_papers['full_text'].fillna('')
filtered_papers['combined_text'] = title_part + '. ' + abstract_part + '. ' + body_part

# Basic statistics on the combined text length (in characters)
filtered_papers['text_length'] = filtered_papers['combined_text'].str.len()
print(filtered_papers['text_length'].describe())

# Example anomaly filter: keep only texts whose length lies within
# [min_length, max_length], both bounds included.
min_length = 200   # adjust as needed
max_length = 30000 # adjust as needed
length_in_range = filtered_papers['text_length'].between(min_length, max_length)
filtered_papers = filtered_papers[length_in_range].copy()

In [None]:
# Preview the first rows of the filtered papers.
filtered_papers.head()

# Data Validation

In [None]:
# Create a separate frame for the data-validation steps so `filtered_papers`
# keeps its original column names. In the new frame the extracted body becomes
# 'article_text' and the combined title + abstract + body text takes over the
# name 'full_text'. Both renames are applied in one non-mutating call.
df = filtered_papers.rename(
    columns={'full_text': 'article_text', 'combined_text': 'full_text'}
)
df

## Validate Keyword Relevance

### Check if documents actually discuss COVID + smoking:

In [None]:
# Define keywords. `is_relevant` looks for each term as a substring of the
# lower-cased text, so the terms must be lowercase.
covid_terms = ["covid", "sars-cov-2", "coronavirus"]
# Reuse the keyword list that was used to filter the metadata.
smoking_terms = smoking_keywords

# Filter rows containing at least 1 COVID + 1 smoking term
def is_relevant(text):
    """Tell whether `text` mentions both a COVID term and a smoking term.

    Terms from `covid_terms` and `smoking_terms` are matched as substrings of
    the lower-cased text.

    Args:
        text: The document text; anything that is not a string is treated as
            not relevant.

    Returns:
        True if at least one COVID term and at least one smoking term occur in
        the text, otherwise False.
    """
    if not isinstance(text, str):
        return False
    # Lower-case once up front instead of once per term.
    lowered = text.lower()
    has_covid = any(term in lowered for term in covid_terms)
    has_smoking = any(term in lowered for term in smoking_terms)
    return has_covid and has_smoking
# Flag every document whose combined text passes the relevance check,
# then report how many were flagged.
relevance_flags = df['full_text'].apply(is_relevant)
df['is_relevant'] = relevance_flags
print(f"Relevant documents: {relevance_flags.sum()}/{len(df)}")

In [None]:
# Plot relevance
# `value_counts()` orders its result by frequency, so the bar order (and with
# it the colour of each bar) would depend on which class is larger. Pin the
# order to [False, True] so red is always "not relevant" and green "relevant";
# a class with no documents is shown with a count of 0.
relevance_counts = df['is_relevant'].value_counts().reindex([False, True], fill_value=0)

plt.figure(figsize=(6, 4))
relevance_counts.plot(kind='bar', color=['red', 'green'])
plt.title("Relevance of Documents (COVID + Smoking)")
plt.xlabel("Is relevant")
plt.ylabel("Number of documents")
plt.show()

## Publication Dates

In [None]:
# Parse publication dates; values that cannot be parsed become NaT.
df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')

# Count the papers per publication year, in chronological order.
papers_per_year = df['publish_time'].dt.year.value_counts().sort_index()

# Plot publications over time
fig, ax = plt.subplots(figsize=(20, 7))
papers_per_year.plot(kind='line', marker='o', ax=ax)
ax.set_title("Publications per Year")
ax.set_xlabel("Year")
ax.set_ylabel("Count")
ax.grid()
plt.show()

## NLP-Based Validation (Topic Coherence)

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Extract top keywords for COVID+smoking docs
relevant_texts = df.loc[df['is_relevant'], 'full_text']
tfidf = TfidfVectorizer(stop_words='english', max_features=50)
tfidf_matrix = tfidf.fit_transform(relevant_texts)

# Rank the vocabulary by mean TF-IDF weight over the relevant documents and
# keep the 10 highest-weighted terms, rather than drawing a random sample of
# the feature names.
mean_weights = pd.Series(
    np.asarray(tfidf_matrix.mean(axis=0)).ravel(),
    index=tfidf.get_feature_names_out(),
)
top_keywords = pd.Series(mean_weights.nlargest(10).index)

print("Top Keywords in Relevant Docs:")
print(top_keywords.tolist())

## Install spaCy (required for Semantic Similarity Validation)

In [None]:
!pip install spacy

## Semantic Similarity Validation

In [None]:
import spacy
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Register `progress_apply` on pandas objects
tqdm.pandas()

# Load the spaCy pipeline that supplies the document vectors
print("Loading spaCy model...")
nlp = spacy.load("en_core_web_lg")

# Reference query, embedded once as a single-row 2-D array so it can be passed
# straight to `cosine_similarity`
query = "Impact of smoking on COVID-19 severity"
query_vec = nlp(query).vector[None, :]

# Score a text against the reference query
def validate_semantic_similarity(text):
    """Return the cosine similarity between `text` and the reference query.

    Both are represented by their spaCy document vectors (`nlp(...).vector`);
    the query vector is the precomputed `query_vec`.

    Args:
        text: The document text to score.

    Returns:
        The cosine similarity as a scalar, or 0 when `text` is not a string.
    """
    if not isinstance(text, str):
        return 0
    doc_vec = nlp(text).vector.reshape(1, -1)
    similarity = cosine_similarity(query_vec, doc_vec)
    return similarity[0][0]

# Score every document against the query, showing a progress bar
print("Calculating semantic similarities...")
semantic_scores = df['full_text'].progress_apply(validate_semantic_similarity)
df['semantic_score'] = semantic_scores

# Rank titles from most to least similar
print("Sorting results...")
result = df.loc[:, ['title', 'semantic_score']].sort_values(by='semantic_score', ascending=False)

# Display the best-scoring titles
print("\nTop results:")
print(result.head())

In [None]:
import os
import pandas as pd
import numpy as np

# Configuration
output_folder = "text_data_chunks"
os.makedirs(output_folder, exist_ok=True)  # Create folder if it doesn't exist

# Number of .txt files the documents are spread across
num_files = 40

# Keep only non-empty string entries (stripped), so the count reported for each
# file matches what is actually written to it.
texts = [
    text.strip()
    for text in df['full_text'].dropna()
    if isinstance(text, str) and text.strip()
]

# Split the positions rather than the texts: this yields the same near-equal
# partition without handing a pandas Series (or long strings) to NumPy.
chunks = [
    [texts[position] for position in positions]
    for positions in np.array_split(np.arange(len(texts)), num_files)
]

# Save each chunk to a separate .txt file
for i, chunk in enumerate(chunks):
    file_path = os.path.join(output_folder, f"text_chunk_{i+1}.txt")

    with open(file_path, 'w', encoding='utf-8') as f:
        for text in chunk:
            f.write(text + '\n\n')  # Add double newline between entries

    print(f"Saved {len(chunk)} entries to {file_path}")

print(f"\nAll files saved to '{output_folder}' folder")

## Top 10 Terms in Data

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count terms made of at least 3 letters, skipping English stop words and
# terms that occur in fewer than 2 documents.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    token_pattern=r'(?u)\b[A-Za-z]{3,}\b',
)
X = vectorizer.fit_transform(df['full_text'].astype(str))

# Total occurrences of each term over all documents
term_counts = X.sum(axis=0)
term_freq = []
for word, idx in vectorizer.vocabulary_.items():
    term_freq.append((word, term_counts[0, idx]))

# Most frequent terms first
term_freq_sorted = sorted(term_freq, key=lambda pair: pair[1], reverse=True)

# Display top 10
print("Top 10 Terms:")
for term, freq in term_freq_sorted[:10]:
    print(f"{term}: {freq}")

In [None]:
import matplotlib.pyplot as plt

# Extract data for plotting: only the 10 most frequent terms.
# `term_freq_sorted` holds the whole vocabulary, so it has to be sliced here;
# otherwise every term would get its own bar.
top_10_terms = term_freq_sorted[:10]
terms = [term for term, _ in top_10_terms]
frequencies = [freq for _, freq in top_10_terms]

# Plot
plt.figure(figsize=(10, 5))
plt.bar(terms, frequencies, color='skyblue')
plt.title("Top 10 Most Frequent Terms in COVID-19/Smoking Literature")
plt.xlabel("Terms")
plt.ylabel("Frequency")
plt.xticks(rotation=45)
plt.show()