# PubMed Baseline Search for Quaternary Ammonium Compounds

Before running this notebook, ensure the PubMed Baseline database has been downloaded locally. 
The following repository downloads it quickly using parallel processing: https://github.com/ScottCoffin/pubmed-baseline-mirror

This notebook:
1. Unpacks PubMed `.xml.gz` baseline files.
2. Loads a search term list from a `.txt` file.
3. Searches for matches using regular expressions.
4. Optionally extracts article metadata.
5. Saves results to a CSV file.


In [None]:
import os
import gzip
import shutil
import re
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from pathlib import Path


In [None]:
# Locations used throughout the notebook (relative paths).
_mirror_data_root = Path("pubmed-baseline-mirror") / "data"

# Directory holding the downloaded, still-compressed baseline files.
source_dir = _mirror_data_root / "downloads"
# Directory that receives the decompressed files.
output_dir = _mirror_data_root / "unpacked"
# Plain-text file with the search terms, one per line.
search_terms_file = Path("search_terms.txt")

# Create the unpack destination (and any missing parents) if it is not there yet.
output_dir.mkdir(exist_ok=True, parents=True)

## Unpack

In [4]:
from tqdm import tqdm

# Unpack .xml.gz files with progress reporting.
# Sorted so the files are processed in the same order on every run.
gz_files = sorted(f for f in source_dir.iterdir() if f.suffix == ".gz")

for fname in tqdm(gz_files, desc="Unpacking .xml.gz files"):
    # "name.xml.gz" -> "name.xml" inside output_dir.
    target_path = output_dir / fname.with_suffix('').name
    if target_path.exists():
        continue  # already unpacked on a previous run

    # Decompress into a temporary sibling file and rename it only once the copy
    # has finished. If the run is interrupted, no truncated file is left under
    # the final name, so the exists() check above cannot skip a half-written
    # file on the next run.
    partial_path = target_path.with_name(target_path.name + ".part")
    try:
        with gzip.open(fname, 'rb') as f_in, open(partial_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        partial_path.replace(target_path)
    finally:
        # Only still present when decompression failed or was interrupted.
        if partial_path.exists():
            partial_path.unlink()


Unpacking .xml.gz files: 100%|██████████| 1274/1274 [16:41<00:00,  1.27it/s] 


## Load Search Terms

In [5]:
# Load search terms from .txt file and build regex.
# "utf-8-sig" reads plain UTF-8 too, but also strips a leading byte-order mark
# so it cannot end up attached to the first search term.
with open(search_terms_file, "r", encoding="utf-8-sig") as f:
    stripped_lines = (line.strip() for line in f)
    # Skip blank lines and "#" comment lines.
    query_terms = [term for term in stripped_lines if term and not term.startswith("#")]

if not query_terms:
    # An empty alternation compiles to a pattern that matches every string,
    # which would silently report every file as a hit.
    raise ValueError(f"No search terms found in {search_terms_file}")

# Regex alternation takes the first alternative that matches at a position, so
# longer terms are placed first; otherwise a term that is a prefix of another
# term would always win and the longer one would never be reported.
_ordered_terms = sorted(set(query_terms), key=lambda term: (-len(term), term))
search_pattern = re.compile("|".join(re.escape(term) for term in _ordered_terms), re.IGNORECASE)
print(f"Loaded {len(query_terms)} search terms.")

Loaded 1 search terms.


## Run Search

In [None]:
from multiprocessing.dummy import Pool as ThreadPool
from tqdm import tqdm
import re

# Load search terms (repeated here so this cell can be run on its own).
# "utf-8-sig" reads plain UTF-8 too, but also strips a leading byte-order mark.
with open(search_terms_file, "r", encoding="utf-8-sig") as f:
    stripped_lines = (line.strip() for line in f)
    # The comment check is done on the stripped line so that indented "#"
    # comments are skipped as well instead of becoming search terms.
    query_terms = [term for term in stripped_lines if term and not term.startswith("#")]

if not query_terms:
    # An empty alternation compiles to a pattern that matches every string,
    # which would silently report every file as a hit.
    raise ValueError(f"No search terms found in {search_terms_file}")

# Longer terms first: regex alternation takes the first alternative that
# matches, so a term that is a prefix of another must come after it.
_ordered_terms = sorted(set(query_terms), key=lambda term: (-len(term), term))
search_pattern = re.compile("|".join(re.escape(term) for term in _ordered_terms), re.IGNORECASE)

# Collect XML files (as strings, sorted so every run scans them in the same order)
xml_files = sorted(str(f) for f in output_dir.glob("*.xml"))

def search_file(filepath, pattern=None):
    """Return ``filepath`` if the file's text matches the search pattern.

    Args:
        filepath: Path of the file to scan. It is read as UTF-8; bytes that
            cannot be decoded are dropped.
        pattern: Compiled regular expression to search with. When omitted,
            the notebook-level ``search_pattern`` is used.

    Returns:
        ``filepath`` unchanged when the pattern occurs anywhere in the file,
        otherwise ``None``. ``None`` is also returned when the file cannot be
        processed; in that case the error is printed instead of being
        swallowed, so a failed file is not mistaken for a file without hits.
    """
    if pattern is None:
        pattern = search_pattern
    try:
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            # The whole file is read at once so terms spanning several lines
            # are still found.
            contents = f.read()
        if pattern.search(contents):
            return filepath
    except Exception as e:
        # Never raise from a worker: report the problem and carry on.
        print(f"Error processing {filepath}: {e}")
    return None

# Scan the files on a pool of 8 worker threads (roughly 4–8 is reasonable,
# depending on the system). The progress bar advances as results are consumed.
with ThreadPool(8) as pool:
    search_jobs = pool.imap(search_file, xml_files)
    progress = tqdm(search_jobs, total=len(xml_files), desc="Threaded search")
    results = list(progress)

# Keep only the truthy results, i.e. the paths of files that matched.
matched_files = list(filter(None, results))
print(f"✅ Found {len(matched_files)} matching files.")


Threaded search:  55%|█████▍    | 696/1274 [09:39<23:06,  2.40s/it]  

In [None]:
# Optional: extract article metadata
articles = []

for fname in tqdm(matched_files, desc="Extracting article data"):
    # Entries of matched_files already include output_dir (they come from
    # output_dir.glob), so they are opened as-is. Joining them onto output_dir
    # again would duplicate the directory prefix of a relative path.
    with open(fname, "r", encoding="utf-8", errors="ignore") as f:
        soup = BeautifulSoup(f.read(), "lxml")

    # Tag names are looked up in lower case because the "lxml" HTML parser
    # lower-cases them while parsing.
    for article in soup.find_all("pubmedarticle"):
        # A file is in matched_files when the pattern occurs anywhere in it;
        # other articles in the same file may not mention any term. Keep only
        # the articles whose own text matches.
        if not search_pattern.search(article.get_text(" ")):
            continue

        title = article.find("articletitle")
        abstract = article.find("abstract")
        pmid = article.find("pmid")
        articles.append({
            "pmid": pmid.text if pmid else None,
            "title": title.text if title else "",
            "abstract": abstract.text if abstract else ""
        })

# Explicit columns keep the CSV header stable even when nothing matched.
df = pd.DataFrame(articles, columns=["pmid", "title", "abstract"])
df.to_csv("pubmed_qac_matches.csv", index=False)
print(f"Saved {len(df)} matched articles to pubmed_qac_matches.csv.")


# 🔬 Advanced Analysis & Insights

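The cells below extract richer metadata from the matched files and then explore publication trends, compound mentions, journals, text topics, and author collaboration networks.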

In [None]:
# Install additional packages for advanced analysis
import subprocess
import sys

def install_package(package):
    """Install ``package`` with pip, using the interpreter that runs this kernel.

    A success or failure line is printed; a failing pip call is reported
    rather than raised.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError:
        print(f"❌ Failed to install {package}")
    else:
        print(f"✅ {package} installed successfully")

# Packages to install for the advanced-analysis cells.
packages = [
    "wordcloud",
    "plotly",
    "networkx",
    "scikit-learn",
    "textblob",
    "seaborn",
    "matplotlib",
    "nltk",
]

# Install one at a time so a single failure does not stop the rest.
for package in packages:
    install_package(package)

In [None]:
# Import libraries for advanced analysis
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from collections import Counter, defaultdict
from wordcloud import WordCloud
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE
from textblob import TextBlob
import warnings
# Suppress all warnings for the rest of the session.
warnings.simplefilter('ignore')

# Plot appearance: matplotlib's "default" style sheet, then seaborn's "husl"
# colour palette (set second so the style sheet does not reset it).
plt.style.use('default')
sns.set_palette("husl")

In [None]:
# 📊 Enhanced Metadata Extraction
print("🔍 Extracting enhanced metadata from matched articles...")

# Column layouts are declared up front so each DataFrame below has its columns
# even when no article was extracted (an empty frame without columns would
# raise KeyError as soon as a column is selected).
_ENHANCED_COLUMNS = [
    "pmid", "title", "abstract", "year", "journal", "authors", "author_count",
    "keywords", "mesh_terms", "text_length", "qac_mention_count",
]
_QAC_MENTION_COLUMNS = ["pmid", "compound", "year", "journal"]
_AUTHOR_COLUMNS = ["pmid", "author", "year"]
_JOURNAL_COLUMNS = ["pmid", "journal", "year"]


def _extract_year(article):
    """Return the publication year of a parsed article as an int, or None.

    Looks at the article's "pubdate" element first and its "articledate"
    element second. Within each, a "year" child is preferred; if there is none
    (or it is not a number), the first four-digit year found in a
    "medlinedate" child is used instead.
    """
    for date_tag in ("pubdate", "articledate"):
        date_elem = article.find(date_tag)
        if date_elem is None:
            continue
        year_elem = date_elem.find("year")
        if year_elem is not None:
            try:
                return int(year_elem.text)
            except ValueError:
                pass  # fall through to the other sources
        medline_elem = date_elem.find("medlinedate")
        if medline_elem is not None:
            year_match = re.search(r"\b(?:1[89]|20)\d{2}\b", medline_elem.text)
            if year_match:
                return int(year_match.group(0))
    return None


enhanced_articles = []
qac_mentions = []
author_data = []
journal_data = []

for fname in tqdm(matched_files, desc="Enhanced extraction"):
    filepath = Path(fname)
    try:
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            soup = BeautifulSoup(f.read(), "lxml")

        for article in soup.find_all("pubmedarticle"):
            # Basic info
            title_elem = article.find("articletitle")
            abstract_elem = article.find("abstract")
            pmid_elem = article.find("pmid")

            title = title_elem.text if title_elem is not None else ""
            abstract = abstract_elem.text if abstract_elem is not None else ""
            pmid = pmid_elem.text if pmid_elem is not None else None

            if not title and not abstract:
                continue

            # A file is in matched_files when the pattern occurs anywhere in
            # it; other articles in the same file may not mention any term.
            # Skip those so the analyses below only cover matching articles.
            if not search_pattern.search(article.get_text(" ")):
                continue

            # Enhanced metadata
            year = _extract_year(article)
            journal = ""
            authors = []
            keywords = []
            mesh_terms = []

            # Extract journal
            journal_elem = article.find("journal")
            if journal_elem is not None:
                journal_title_elem = journal_elem.find("title")
                if journal_title_elem is not None:
                    journal = journal_title_elem.text

            # Extract authors (entries without a last name are skipped)
            author_list = article.find("authorlist")
            if author_list is not None:
                for author in author_list.find_all("author"):
                    lastname = author.find("lastname")
                    forename = author.find("forename")
                    if lastname is not None:
                        name = lastname.text
                        if forename is not None:
                            name = f"{forename.text} {name}"
                        authors.append(name)
                        author_data.append({
                            'pmid': pmid,
                            'author': name,
                            'year': year
                        })

            # Extract keywords
            keyword_list = article.find("keywordlist")
            if keyword_list is not None:
                for keyword in keyword_list.find_all("keyword"):
                    keywords.append(keyword.text)

            # Extract MeSH terms
            mesh_list = article.find("meshheadinglist")
            if mesh_list is not None:
                for mesh in mesh_list.find_all("meshheading"):
                    descriptor = mesh.find("descriptorname")
                    if descriptor is not None:
                        mesh_terms.append(descriptor.text)

            # Find QAC mentions in title + abstract. The text is lower-cased so
            # the same compound is always recorded with the same spelling.
            full_text = f"{title} {abstract}".lower()
            qac_matches = search_pattern.findall(full_text)

            for match in qac_matches:
                qac_mentions.append({
                    'pmid': pmid,
                    'compound': match,
                    'year': year,
                    'journal': journal
                })

            # Store enhanced article data
            enhanced_articles.append({
                "pmid": pmid,
                "title": title,
                "abstract": abstract,
                "year": year,
                "journal": journal,
                "authors": "; ".join(authors),
                "author_count": len(authors),
                "keywords": "; ".join(keywords),
                "mesh_terms": "; ".join(mesh_terms),
                "text_length": len(full_text),
                "qac_mention_count": len(qac_matches)
            })

            journal_data.append({
                'pmid': pmid,
                'journal': journal,
                'year': year
            })

    except Exception as e:
        # Report the file and move on to the next one.
        print(f"Error processing {filepath}: {e}")
        continue

# Create enhanced dataframes
df_enhanced = pd.DataFrame(enhanced_articles, columns=_ENHANCED_COLUMNS)
df_qac_mentions = pd.DataFrame(qac_mentions, columns=_QAC_MENTION_COLUMNS)
df_authors = pd.DataFrame(author_data, columns=_AUTHOR_COLUMNS)
df_journals = pd.DataFrame(journal_data, columns=_JOURNAL_COLUMNS)

print(f"✅ Enhanced extraction complete!")
print(f"📄 Articles: {len(df_enhanced)}")
print(f"💊 QAC mentions: {len(df_qac_mentions)}")
print(f"👥 Author entries: {len(df_authors)}")
print(f"📰 Journal entries: {len(df_journals)}")

# Display sample of enhanced data
print("\n📋 Sample of enhanced data:")
print(df_enhanced[['pmid', 'title', 'year', 'journal', 'author_count', 'qac_mention_count']].head())

In [None]:
# 📈 Temporal Analysis & Publication Trends

# Only publication years inside this window are analysed.
year_min, year_max = 1950, 2025

# Filter data with valid years
df_with_years = df_enhanced.dropna(subset=['year']).copy()
df_with_years = df_with_years[(df_with_years['year'] >= year_min) & (df_with_years['year'] <= year_max)]

if len(df_with_years) > 0:
    # Publication trends over time
    yearly_counts = df_with_years.groupby('year').size().reset_index(name='count')

    # Create interactive time series plot (2 x 2 grid of line charts)
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Publications Over Time', 'QAC Mentions Over Time',
                       'Average Authors per Paper', 'Cumulative Publications'),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )

    # Publications over time
    fig.add_trace(
        go.Scatter(x=yearly_counts['year'], y=yearly_counts['count'],
                  mode='lines+markers', name='Publications',
                  line=dict(color='blue', width=3)),
        row=1, col=1
    )

    # QAC mentions over time, restricted to the same year window as the other
    # panels so all four charts cover the same period.
    if len(df_qac_mentions) > 0:
        qac_with_years = df_qac_mentions.dropna(subset=['year'])
        qac_with_years = qac_with_years[(qac_with_years['year'] >= year_min) & (qac_with_years['year'] <= year_max)]
        qac_yearly = qac_with_years.groupby('year').size().reset_index(name='mentions')
        fig.add_trace(
            go.Scatter(x=qac_yearly['year'], y=qac_yearly['mentions'],
                      mode='lines+markers', name='QAC Mentions',
                      line=dict(color='red', width=3)),
            row=1, col=2
        )

    # Average authors per paper
    author_trends = df_with_years.groupby('year')['author_count'].mean().reset_index()
    fig.add_trace(
        go.Scatter(x=author_trends['year'], y=author_trends['author_count'],
                  mode='lines+markers', name='Avg Authors',
                  line=dict(color='green', width=3)),
        row=2, col=1
    )

    # Cumulative publications
    yearly_counts['cumulative'] = yearly_counts['count'].cumsum()
    fig.add_trace(
        go.Scatter(x=yearly_counts['year'], y=yearly_counts['cumulative'],
                  mode='lines+markers', name='Cumulative',
                  line=dict(color='purple', width=3)),
        row=2, col=2
    )

    fig.update_layout(height=800, title_text="📊 QAC Research Temporal Analysis", showlegend=False)
    fig.show()

    # Growth analysis: slope of a straight-line fit to the yearly counts since
    # 2010, computed only when more than five years of data are available.
    recent_years = yearly_counts[yearly_counts['year'] >= 2010]
    if len(recent_years) > 5:
        growth_rate = np.polyfit(recent_years['year'], recent_years['count'], 1)[0]
        print(f"📈 Publication growth rate (2010+): {growth_rate:.2f} papers/year")

    # Peak years
    top_years = yearly_counts.nlargest(5, 'count')
    print(f"\n🔥 Top 5 publication years:")
    # Iterate over the columns rather than over rows: row-wise iteration
    # upcasts a row with mixed int/float columns to float, which would print
    # counts as e.g. "12.0".
    for year, count in zip(top_years['year'], top_years['count']):
        print(f"   {int(year)}: {int(count)} papers")

else:
    print("⚠️  No temporal data available for analysis")

In [None]:
# 💊 QAC Compound Analysis & Chemical Profiling

print("🧪 Analyzing QAC compound mentions...")

if len(df_qac_mentions) > 0:
    # Most frequently mentioned compounds
    compound_counts = df_qac_mentions['compound'].value_counts().head(20)

    # Create compound analysis visualization (2 x 2 dashboard)
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Top QAC Compounds', 'Compound Trends Over Time',
                       'Journal Distribution', 'Compound Categories'),
        specs=[[{"type": "bar"}, {"type": "scatter"}],
               [{"type": "pie"}, {"type": "bar"}]]
    )

    # Top compounds bar chart
    top_ten = compound_counts.head(10)
    fig.add_trace(
        go.Bar(x=top_ten.values, y=top_ten.index,
               orientation='h', name='Mentions',
               marker_color='lightblue'),
        row=1, col=1
    )

    # Compound trends over time (top 5 compounds)
    mentions_with_year = df_qac_mentions.dropna(subset=['year'])
    for compound in compound_counts.head(5).index:
        compound_data = mentions_with_year[mentions_with_year['compound'] == compound]
        yearly_data = compound_data.groupby('year').size().reset_index(name='count')
        if len(yearly_data) > 0:
            fig.add_trace(
                go.Scatter(x=yearly_data['year'], y=yearly_data['count'],
                          mode='lines+markers', name=compound[:20]),
                row=1, col=2
            )

    # Journal distribution for QAC research. Mentions without a journal name
    # are left out so the pie chart has no unlabeled slice.
    named_journals = df_qac_mentions['journal'].fillna('')
    journal_qac_counts = named_journals[named_journals != ''].value_counts().head(10)
    if len(journal_qac_counts) > 0:
        fig.add_trace(
            go.Pie(labels=journal_qac_counts.index, values=journal_qac_counts.values,
                   name="Journals"),
            row=2, col=1
        )

    # Categorize compounds: a mention belongs to the first category that has a
    # keyword contained in the compound name; anything else counts as 'Other'.
    categories = {
        'Antimicrobial': ['benzalkonium', 'cetylpyridinium', 'benzethonium', 'didecyldimethylammonium'],
        'Industrial': ['polyquaternium', 'diallyldimethylammonium', 'quaternium'],
        'Research_Tools': ['tetramethylammonium', 'tetraethylammonium', 'tetrabutylammonium'],
        'Herbicides': ['paraquat', 'diquat'],
        'Dyes': ['malachite green', 'brilliant green', 'basic blue']
    }

    category_counts = defaultdict(int)
    for compound in df_qac_mentions['compound']:
        compound_lower = compound.lower()
        matched_category = next(
            (category for category, keywords in categories.items()
             if any(keyword in compound_lower for keyword in keywords)),
            'Other'
        )
        category_counts[matched_category] += 1

    if category_counts:
        fig.add_trace(
            go.Bar(x=list(category_counts.keys()), y=list(category_counts.values()),
                   name='Categories', marker_color='orange'),
            row=2, col=2
        )

    fig.update_layout(height=1000, title_text="💊 QAC Compound Analysis Dashboard")
    fig.show()

    # Print detailed statistics
    unique_compounds = df_qac_mentions['compound'].nunique()
    print(f"\n📊 Compound Statistics:")
    print(f"   Total unique compounds: {unique_compounds}")
    print(f"   Total mentions: {len(df_qac_mentions)}")
    print(f"   Average mentions per compound: {len(df_qac_mentions) / unique_compounds:.1f}")

    print(f"\n🔝 Top 10 compounds:")
    for i, (compound, count) in enumerate(top_ten.items(), 1):
        print(f"   {i:2d}. {compound}: {count} mentions")

else:
    print("⚠️  No QAC mention data available for analysis")

In [None]:
# 📰 Journal & Publication Venue Analysis

print("📚 Analyzing publication venues and journal patterns...")

if len(df_enhanced) > 0:
    # Clean and analyze journals: keep only articles that carry a journal name.
    df_journals_clean = df_enhanced[df_enhanced['journal'].notna() & (df_enhanced['journal'] != '')].copy()

    if len(df_journals_clean) > 0:
        journal_stats = df_journals_clean.groupby('journal').agg({
            'pmid': 'count',
            'year': ['min', 'max', 'mean'],
            'author_count': 'mean',
            'qac_mention_count': 'sum'
        }).round(2)

        # Flatten the two-level column index produced by the aggregation.
        journal_stats.columns = ['articles', 'first_year', 'last_year', 'avg_year', 'avg_authors', 'total_qac_mentions']
        journal_stats = journal_stats.reset_index().sort_values('articles', ascending=False)

        # Create journal analysis dashboard
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('Top Journals by Article Count', 'Journal Timeline',
                           'Journal Impact (QAC Focus)', 'Authorship Patterns'),
            specs=[[{"type": "bar"}, {"type": "scatter"}],
                   [{"type": "scatter"}, {"type": "box"}]]
        )

        # Top journals
        top_journals = journal_stats.head(15)
        fig.add_trace(
            go.Bar(y=top_journals['journal'], x=top_journals['articles'],
                   orientation='h', name='Articles',
                   marker_color='steelblue'),
            row=1, col=1
        )

        # Journal timeline (when journals started/ended publishing QAC research).
        # The loop variable is deliberately not called journal_data, which is
        # the name of the list built by the extraction cell.
        for journal_row in top_journals.head(10).itertuples(index=False):
            journal_label = journal_row.journal[:30]
            fig.add_trace(
                go.Scatter(x=[journal_row.first_year, journal_row.last_year],
                          y=[journal_label, journal_label],
                          mode='lines+markers',
                          name=journal_row.journal[:20],
                          line=dict(width=4)),
                row=1, col=2
            )

        # QAC focus vs article count
        fig.add_trace(
            go.Scatter(x=journal_stats['articles'], y=journal_stats['total_qac_mentions'],
                      mode='markers', name='Journals',
                      text=journal_stats['journal'],
                      hovertemplate='%{text}<br>Articles: %{x}<br>QAC Mentions: %{y}',
                      marker=dict(size=8, opacity=0.6)),
            row=2, col=1
        )

        # Author patterns by journal type. A journal gets the first type whose
        # keyword fragments occur in its lower-cased name.
        journal_types = []
        for journal in df_journals_clean['journal']:
            journal_lower = journal.lower()
            if any(term in journal_lower for term in ['toxicol', 'environ', 'ecotoxicol']):
                journal_types.append('Environmental/Toxicology')
            elif any(term in journal_lower for term in ['chem', 'analyt', 'chromatogr']):
                journal_types.append('Chemistry/Analytical')
            elif any(term in journal_lower for term in ['microbiol', 'antimicrob', 'infect']):
                journal_types.append('Microbiology/Infectious')
            elif any(term in journal_lower for term in ['food', 'safet']):
                journal_types.append('Food Safety')
            else:
                journal_types.append('Other')

        df_journals_clean['journal_type'] = journal_types

        for jtype in df_journals_clean['journal_type'].unique():
            type_data = df_journals_clean[df_journals_clean['journal_type'] == jtype]['author_count']
            fig.add_trace(
                go.Box(y=type_data, name=jtype),
                row=2, col=2
            )

        fig.update_layout(height=1000, title_text="📰 Journal & Publication Analysis Dashboard")
        fig.show()

        # Journal insights
        print(f"\n📈 Journal Statistics:")
        print(f"   Total journals: {len(journal_stats)}")
        print(f"   Journals with 10+ articles: {len(journal_stats[journal_stats['articles'] >= 10])}")
        print(f"   Most productive journal: {journal_stats.iloc[0]['journal']} ({journal_stats.iloc[0]['articles']} articles)")

        print(f"\n🏆 Top 10 journals:")
        for i, (_, row) in enumerate(journal_stats.head(10).iterrows(), 1):
            print(f"   {i:2d}. {row['journal'][:50]}: {row['articles']} articles ({row['first_year']:.0f}-{row['last_year']:.0f})")

        # Journal type analysis
        type_stats = df_journals_clean.groupby('journal_type').agg({
            'pmid': 'count',
            'author_count': 'mean',
            'qac_mention_count': 'mean'
        }).round(2)

        print(f"\n📚 Journal type analysis:")
        for jtype, stats in type_stats.iterrows():
            # Rows of this all-numeric frame come back as floats, so the
            # article count is converted back to an int for display.
            print(f"   {jtype}: {int(stats['pmid'])} articles, avg {stats['author_count']} authors, {stats['qac_mention_count']} QAC mentions")

    else:
        print("⚠️  No journal data available for analysis")
else:
    print("⚠️  No article data available for analysis")

In [None]:
# 🔍 Text Mining & Topic Modeling

print("📝 Performing advanced text analysis...")

if len(df_enhanced) > 0:
    # Combine title and abstract for analysis
    df_text = df_enhanced.copy()
    df_text['full_text'] = (df_text['title'].fillna('') + ' ' + df_text['abstract'].fillna('')).str.strip()
    df_text = df_text[df_text['full_text'].str.len() > 10]  # Filter out empty texts

    if len(df_text) > 0:
        print(f"Analyzing {len(df_text)} articles with sufficient text...")

        # Word cloud generation
        all_text = ' '.join(df_text['full_text'])

        # Words excluded from the word cloud.
        wordcloud_stopwords = {
            'the', 'and', 'or', 'of', 'in', 'to', 'for', 'with', 'by', 'from',
            'at', 'on', 'an', 'as', 'are', 'was', 'were', 'been', 'is', 'this',
            'that', 'these', 'those', 'study', 'studies', 'using', 'used',
            'analysis', 'method', 'methods', 'results', 'conclusion', 'conclusions',
        }

        # Create word cloud
        wordcloud = WordCloud(width=800, height=400,
                             background_color='white',
                             max_words=100,
                             colormap='viridis',
                             stopwords=wordcloud_stopwords).generate(all_text)

        plt.figure(figsize=(12, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('🌟 Word Cloud of QAC Research Literature', size=16, pad=20)
        plt.tight_layout()
        plt.show()

        # TF-IDF Analysis for key terms
        print("\n🔬 Extracting key terms using TF-IDF...")
        vectorizer = TfidfVectorizer(max_features=1000,
                                   stop_words='english',
                                   ngram_range=(1, 3),
                                   min_df=2,
                                   max_df=0.8)

        try:
            tfidf_matrix = vectorizer.fit_transform(df_text['full_text'])
            feature_names = vectorizer.get_feature_names_out()

            # Get top terms by mean TF-IDF score. The mean is taken on the
            # sparse matrix directly, so no dense documents x terms array has
            # to be allocated.
            mean_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
            top_indices = mean_scores.argsort()[::-1][:30]
            top_terms = [(feature_names[i], mean_scores[i]) for i in top_indices]

            # Visualize top terms
            terms, scores = zip(*top_terms[:20])

            fig = go.Figure(data=go.Bar(
                x=list(scores),
                y=list(terms),
                orientation='h',
                marker_color='lightcoral'
            ))
            fig.update_layout(
                title="🎯 Top 20 Terms by TF-IDF Score",
                xaxis_title="TF-IDF Score",
                height=600
            )
            fig.show()

            print(f"\n🔝 Top terms in QAC literature:")
            for i, (term, score) in enumerate(top_terms[:15], 1):
                print(f"   {i:2d}. {term}: {score:.4f}")

        except Exception as e:
            print(f"TF-IDF analysis failed: {e}")

        # Topic Modeling with LDA
        if len(df_text) >= 10:  # Need sufficient documents for LDA
            print(f"\n🎭 Performing topic modeling...")
            try:
                # Use CountVectorizer for LDA
                count_vectorizer = CountVectorizer(max_features=500,
                                                 stop_words='english',
                                                 min_df=2,
                                                 max_df=0.8)
                count_matrix = count_vectorizer.fit_transform(df_text['full_text'])

                # Fit LDA model
                n_topics = min(8, len(df_text) // 5)  # Adaptive number of topics
                lda = LatentDirichletAllocation(n_components=n_topics,
                                              random_state=42,
                                              max_iter=10)
                lda.fit(count_matrix)

                # Extract and display topics: ten highest-weighted words each
                feature_names = count_vectorizer.get_feature_names_out()

                topics_data = []
                for topic_idx, topic in enumerate(lda.components_):
                    top_words_idx = topic.argsort()[::-1][:10]
                    top_words = [feature_names[i] for i in top_words_idx]
                    topics_data.append({
                        'topic': f'Topic {topic_idx + 1}',
                        'words': ', '.join(top_words),
                        # Weight of the topic's single strongest word.
                        'weight': topic[top_words_idx[0]]
                    })

                topics_df = pd.DataFrame(topics_data)

                print(f"\n🎭 Discovered {n_topics} topics:")
                for i, row in topics_df.iterrows():
                    print(f"   {row['topic']}: {row['words']}")

                # Visualize topic weights
                fig = go.Figure(data=go.Bar(
                    x=topics_df['topic'],
                    y=topics_df['weight'],
                    marker_color='skyblue'
                ))
                fig.update_layout(
                    title="📊 Topic Weights in QAC Literature",
                    xaxis_title="Topics",
                    yaxis_title="Weight"
                )
                fig.show()

            except Exception as e:
                print(f"Topic modeling failed: {e}")

        # Sentiment Analysis
        print(f"\n😊 Analyzing sentiment...")
        sentiments = []
        for text in df_text['full_text'].head(100):  # Sample for speed
            try:
                blob = TextBlob(text)
                sentiments.append({
                    'polarity': blob.sentiment.polarity,
                    'subjectivity': blob.sentiment.subjectivity
                })
            except Exception:
                # Keep one entry per text; a failed text is recorded as 0 / 0.
                # (Exception rather than a bare except, so interrupting the
                # kernel still works.)
                sentiments.append({'polarity': 0, 'subjectivity': 0})

        if sentiments:
            sentiment_df = pd.DataFrame(sentiments)

            fig = go.Figure()
            fig.add_trace(go.Scatter(
                x=sentiment_df['polarity'],
                y=sentiment_df['subjectivity'],
                mode='markers',
                marker=dict(
                    size=8,
                    color=sentiment_df['polarity'],
                    colorscale='RdYlBu',
                    showscale=True,
                    colorbar=dict(title="Polarity")
                ),
                text=[f'Polarity: {p:.2f}<br>Subjectivity: {s:.2f}'
                      for p, s in zip(sentiment_df['polarity'], sentiment_df['subjectivity'])],
                hovertemplate='%{text}<extra></extra>'
            ))

            fig.update_layout(
                title="🎭 Sentiment Analysis of QAC Literature",
                xaxis_title="Polarity (Negative ← → Positive)",
                yaxis_title="Subjectivity (Objective ← → Subjective)",
                width=700, height=500
            )
            fig.show()

            avg_polarity = sentiment_df['polarity'].mean()
            avg_subjectivity = sentiment_df['subjectivity'].mean()
            print(f"   Average polarity: {avg_polarity:.3f} (neutral: 0, positive: >0, negative: <0)")
            print(f"   Average subjectivity: {avg_subjectivity:.3f} (objective: 0, subjective: 1)")

    else:
        print("⚠️  No sufficient text data available for analysis")
else:
    print("⚠️  No article data available for text analysis")

In [None]:
# 🕸️ Research Collaboration Network Analysis

print("🤝 Analyzing research collaboration networks...")

if len(df_authors) > 0:
    # Author collaboration analysis
    author_stats = df_authors.groupby('author').agg({
        'pmid': 'nunique',
        'year': ['min', 'max']
    }).round(0)
    author_stats.columns = ['papers', 'first_year', 'last_year']
    author_stats = author_stats.reset_index().sort_values('papers', ascending=False)
    
    # Create author network
    print(f\"Building collaboration network from {len(df_authors)} author entries...\")
    
    # Group by paper to find collaborations
    paper_authors = df_authors.groupby('pmid')['author'].apply(list).reset_index()
    paper_authors = paper_authors[paper_authors['author'].apply(len) > 1]  # Only multi-author papers
    
    if len(paper_authors) > 0:
        # Build collaboration network
        G = nx.Graph()
        collaboration_count = defaultdict(int)
        
        for _, row in paper_authors.iterrows():
            authors = row['author']
            # Add edges between all pairs of authors on the same paper
            for i in range(len(authors)):
                for j in range(i+1, len(authors)):
                    author1, author2 = sorted([authors[i], authors[j]])
                    G.add_edge(author1, author2)
                    collaboration_count[(author1, author2)] += 1
        
        if len(G.nodes()) > 0:
            print(f\"Network stats: {len(G.nodes())} authors, {len(G.edges())} collaborations\")
            
            # Network metrics
            try:
                # Calculate centrality measures for top authors
                top_authors = author_stats.head(50)['author'].tolist()
                subgraph = G.subgraph([a for a in top_authors if a in G.nodes()])
                
                if len(subgraph.nodes()) > 0:
                    degree_centrality = nx.degree_centrality(subgraph)
                    betweenness_centrality = nx.betweenness_centrality(subgraph)
                    closeness_centrality = nx.closeness_centrality(subgraph)
                    
                    # Create network analysis dataframe
                    network_stats = []
                    for author in subgraph.nodes():
                        network_stats.append({
                            'author': author,
                            'degree_centrality': degree_centrality.get(author, 0),
                            'betweenness_centrality': betweenness_centrality.get(author, 0),
                            'closeness_centrality': closeness_centrality.get(author, 0),
                            'papers': author_stats[author_stats['author'] == author]['papers'].iloc[0] if not author_stats[author_stats['author'] == author].empty else 0
                        })
                    
                    network_df = pd.DataFrame(network_stats).sort_values('degree_centrality', ascending=False)
                    
                    # Visualize network metrics
                    fig = make_subplots(
                        rows=2, cols=2,
                        subplot_titles=('Author Productivity', 'Degree Centrality', 
                                       'Betweenness Centrality', 'Network Overview'),
                        specs=[[{\"type\": \"bar\"}, {\"type\": \"bar\"}],
                               [{\"type\": \"bar\"}, {\"type\": \"scatter\"}]]
                    )
                    
                    # Author productivity
                    top_productive = author_stats.head(15)
                    fig.add_trace(
                        go.Bar(y=top_productive['author'], x=top_productive['papers'],
                               orientation='h', name='Papers',
                               marker_color='lightblue'),
                        row=1, col=1
                    )
                    
                    # Degree centrality
                    top_central = network_df.head(15)
                    fig.add_trace(
                        go.Bar(y=top_central['author'], x=top_central['degree_centrality'],
                               orientation='h', name='Centrality',
                               marker_color='lightcoral'),
                        row=1, col=2
                    )
                    
                    # Betweenness centrality
                    top_between = network_df.nlargest(15, 'betweenness_centrality')
                    fig.add_trace(
                        go.Bar(y=top_between['author'], x=top_between['betweenness_centrality'],
                               orientation='h', name='Betweenness',
                               marker_color='lightgreen'),
                        row=2, col=1
                    )
                    
                    # Network overview scatter
                    fig.add_trace(
                        go.Scatter(x=network_df['papers'], y=network_df['degree_centrality'],
                                  mode='markers', name='Authors',
                                  text=network_df['author'],
                                  hovertemplate='%{text}<br>Papers: %{x}<br>Centrality: %{y:.3f}',
                                  marker=dict(size=8, opacity=0.6, color='purple')),
                        row=2, col=2
                    )
                    
                    fig.update_layout(height=1000, title_text=\"🕸️ Research Collaboration Network Analysis\")
                    fig.show()
                    
                    # Print network insights
                    print(f\"\\n🌟 Top collaborative authors (by degree centrality):\")
                    for i, (_, row) in enumerate(network_df.head(10).iterrows(), 1):
                        print(f\"   {i:2d}. {row['author']}: {row['degree_centrality']:.3f} centrality, {row['papers']} papers\")
                    
                    print(f\"\\n🌉 Key bridge authors (by betweenness centrality):\")
                    bridge_authors = network_df.nlargest(5, 'betweenness_centrality')
                    for i, (_, row) in enumerate(bridge_authors.iterrows(), 1):
                        print(f\"   {i}. {row['author']}: {row['betweenness_centrality']:.3f} betweenness\")
                    
                    # Collaboration strength analysis
                    strong_collaborations = [(pair, count) for pair, count in collaboration_count.items() if count >= 3]
                    if strong_collaborations:
                        strong_collaborations.sort(key=lambda x: x[1], reverse=True)
                        print(f\"\\n🤝 Strongest collaborations (3+ papers):\")
                        for (author1, author2), count in strong_collaborations[:10]:
                            print(f\"   {author1} ↔ {author2}: {count} papers\")
                    
                except Exception as e:
                    print(f\"Network analysis error: {e}\")
            
        else:
            print(\"⚠️  Unable to build collaboration network\")
    else:
        print(\"⚠️  No multi-author papers found\")
    
    # Temporal author analysis
    if 'year' in df_authors.columns:
        author_years = df_authors.dropna(subset=['year'])
        if len(author_years) > 0:
            # Authors over time
            yearly_authors = author_years.groupby('year')['author'].nunique().reset_index()
            yearly_authors.columns = ['year', 'unique_authors']
            
            fig = go.Figure()
            fig.add_trace(go.Scatter(
                x=yearly_authors['year'], 
                y=yearly_authors['unique_authors'],
                mode='lines+markers',
                name='Unique Authors',
                line=dict(color='darkorange', width=3)
            ))
            
            fig.update_layout(
                title=\"👥 Research Community Growth Over Time\",
                xaxis_title=\"Year\",
                yaxis_title=\"Number of Unique Authors\",
                height=400
            )
            fig.show()
            
            print(f\"\\n📈 Author community growth:\")
            recent_growth = yearly_authors[yearly_authors['year'] >= 2010]
            if len(recent_growth) > 1:
                growth_rate = (recent_growth['unique_authors'].iloc[-1] - recent_growth['unique_authors'].iloc[0]) / len(recent_growth)
                print(f\"   Average new authors per year (2010+): {growth_rate:.1f}\")
            
else:
    print(\"⚠️  No author data available for network analysis\")

In [None]:
# 📊 Research Impact & Quality Analysis
#
# Scores every row of `df_enhanced` with a heuristic 0-100 "quality" score derived from
# metadata completeness (abstract, author count, keywords, MeSH terms, QAC mentions, title
# length), draws a six-panel plotly dashboard and prints summary statistics.
# Reads the columns: pmid, title, abstract, author_count, keywords, mesh_terms,
# qac_mention_count, year. Leaves `quality_df` (one row per article) in the namespace.

print("🎯 Analyzing research impact and quality indicators...")


def _text_length(value):
    """Return the character length of `value` rendered as text, or 0 when it is missing."""
    return len(str(value)) if pd.notna(value) else 0


def _count_terms(value, sep=';'):
    """Return the number of non-empty `sep`-separated entries in `value` (0 when missing).

    Blank entries are ignored, so an empty string or a trailing separator does not count
    as a term.
    NOTE(review): assumes keywords / MeSH terms are stored as ';'-joined strings - confirm
    against the cell that builds `df_enhanced`.
    """
    if pd.isna(value):
        return 0
    return sum(1 for term in str(value).split(sep) if term.strip())


if len(df_enhanced) > 0:
    # Research quality indicators, one dict per article
    quality_metrics = []

    for _, article in df_enhanced.iterrows():
        abstract_length = _text_length(article['abstract'])
        metrics = {
            'pmid': article['pmid'],
            'title_length': _text_length(article['title']),
            'abstract_length': abstract_length,
            # A missing abstract has length 0, so this also covers the NaN case.
            'has_abstract': abstract_length > 10,
            'author_count': article['author_count'] if pd.notna(article['author_count']) else 0,
            'keyword_count': _count_terms(article['keywords']),
            'mesh_count': _count_terms(article['mesh_terms']),
            'qac_mentions': article['qac_mention_count'] if pd.notna(article['qac_mention_count']) else 0,
            'year': article['year'],
        }

        # Quality score (0-100): each satisfied criterion contributes its points.
        score_rules = [
            (metrics['has_abstract'], 25),
            (metrics['author_count'] >= 3, 15),
            (metrics['keyword_count'] >= 3, 15),
            (metrics['mesh_count'] >= 3, 15),
            (metrics['abstract_length'] >= 500, 10),
            (metrics['qac_mentions'] >= 2, 10),
            (metrics['title_length'] >= 50, 10),
        ]
        metrics['quality_score'] = sum(points for passed, points in score_rules if passed)
        quality_metrics.append(metrics)

    quality_df = pd.DataFrame(quality_metrics)

    # Boolean masks reused by the dashboard and the printed statistics
    is_high_quality = quality_df['quality_score'] > 70
    is_multi_author = quality_df['author_count'] >= 2
    has_years = quality_df['year'].notna().any()

    # Create quality analysis dashboard
    fig = make_subplots(
        rows=3, cols=2,
        subplot_titles=('Quality Score Distribution', 'Quality vs. Author Count',
                        'Abstract Length Distribution', 'Quality Trends Over Time',
                        'QAC Mentions vs. Quality', 'Research Completeness'),
        specs=[[{"type": "histogram"}, {"type": "scatter"}],
               [{"type": "histogram"}, {"type": "scatter"}],
               [{"type": "scatter"}, {"type": "bar"}]]
    )

    # Quality score distribution
    fig.add_trace(
        go.Histogram(x=quality_df['quality_score'], nbinsx=20,
                     name='Quality Scores', marker_color='skyblue'),
        row=1, col=1
    )

    # Quality vs author count
    fig.add_trace(
        go.Scatter(x=quality_df['author_count'], y=quality_df['quality_score'],
                   mode='markers', name='Articles',
                   marker=dict(size=6, opacity=0.6, color='coral')),
        row=1, col=2
    )

    # Abstract length distribution
    fig.add_trace(
        go.Histogram(x=quality_df['abstract_length'], nbinsx=30,
                     name='Abstract Lengths', marker_color='lightgreen'),
        row=2, col=1
    )

    # Quality trends over time (mean score per publication year)
    if has_years:
        yearly_quality = (
            quality_df.dropna(subset=['year'])
            .groupby('year')['quality_score']
            .mean()
            .reset_index()
        )
        fig.add_trace(
            go.Scatter(x=yearly_quality['year'], y=yearly_quality['quality_score'],
                       mode='lines+markers', name='Avg Quality',
                       line=dict(color='purple', width=3)),
            row=2, col=2
        )

    # QAC mentions vs quality
    fig.add_trace(
        go.Scatter(x=quality_df['qac_mentions'], y=quality_df['quality_score'],
                   mode='markers', name='QAC Focus',
                   marker=dict(size=6, opacity=0.6, color='orange')),
        row=3, col=1
    )

    # Research completeness: share of articles (in %) satisfying each criterion
    completeness_metrics = {
        'Has Abstract': quality_df['has_abstract'].mean() * 100,
        'Multi-Author': is_multi_author.mean() * 100,
        'Has Keywords': (quality_df['keyword_count'] > 0).mean() * 100,
        'Has MeSH': (quality_df['mesh_count'] > 0).mean() * 100,
        'QAC Focused': (quality_df['qac_mentions'] > 0).mean() * 100,
    }

    fig.add_trace(
        go.Bar(x=list(completeness_metrics.keys()), y=list(completeness_metrics.values()),
               name='Completeness %', marker_color='gold'),
        row=3, col=2
    )

    fig.update_layout(height=1200, title_text="📊 Research Quality & Impact Analysis")
    fig.show()

    # Quality statistics
    print("\n📈 Quality Analysis Results:")
    print(f"   Average quality score: {quality_df['quality_score'].mean():.1f}/100")
    print(f"   High quality articles (>70): {is_high_quality.sum()} ({is_high_quality.mean()*100:.1f}%)")
    print(f"   Articles with abstracts: {quality_df['has_abstract'].sum()} ({quality_df['has_abstract'].mean()*100:.1f}%)")
    print(f"   Multi-author articles: {is_multi_author.sum()} ({is_multi_author.mean()*100:.1f}%)")

    # Top quality articles
    top_quality = quality_df.nlargest(10, 'quality_score')
    if len(top_quality) > 0:
        print("\n🏆 Top 10 highest quality articles:")
        for i, (_, row) in enumerate(top_quality.iterrows(), 1):
            pmid = row['pmid']
            score = row['quality_score']
            # Look the title up once; fall back to "N/A" if the PMID is not found.
            matching_titles = df_enhanced.loc[df_enhanced['pmid'] == pmid, 'title']
            article_title = matching_titles.iloc[0] if not matching_titles.empty else "N/A"
            print(f"   {i:2d}. PMID {pmid} (Score: {score}): {str(article_title)[:60]}...")

    # Research trends analysis: compare articles published before / from 2015
    if has_years:
        recent_years = quality_df[quality_df['year'] >= 2015].copy()
        older_years = quality_df[quality_df['year'] < 2015].copy()

        if len(recent_years) > 0 and len(older_years) > 0:
            print("\n📊 Quality evolution:")
            print(f"   Pre-2015 average quality: {older_years['quality_score'].mean():.1f}")
            print(f"   2015+ average quality: {recent_years['quality_score'].mean():.1f}")
            print(f"   Author count evolution: {older_years['author_count'].mean():.1f} → {recent_years['author_count'].mean():.1f}")
            print(f"   Abstract length evolution: {older_years['abstract_length'].mean():.0f} → {recent_years['abstract_length'].mean():.0f} chars")

    # Research focus analysis: articles mentioning QACs at least three times
    high_qac_focus = quality_df[quality_df['qac_mentions'] >= 3]
    if len(high_qac_focus) > 0:
        print(f"\n💊 High QAC-focused research ({len(high_qac_focus)} articles):")
        print(f"   Average quality score: {high_qac_focus['quality_score'].mean():.1f}")
        print(f"   Average author count: {high_qac_focus['author_count'].mean():.1f}")
        print(f"   Average abstract length: {high_qac_focus['abstract_length'].mean():.0f} chars")

else:
    print("⚠️  No article data available for impact analysis")

In [None]:
# 📋 Comprehensive Analysis Summary & Export
#
# Collects headline numbers from the frames built by earlier cells (`df_enhanced`,
# `df_authors`, `df_qac_mentions`, `df_journals` and, when it exists, `quality_df`) into the
# `summary` dict, prints it, and writes CSV / JSON / Markdown outputs into the current
# working directory. The names of all written files are gathered in `output_files`.

import json

print("📊 Generating comprehensive analysis summary...")

# Year cut-offs used by the temporal insights and echoed in the Markdown report.
RECENT_YEAR_CUTOFF = 2020  # lower bound of the "recent activity" count
TREND_START_YEAR = 2015    # first year considered when judging the growth trend


def _label(key):
    """Turn a snake_case summary key into a human-readable label."""
    return key.replace('_', ' ').title()


def _print_section(heading, values):
    """Print `heading` followed by one line per entry; floats are shown with two decimals."""
    print(heading)
    for key, value in values.items():
        shown = f"{value:.2f}" if isinstance(value, float) else value
        print(f"   {_label(key)}: {shown}")


def _export_csv(frame, filename, description):
    """Write `frame` to `filename` as UTF-8 CSV and record the file in `output_files`."""
    frame.to_csv(filename, index=False, encoding='utf-8')
    output_files.append(filename)
    print(f"✅ Exported {description}: {filename}")


def convert_numpy(obj):
    """Convert a numpy scalar/array or missing value into a JSON-serialisable Python value."""
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # Check for missing values first so that a NaN numpy float becomes null
    # instead of the non-standard JSON token NaN.
    if pd.isna(obj):
        return None
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    return obj


def deep_convert(item):
    """Apply `convert_numpy` recursively through nested dicts, lists and tuples."""
    if isinstance(item, dict):
        return {k: deep_convert(v) for k, v in item.items()}
    if isinstance(item, (list, tuple)):
        return [deep_convert(v) for v in item]
    return convert_numpy(item)


# Create analysis summary (sections stay empty dicts when their data is unavailable)
summary = {
    'analysis_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'dataset_summary': {},
    'temporal_insights': {},
    'content_insights': {},
    'collaboration_insights': {},
    'quality_insights': {}
}

if len(df_enhanced) > 0:
    has_years = df_enhanced['year'].notna().any()

    # Dataset summary
    summary['dataset_summary'] = {
        'total_articles': len(df_enhanced),
        'articles_with_abstracts': int(df_enhanced['abstract'].notna().sum()),
        'unique_journals': df_enhanced['journal'].nunique() if 'journal' in df_enhanced.columns else 0,
        # Distinct author names, not the number of (paper, author) rows.
        'unique_authors': df_authors['author'].nunique() if len(df_authors) > 0 else 0,
        'total_qac_mentions': len(df_qac_mentions),
        'year_range': (f"{df_enhanced['year'].min():.0f}-{df_enhanced['year'].max():.0f}"
                       if has_years else "N/A"),
    }

    # Temporal insights
    if has_years:
        articles_per_year = df_enhanced.dropna(subset=['year']).groupby('year').size()
        trend_counts = articles_per_year[articles_per_year.index >= TREND_START_YEAR]
        recent_counts = articles_per_year[articles_per_year.index >= RECENT_YEAR_CUTOFF]
        # A trend needs at least two years of data; an empty or single-year series is
        # trivially monotonic and must not be reported as "increasing".
        is_increasing = len(trend_counts) > 1 and trend_counts.is_monotonic_increasing
        summary['temporal_insights'] = {
            'most_productive_year': int(articles_per_year.idxmax()),
            'articles_in_peak_year': int(articles_per_year.max()),
            'articles_last_5_years': int(recent_counts.sum()),
            'growth_trend': 'increasing' if is_increasing else 'variable',
        }

    # Content insights
    if len(df_qac_mentions) > 0:
        compound_counts = df_qac_mentions['compound'].value_counts()
        summary['content_insights'] = {
            'most_studied_compound': compound_counts.index[0],
            'compound_mentions': int(compound_counts.iloc[0]),
            'unique_compounds': df_qac_mentions['compound'].nunique(),
            'avg_compounds_per_article': len(df_qac_mentions) / len(df_enhanced),
        }

    # Collaboration insights
    if len(df_authors) > 0:
        papers_per_author = df_authors.groupby('author').size()
        summary['collaboration_insights'] = {
            'most_prolific_author': papers_per_author.idxmax(),
            'author_paper_count': int(papers_per_author.max()),
            'avg_authors_per_paper': df_enhanced['author_count'].mean(),
            'single_author_papers': int((df_enhanced['author_count'] == 1).sum()),
        }

    # Quality insights (only when the quality-analysis cell has produced `quality_df`)
    if 'quality_df' in globals():
        summary['quality_insights'] = {
            'avg_quality_score': quality_df['quality_score'].mean(),
            'high_quality_articles': int((quality_df['quality_score'] > 70).sum()),
            'articles_with_abstracts_pct': quality_df['has_abstract'].mean() * 100,
            'avg_abstract_length': quality_df['abstract_length'].mean(),
        }

# Print comprehensive summary
print("\n" + "=" * 80)
print("🎯 COMPREHENSIVE QAC SYSTEMATIC REVIEW ANALYSIS SUMMARY")
print("=" * 80)

print("\n📊 DATASET OVERVIEW")
print(f"   Analysis Date: {summary['analysis_date']}")
for key, value in summary['dataset_summary'].items():
    print(f"   {_label(key)}: {value}")

optional_sections = [
    ('temporal_insights', "\n📈 TEMPORAL PATTERNS"),
    ('content_insights', "\n💊 CONTENT ANALYSIS"),
    ('collaboration_insights', "\n🤝 COLLABORATION PATTERNS"),
    ('quality_insights', "\n⭐ QUALITY METRICS"),
]
for section_key, heading in optional_sections:
    if summary[section_key]:
        _print_section(heading, summary[section_key])

print("\n" + "=" * 80)

# Export enhanced datasets
print("\n💾 Exporting enhanced datasets...")

output_files = []

# Export main dataset
if len(df_enhanced) > 0:
    _export_csv(df_enhanced, "qac_articles_enhanced.csv", "enhanced articles")

# Export QAC mentions
if len(df_qac_mentions) > 0:
    _export_csv(df_qac_mentions, "qac_compound_mentions.csv", "QAC mentions")

# Export author data
if len(df_authors) > 0:
    _export_csv(df_authors, "qac_authors.csv", "author data")

# Export journal analysis
if len(df_journals) > 0:
    _export_csv(df_journals, "qac_journals.csv", "journal data")

# Export summary report
summary_filename = "qac_analysis_summary.json"
# Convert up front so a conversion error is raised before the output file is created.
summary_payload = deep_convert(summary)
with open(summary_filename, 'w', encoding='utf-8') as f:
    json.dump(summary_payload, f, indent=2, default=str)

output_files.append(summary_filename)
print(f"✅ Exported analysis summary: {summary_filename}")

# Create analysis report. The text is assembled completely before the file is opened so
# that an error while formatting cannot leave a half-written report behind.
report_filename = "QAC_Analysis_Report.md"
dataset = summary['dataset_summary']
temporal = summary['temporal_insights']
content = summary['content_insights']
collaboration = summary['collaboration_insights']
quality = summary['quality_insights']

# Executive summary: one paragraph, always terminated by a blank line.
overview = [
    f"This analysis examined **{dataset.get('total_articles', 0)} articles** related to "
    "Quaternary Ammonium Compounds (QACs) from the PubMed database."
]
if temporal:
    overview.append(
        f"The research spans from {dataset['year_range']}, with peak activity in "
        f"{temporal['most_productive_year']} ({temporal['articles_in_peak_year']} articles)."
    )
if content:
    # The total mention count is stored in the dataset summary, not in the content insights.
    overview.append(
        f"A total of {dataset['total_qac_mentions']} QAC compound mentions were identified "
        f"across {content['unique_compounds']} unique compounds."
    )

report_parts = [
    "# QAC Systematic Review Analysis Report\n\n",
    f"**Generated:** {summary['analysis_date']}\n\n",
    "## Executive Summary\n\n",
    " ".join(overview) + "\n\n",
    "## Key Findings\n\n",
]

if temporal:
    report_parts += [
        "### Temporal Trends\n",
        f"- Research activity shows a **{temporal['growth_trend']}** trend\n",
        f"- Most productive year: **{temporal['most_productive_year']}** ({temporal['articles_in_peak_year']} articles)\n",
        f"- Recent activity ({RECENT_YEAR_CUTOFF}+): **{temporal['articles_last_5_years']} articles**\n\n",
    ]

if content:
    report_parts += [
        "### Research Focus\n",
        f"- Most studied compound: **{content['most_studied_compound']}** ({content['compound_mentions']} mentions)\n",
        f"- Average QAC mentions per article: **{content['avg_compounds_per_article']:.2f}**\n\n",
    ]

if collaboration:
    report_parts += [
        "### Collaboration Patterns\n",
        f"- Most prolific author: **{collaboration['most_prolific_author']}** ({collaboration['author_paper_count']} papers)\n",
        f"- Average authors per paper: **{collaboration['avg_authors_per_paper']:.1f}**\n",
        f"- Single-author papers: **{collaboration['single_author_papers']}**\n\n",
    ]

if quality:
    report_parts += [
        "### Research Quality\n",
        f"- Average quality score: **{quality['avg_quality_score']:.1f}/100**\n",
        f"- High-quality articles (>70): **{quality['high_quality_articles']}**\n",
        f"- Articles with abstracts: **{quality['articles_with_abstracts_pct']:.1f}%**\n\n",
    ]

report_parts += [
    "## Data Files\n\n",
    "The following data files were generated:\n\n",
]
report_parts += [f"- `{file}`\n" for file in output_files]
report_parts += [
    "\n---\n",
    "*Report generated using automated QAC systematic review analysis pipeline*\n",
]

with open(report_filename, 'w', encoding='utf-8') as f:
    f.write("".join(report_parts))

output_files.append(report_filename)
print(f"✅ Generated analysis report: {report_filename}")

print(f"\n🎉 Analysis complete! Generated {len(output_files)} output files:")
for file in output_files:
    print(f"   📄 {file}")

print("\n🚀 Your QAC systematic review analysis is ready for further investigation!")