# Advanced Analysis of Warp Drive Research Dataset

This notebook demonstrates advanced analysis techniques including:
- Citation network analysis
- Topic modeling
- Research trend analysis
- Cross-domain connections

In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset.
# load_dataset() without a `split` argument returns a DatasetDict keyed by split
# name, not a table of papers, so one split must be selected before the data is
# iterated row by row or converted to a DataFrame.
dataset_splits = load_dataset("GotThatData/warp-speed")
# NOTE(review): assumes the papers live in the "train" split when it exists and
# otherwise in the first available split -- confirm against the dataset card.
split_name = "train" if "train" in dataset_splits else next(iter(dataset_splits))
dataset = dataset_splits[split_name]  # iterating this yields one dict per row
df = dataset.to_pandas()

## Citation Network Analysis

In [None]:
# Create citation network: one node per paper and one directed edge from each
# paper to every entry in its 'references' list.
G = nx.DiGraph()

# Add nodes and edges
for paper in dataset:
    G.add_node(paper['id'], title=paper['title'])
    # A missing (None) or empty reference list simply contributes no edges.
    for ref in paper['references'] or []:
        G.add_edge(paper['id'], ref)

# Calculate network metrics
in_degree = dict(G.in_degree())  # in-degree == number of incoming citation edges
pagerank = nx.pagerank(G)

# Plot top cited papers
top_cited = sorted(in_degree.items(), key=lambda x: x[1], reverse=True)[:10]

if top_cited:
    papers, citations = zip(*top_cited)
    # Use string labels so the ids are drawn as categories; numeric ids would
    # otherwise be interpreted as positions on a continuous x axis.
    labels = [str(paper_id) for paper_id in papers]

    plt.figure(figsize=(12, 6))
    plt.bar(labels, citations)
    plt.xticks(rotation=45, ha='right')
    plt.title('Top 10 Most Cited Papers')
    plt.xlabel('Paper ID')
    plt.ylabel('Number of Citations')
    plt.tight_layout()
    plt.show()
else:
    # zip(*[]) cannot be unpacked into two names, so handle the empty graph explicitly.
    papers, citations = (), ()
    print('No papers found; nothing to plot.')

## Topic Modeling

In [None]:
# Prepare text data
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
# The vectorizer rejects NaN documents, so treat missing abstracts as empty text.
X = vectorizer.fit_transform(df['abstract'].fillna(''))

# Train LDA model
n_topics = 10
n_top_words = 10
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X)

# Print top words for each topic
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is the supported replacement.
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    # argsort() is ascending, so walk backwards from the end. The stop index must
    # be -(n_top_words + 1) to include exactly n_top_words entries.
    top_indices = topic.argsort()[:-n_top_words - 1:-1]
    top_words = [feature_names[i] for i in top_indices]
    print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")

## Research Trend Analysis

In [None]:
# Convert dates to datetime
df['date'] = pd.to_datetime(df['publication_date'])

# Group by date and category: rows are month bins, columns are categories, and
# each cell is the number of papers. fill_value=0 records "no papers" as 0
# instead of NaN so the lines are not broken by gaps.
# NOTE(review): recent pandas versions prefer the 'ME' alias for month-end
# grouping -- switch 'M' to 'ME' if the installed pandas warns or rejects it.
trends = (
    df.groupby([pd.Grouper(key='date', freq='M'), 'category'])
    .size()
    .unstack(fill_value=0)
)

# Plot trends
# Create the axes explicitly and hand them to pandas. Without ax=, DataFrame.plot
# opens its own figure, leaving an empty 15x8 figure behind and drawing the
# chart at the default size.
fig, ax = plt.subplots(figsize=(15, 8))
trends.plot(kind='line', marker='o', ax=ax)
ax.set_title('Research Trends by Category')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Papers')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

## Cross-Domain Connections

In [None]:
# Create co-occurrence matrix
# cooccurrence[i, j] counts references made by papers in categories[i] to papers
# in categories[j]; the diagonal (same-category references) is left at 0.
categories = df['category'].unique()
category_position = {
    cat: idx for idx, cat in enumerate(categories) if not pd.isna(cat)
}
# Integer dtype is required because the heatmap annotations use fmt='d'.
cooccurrence = np.zeros((len(categories), len(categories)), dtype=int)

# Look up each paper's category by its id. Indexing `dataset` with a paper id
# would select by row position / column name, not by id.
# NOTE(review): assumes paper ids are unique within df.
category_of_paper = dict(zip(df['id'], df['category']))

# Count papers that reference across categories
for source_category, refs in zip(df['category'], df['references']):
    if pd.isna(source_category):
        continue
    # Rows without a reference collection (None / NaN) have nothing to count.
    if not isinstance(refs, (list, tuple, set, np.ndarray)):
        continue
    for ref_id in set(refs):  # de-duplicate repeated references within a paper
        target_category = category_of_paper.get(ref_id)
        # Skip references to papers outside the dataset or without a category,
        # and references that stay inside the same category.
        if target_category is None or pd.isna(target_category):
            continue
        if target_category == source_category:
            continue
        row = category_position[source_category]
        col = category_position[target_category]
        cooccurrence[row, col] += 1

# Plot heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(cooccurrence, xticklabels=categories, yticklabels=categories,
            annot=True, fmt='d', cmap='YlOrRd')
plt.title('Cross-Domain Citations')
plt.tight_layout()
plt.show()