# Basic Usage of the Warp Drive Research Dataset

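This notebook demonstrates how to load the `GotThatData/warp-speed` dataset and explore it: a quick overview, the distribution of paper categories, basic filtering, and a keyword analysis of the abstracts.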

In [None]:
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset.
# Without `split`, load_dataset returns a DatasetDict keyed by split name: len()
# would count splits rather than records, and it has no `.features` and cannot be
# sliced by row. Requesting one split yields a row-indexable Dataset instead.
# NOTE(review): assumes the records live in the default "train" split — confirm
# against the dataset card.
dataset = load_dataset("GotThatData/warp-speed", split="train")

## Dataset Overview

In [None]:
# Report the size of the dataset, then its feature schema
num_entries = len(dataset)
print(f"Dataset size: {num_entries}")
feature_schema = dataset.features
print(f"Features: {feature_schema}")

# Preview the first five entries (bare last expression -> shown as the cell output)
dataset[:5]

## Category Distribution

In [None]:
# Materialise the dataset as a DataFrame so pandas/seaborn can work with it
df = pd.DataFrame(dataset)

# Horizontal count plot of papers per category, ordered by descending frequency
# (value_counts sorts most-common first by default)
plt.figure(figsize=(12, 6))
category_order = df['category'].value_counts().index
ax = sns.countplot(data=df, y='category', order=category_order)
ax.set_title('Papers by Category')
plt.show()

## Basic Filtering

In [None]:
# Filter papers by category (exact match on the 'category' field)
physics_papers = dataset.filter(lambda paper: paper['category'] == 'physics')
print(f"Number of physics papers: {len(physics_papers)}")

# Filter by keyword in abstract (case-insensitive substring match).
# `or ''` treats a missing (None) abstract as empty text, so such a record is
# simply excluded instead of raising AttributeError on `.lower()`.
warp_papers = dataset.filter(
    lambda paper: 'warp' in (paper['abstract'] or '').lower()
)
print(f"Number of papers mentioning 'warp': {len(warp_papers)}")

## Text Analysis

In [None]:
from collections import Counter
import nltk
from nltk.corpus import stopwords

# Download required NLTK data
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases make word_tokenize load its models from the 'punkt_tab'
# resource rather than 'punkt'; fetch both so tokenization works regardless of
# the installed NLTK version.
nltk.download('punkt_tab')

# English stopwords, loaded once up front so get_keywords does not reload the
# stopword list and rebuild the set on every call.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


# Function to get keywords from text
def get_keywords(text, stops=ENGLISH_STOPWORDS):
    """Return the keyword tokens found in ``text``.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``; a token is
    kept only if it is purely alphanumeric and is not in ``stops``.

    Args:
        text: Text to analyse. ``None`` or an empty string yields an empty list.
        stops: Collection of words to exclude. Defaults to NLTK's English
            stopword list.

    Returns:
        A list of keyword strings in order of appearance (duplicates preserved).
    """
    # Defensive guard: a missing abstract produces no keywords instead of
    # raising AttributeError on `.lower()`.
    if not text:
        return []
    return [
        token
        for token in nltk.word_tokenize(text.lower())
        if token.isalnum() and token not in stops
    ]

# Gather the keywords of every abstract into one flat list
all_keywords = [
    keyword
    for paper in dataset
    for keyword in get_keywords(paper['abstract'])
]

# Rank keywords by frequency, then split the (word, count) pairs into
# two parallel sequences for plotting
keyword_freq = Counter(all_keywords).most_common(20)
keywords, counts = zip(*keyword_freq)

# Bar chart of the most frequent keywords; labels are rotated to stay legible
plt.figure(figsize=(12, 6))
plt.bar(keywords, counts)
plt.title('Top 20 Keywords in Abstracts')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()