# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Load the metadata table from its CSV file into a DataFrame.
df = pd.read_csv("data/metadata.csv")

# Quick overview, printed in order: (rows, columns) shape, per-column dtypes,
# per-column missing-value counts, and the summary table from describe().
for overview in (df.shape, df.dtypes, df.isna().sum(), df.describe()):
    print(overview)

# Data cleaning
# Parse publication dates; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')
# Publication year taken from the parsed date; missing where the date is NaT.
df['year'] = df['publish_time'].dt.year
# Whitespace-delimited word count of each abstract (0 when the abstract is
# missing).  astype(str) guards against non-string cells (e.g. numbers in a
# mixed-dtype column), which would otherwise fail on .split(); pandas' string
# methods replace the per-row Python lambda.
df['abstract_word_count'] = (
    df['abstract'].fillna("").astype(str).str.split().str.len()
)

# Analysis
# Number of rows per publication year, ordered by ascending year
# (value_counts leaves out rows whose year is missing).
year_counts = df['year'].value_counts()
papers_per_year = year_counts.sort_index()

# The ten most frequent journals, most frequent first.
journal_counts = df['journal'].value_counts()
top_journals = journal_counts.iloc[:10]

# Visualizations
# Publications per year.  'year' is a float column whenever any date failed
# to parse (missing values force a float dtype), which would label the bars
# "2020.0"; plot integer years instead.  Rows with no year are left out.
publication_years = df['year'].dropna().astype(int)
plt.figure(figsize=(8, 5))
sns.countplot(x=publication_years)
plt.title("Publications per Year")
plt.xticks(rotation=45)  # keep year labels readable when there are many years
plt.tight_layout()
plt.show()

# Bar chart of the most frequent journals (top_journals, computed above).
plt.figure(figsize=(8, 5))
top_journals.plot(kind='bar')
plt.title("Top Journals")
plt.tight_layout()  # journal names can be long; avoid clipped tick labels
plt.show()

# Word cloud
# Join all non-missing titles into one text.  astype(str) guards against
# non-string cells, which str.join would reject with a TypeError.
text = " ".join(df['title'].dropna().astype(str))
if text.strip():
    wc = WordCloud(width=800, height=400, background_color='white').generate(text)
    # Explicit 2:1 figure so the 800x400 image is not squeezed into the
    # default figure size.
    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()
else:
    # WordCloud.generate raises ValueError when it is given no words.
    print("No titles available; skipping word cloud.")
