In [8]:
# analysis.ipynb
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud

# Load dataset
df = pd.read_csv("data/metadata.csv")

# Inspect the raw frame before any cleaning.
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.isnull().sum().head())

# Clean: keep only the columns used downstream, parse the date, and drop rows
# that cannot be used. The result is built as a chain of new frames rather than
# by assigning columns onto a slice, which avoids SettingWithCopyWarning.
df = (
    df[['title', 'abstract', 'publish_time', 'journal', 'source_x']]
    .assign(
        # Unparseable dates become NaT so the dropna below removes them.
        # NOTE(review): on pandas >= 2 a column with mixed date formats may be
        # coerced to NaT wholesale unless format="mixed" is passed — confirm the
        # row count printed below is plausible.
        publish_time=lambda d: pd.to_datetime(d['publish_time'], errors='coerce')
    )
    # Drop rows with missing title or date
    .dropna(subset=['title', 'publish_time'])
    # Derived after the dropna: with no NaT left, the year keeps an integer
    # dtype (2020 rather than 2020.0).
    .assign(year=lambda d: d['publish_time'].dt.year)
)
print("After cleaning:", df.shape)

# Add abstract word count; a missing abstract counts as 0 words.
df = df.assign(
    abstract_wc=lambda d: d['abstract'].fillna("").str.split().str.len()
)


ModuleNotFoundError: No module named 'wordcloud'

In [None]:
# All figures are written under images/. Create the directory up front so
# savefig() does not fail with FileNotFoundError on a fresh checkout.
Path("images").mkdir(parents=True, exist_ok=True)

# Publications by year
# Cast to int so the tick labels read "2020" even if the year column was stored
# as float (which happens when it was derived while NaT dates were present).
year_counts = df['year'].dropna().astype(int).value_counts().sort_index()
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x=year_counts.index, y=year_counts.values, color="skyblue", ax=ax)
ax.set(title="Publications by Year", xlabel="Year", ylabel="Number of publications")
# tight_layout keeps labels inside the canvas of the saved PNG.
fig.tight_layout()
fig.savefig("images/pubs_by_year.png")
plt.show()

# Top journals
top_journals = df['journal'].value_counts().head(10)
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(y=top_journals.index, x=top_journals.values, palette="viridis", ax=ax)
ax.set(title="Top 10 Journals", xlabel="Number of publications", ylabel="Journal")
# Journal names on the y axis can be long; without tight_layout they are
# clipped in the saved file.
fig.tight_layout()
fig.savefig("images/top_journals.png")
plt.show()

# Word cloud of titles
# astype(str) guards the join against any non-string title values.
text = " ".join(df['title'].dropna().astype(str))
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
fig.savefig("images/wordcloud.png")
plt.show()
