In [None]:
# Part 1: Load and Explore the Dataset
import pandas as pd

# Load the dataset (make sure metadata.csv is in the same folder)
try:
    df = pd.read_csv("metadata.csv")
except FileNotFoundError as exc:
    # Stop the cell here. Merely printing the problem would let execution fall
    # through to code that needs `df`, which then fails with a confusing
    # NameError (or silently reuses a stale `df` left in the kernel).
    raise FileNotFoundError(
        "Error: metadata.csv not found. Please add the dataset file."
    ) from exc

# Only reached when the read succeeded.
print("Dataset loaded successfully!")

# Inspect the first few rows
print(df.head())

# Dataset shape (rows, columns)
print("Shape:", df.shape)

# Info about columns
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
df.info()

# Missing values: the 20 columns with the most nulls, worst first
missing_per_column = df.isnull().sum().sort_values(ascending=False)
print("Missing values:\n", missing_per_column.head(20))


In [None]:
# Convert publish_time to datetime *before* filtering: values that cannot be
# read as a date are coerced to NaT, so the dropna() below removes them together
# with the values that were missing from the start.
publish_time = pd.to_datetime(df["publish_time"], errors="coerce")

# Report rows that had a publish_time value which could not be parsed, so the
# extra data loss is visible rather than silent.
unparseable_count = int((publish_time.isna() & df["publish_time"].notna()).sum())
if unparseable_count:
    print(f"Dropping {unparseable_count} rows with an unparseable publish_time")

# Drop rows where title or publish_time is missing (critical info)
df_clean = (
    df.assign(publish_time=publish_time)
    .dropna(subset=["title", "publish_time"])
    .copy()
)

# Extract year for analysis. No NaT is left at this point, so the column keeps
# an integer dtype instead of being upcast to float to hold NaN.
df_clean["year"] = df_clean["publish_time"].dt.year

# Create a new feature: abstract word count.
# A missing abstract counts as 0 words; astype(str) guards against non-string
# cells, and the vectorised .str accessor replaces a per-row Python lambda.
df_clean["abstract_word_count"] = (
    df_clean["abstract"].fillna("").astype(str).str.split().str.len()
)

print(df_clean[["title", "year", "abstract_word_count"]].head())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# 1. Count papers per year
# value_counts() skips missing years, so the index holds only real year values.
year_counts = df_clean["year"].value_counts().sort_index()

# "year" is a float column whenever some dates failed to parse (NaN forces the
# upcast). The bar plot treats x as categories and would label them "2020.0",
# so cast the labels to int for display; year_counts itself is left untouched.
year_labels = year_counts.index.astype(int)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=year_labels, y=year_counts.values, palette="Blues_d", ax=ax)
ax.set_title("Publications by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Papers")
plt.show()

# 2. Top journals
# value_counts() sorts by frequency (descending), so the first ten entries are
# the ten journals with the most papers.
top_journals = df_clean["journal"].value_counts().head(10)

# Horizontal bars: journal names on the y axis, paper counts on the x axis.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    x=top_journals.values,
    y=top_journals.index,
    palette="Greens_d",
    ax=ax,
)
ax.set(
    title="Top 10 Journals Publishing COVID-19 Research",
    xlabel="Number of Papers",
    ylabel="Journal",
)
plt.show()

# 3. Word Cloud of titles
# Concatenate every non-null title into one space-separated string; that single
# string is what the word cloud is generated from.
title_strings = df_clean["title"].dropna().astype(str)
text = " ".join(title_strings)

cloud_builder = WordCloud(width=800, height=400, background_color="white")
wordcloud = cloud_builder.generate(text)

# Show the rendered cloud as an image, without axis ticks or frame.
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
ax.set_title("Most Frequent Words in Titles")
plt.show()

# 4. Papers by Source
# Ten most frequent values of the source_x column, most common first.
source_counts = df_clean["source_x"].value_counts().head(10)

# Horizontal bars: source names on the y axis, paper counts on the x axis.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    x=source_counts.values,
    y=source_counts.index,
    palette="Oranges_d",
    ax=ax,
)
ax.set(
    title="Top Sources of Papers",
    xlabel="Number of Papers",
    ylabel="Source",
)
plt.show()
