# CORD-19 Research Papers - Data Analysis

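This notebook loads the CORD-19 `metadata.csv` file, inspects its shape and missing values, cleans it (parsing publication dates and dropping rows that lack a title or a valid date), and visualizes publications per year, the most frequent journals, and a word cloud of paper titles.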

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

## Load Data

In [2]:
# Location of the metadata file, relative to the notebook's working directory.
METADATA_PATH = "metadata.csv"

# Read the whole file into the raw frame that later cells build on.
df = pd.read_csv(METADATA_PATH)

# Preview the first five rows as the cell's rich output.
df.head(5)

## Explore Data

In [3]:
# Dimensions of the raw frame as (rows, columns).
print(df.shape)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()

# The ten columns with the most missing values, shown as the cell output.
df.isnull().sum().sort_values(ascending=False).head(10)

## Data Cleaning

In [4]:
# Parse publication dates; values that cannot be parsed become NaT instead of
# raising.
# NOTE(review): on pandas >= 2.0 the date format is inferred from the first
# value, so rows written in a different format (e.g. year-only) may be coerced
# to NaT and then dropped below — confirm against the data; format="mixed" may
# be needed.
df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')

# Publication year; float-valued here because rows with NaT yield NaN.
df['year'] = df['publish_time'].dt.year

# Whitespace-delimited word count of each abstract; a missing abstract is
# treated as an empty string and therefore counts as 0 words.
df['abstract_word_count'] = df['abstract'].fillna("").str.split().str.len()

# Keep only rows that have both a title and a parsed publication date.
# After that filter every remaining year is non-null, so it can be stored as an
# integer (2020 rather than 2020.0). astype() returns a new frame, so df_clean
# is independent of df and can be modified later without
# SettingWithCopyWarning.
df_clean = (
    df.dropna(subset=['title', 'publish_time'])
    .astype({'year': 'int64'})
)

## Publications by Year

In [5]:
# Number of papers per publication year, ordered by year.
year_counts = df_clean['year'].value_counts().sort_index()

# Draw on an explicit Axes instead of going through the pyplot state machine.
fig, ax = plt.subplots()
ax.bar(year_counts.index, year_counts.values)
ax.set_title("Publications by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Papers")
plt.show()

## Top Journals

In [6]:
# The ten journals with the most papers, most frequent first.
top_journals = df_clean['journal'].value_counts().head(10)

# Horizontal bars: journal names on the y-axis, paper counts on the x-axis.
fig, ax = plt.subplots()
sns.barplot(y=top_journals.index, x=top_journals.values, ax=ax)
ax.set_title("Top Journals Publishing COVID-19 Papers")
# Label both axes so the figure stands on its own, matching the
# "Publications by Year" chart.
ax.set_xlabel("Number of Papers")
ax.set_ylabel("Journal")
plt.show()

## Word Cloud of Titles

In [7]:
# Concatenate all titles into one text blob. Missing titles are skipped first,
# then the rest are coerced to str so a non-string value cannot break join().
titles = " ".join(df_clean['title'].dropna().astype(str))

# A fixed random_state makes the word layout and colours reproducible across
# runs; without it every execution produces a different image.
wc = WordCloud(
    width=800,
    height=400,
    background_color="white",
    random_state=42,
).generate(titles)

# Render the cloud as an image with the axes hidden.
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()