# 📊 CORD-19 Data Analysis

This notebook demonstrates data loading, cleaning, exploration, and visualization using the **CORD-19 metadata.csv** file.

In [None]:
# Part 1: Data Loading and Basic Exploration
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter

# Load dataset
# low_memory=False asks pandas not to infer dtypes chunk-by-chunk, which
# avoids mixed-dtype columns (and the accompanying DtypeWarning) on wide CSVs.
df = pd.read_csv("metadata.csv", low_memory=False)

# Basic info
print("Shape:", df.shape)
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
# Last expression of the cell: rendered as a rich table by the notebook.
df.head()

In [None]:
# Check missing values
# Count nulls per column, then show the 20 columns with the most nulls.
null_counts = df.isnull().sum()
null_counts.sort_values(ascending=False).head(20)

In [None]:
# Part 2: Data Cleaning and Preparation

# Parse publication dates; values that cannot be parsed become NaT instead of
# raising (errors="coerce").
# NOTE(review): if publish_time mixes formats (e.g. full dates and year-only
# strings), some pandas versions may coerce the non-matching rows to NaT --
# confirm against the installed pandas version.
publish_dates = pd.to_datetime(df['publish_time'], errors="coerce")
df['publish_time'] = publish_dates
df['year'] = publish_dates.dt.year


def _count_words(abstract_text):
    """Return the number of whitespace-separated tokens in ``abstract_text``."""
    return len(abstract_text.split())


# Missing abstracts are treated as empty strings, so they count as 0 words.
abstracts_filled = df['abstract'].fillna("")
df['abstract_word_count'] = abstracts_filled.apply(_count_words)

# Preview the derived columns.
df[['publish_time', 'year', 'abstract_word_count']].head()

In [None]:
# Part 3: Analysis - Publications by Year
# value_counts() drops missing years by default; sort_index() puts the bars in
# chronological order.
year_counts = df['year'].value_counts().sort_index()

# 'year' is float whenever some publish_time values failed to parse (NaN forces
# a float dtype), which would label the bars "2020.0". The NaNs are already
# gone from the index here, so casting the labels to int is safe.
year_labels = year_counts.index.astype(int)

plt.figure(figsize=(10,5))
sns.barplot(x=year_labels, y=year_counts.values, color="skyblue")
plt.title("Publications by Year")
plt.xlabel("Publication year")
plt.ylabel("Number of publications")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Top Journals
# value_counts() sorts by frequency (descending), so the first ten rows are the
# ten most frequent journal names.
journal_counts = df['journal'].value_counts()
top_journals = journal_counts.iloc[:10]

# Horizontal bars: counts on the x-axis, journal names on the y-axis.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    x=top_journals.values,
    y=top_journals.index,
    color="lightgreen",
    ax=ax,
)
ax.set_title("Top Journals")
plt.show()

In [None]:
# Word frequency in titles
# Drop missing titles and make sure every remaining value is a string.
titles = df['title'].dropna().astype(str)

# Lower-case each title and split it on whitespace, collecting the tokens in
# title order (no punctuation stripping or stop-word removal is applied).
words = [token for title in titles for token in title.lower().split()]

# Tally the tokens and keep the 20 most frequent (word, count) pairs.
word_tally = Counter()
word_tally.update(words)
common_words = word_tally.most_common(20)
common_words

In [None]:
# Word Cloud of paper titles
# Relies on `titles` (the non-null title strings) from the word-frequency cell.
text = " ".join(titles)

# WordCloud places and colours words randomly; a fixed random_state makes the
# figure identical on every Restart & Run All.
wc = WordCloud(
    width=800,
    height=400,
    background_color="white",
    random_state=42,
).generate(text)

plt.figure(figsize=(12,6))
# Bilinear interpolation smooths the rendered image when it is scaled to the
# figure size.
plt.imshow(wc, interpolation="bilinear")
# Axes carry no meaning for a word cloud, so hide them.
plt.axis("off")
plt.show()