# CORD-19 Data Analysis (Notebook)

# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Load data
# Read the metadata CSV from the current working directory.
df = pd.read_csv('metadata.csv')

# Inspect
# (rows, columns) of the raw table.
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print(df.head())
# Number of missing values in each column.
print(df.isnull().sum())

# Clean
# Keep only the columns used by the analysis below.
df = df[['title', 'abstract', 'publish_time', 'authors', 'journal', 'source_x']]
# Unparseable dates become NaT (errors='coerce') instead of raising.
# NOTE(review): if publish_time mixes formats (e.g. year-only values next to
# full dates), pandas >= 2.0 infers a single format from the first value and
# coerces the rest to NaT -- confirm against the data.
df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')
# Drop rows without a title or a parseable date *before* deriving the year.
# While NaT values are present, .dt.year yields a float column (NaN needs a
# float dtype), and it stays float after the rows are dropped, so years would
# show up as 2020.0 in counts and axis labels. The explicit copy keeps the
# column assignments below clear of pandas' SettingWithCopyWarning.
df = df.dropna(subset=['title', 'publish_time']).copy()
df['year'] = df['publish_time'].dt.year.astype(int)
# Whitespace-separated token count of the abstract; a missing abstract counts as 0.
df['abstract_word_count'] = df['abstract'].fillna('').apply(lambda text: len(text.split()))

# Analysis

# Papers per publication year, ordered by the year index.
year_counts = df['year'].value_counts()
year_counts = year_counts.sort_index()
year_counts.plot.bar(title='Publications by Year')
plt.show()

# The ten most frequent journal names (value_counts sorts by descending count).
top_journals = df['journal'].value_counts().iloc[:10]
# Counts go on the x-axis and journal names on the y-axis.
sns.barplot(x=top_journals.values, y=top_journals.index)
plt.title('Top Journals')
plt.show()

# Join every non-null title into a single space-separated string.
titles = ' '.join(df['title'].dropna())
wordcloud = WordCloud(background_color='white', height=400, width=800)
wordcloud.generate(titles)  # generate() returns the same WordCloud instance
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Paper Titles')
plt.show()