In [None]:
# Part 1: 
# Load and Explore Data
import pandas as pd

# Load data
# low_memory=False is passed so pandas reads the file in a single pass
# rather than in internal chunks.
df = pd.read_csv("metadata.csv", low_memory=False)
print("Shape:", df.shape)
print(df.head())
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
# Number of missing values per column, for the first 20 columns only.
print(df.isnull().sum().head(20))


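This section loads the data from the .csv file and explores it: it prints the shape, the first five rows, the column summary, and the number of missing values in each of the first 20 columns.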

In [None]:
# Part 2: Clean & Prepare
# Drop rows with missing critical info.
# .copy() makes df_clean an independent DataFrame, so the column assignments
# below do not raise SettingWithCopyWarning or act on a slice of df.
df_clean = df.dropna(subset=["title", "abstract", "publish_time"]).copy()

# Convert publish_time to datetime. errors="coerce" turns values that cannot
# be parsed into NaT, and those rows then get NaN in the year column.
df_clean["publish_time"] = pd.to_datetime(df_clean["publish_time"], errors="coerce")
df_clean["year"] = df_clean["publish_time"].dt.year

# Feature: abstract word count (number of whitespace-separated tokens).
# Rows with a missing abstract were already dropped above, so no fillna is needed.
df_clean["abstract_word_count"] = df_clean["abstract"].str.split().str.len()

df_clean.head()

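This section cleans and prepares the data to help ensure its integrity and accuracy: rows missing a title, abstract, or publication date are dropped, the publication date is converted to a datetime, and publication-year and abstract-word-count columns are added.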

In [None]:
# Analysing and Visualizing

import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud

# Publications per year: count papers per year, ordered chronologically.
publications_by_year = df_clean["year"].value_counts().sort_index()
year_ax = publications_by_year.plot.bar(figsize=(10, 5))
year_ax.set_title("Publications per Year")
plt.show()

# Top journals: the ten journals with the most papers, as horizontal bars.
top_journals = df_clean["journal"].value_counts().head(10)
journal_ax = top_journals.plot.barh(figsize=(8, 5))
journal_ax.set_title("Top Journals Publishing COVID-19 Papers")
plt.show()

# All non-null titles, reused for both the word frequency and the word cloud.
titles = df_clean["title"].dropna()

# Word frequency in titles: lower-case the joined titles, split on whitespace,
# and keep the 20 most common tokens.
words = " ".join(titles).lower().split()
word_freq = Counter(words).most_common(20)
print(word_freq)

# Word cloud of titles, built from the titles in their original casing.
text = " ".join(titles)
wc = WordCloud(width=800, height=400, background_color="white").generate(text)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")  # hide the axes around the word-cloud image
plt.show()


The third part analyzes the data and visualizes it so that it tells a story: publications per year, the top publishing journals, the most frequent words in titles, and a word cloud of the titles.