In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

# Step 2: Read the metadata CSV into a DataFrame.
# The path is relative to the directory the notebook kernel runs in.
metadata_path = 'data/metadata.csv'
df = pd.read_csv(metadata_path)

# Step 3: Preview the first rows (head() returns 5 by default); as the last
# expression of the cell, this is what gets displayed.
df.head(n=5)


In [None]:
# Overall size of the frame: row count first, then column count
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")

# Data type pandas inferred for each column
print(df.dtypes)

# Missing entries per column, reporting only the columns that have any
missing_data = df.isna().sum()
print(missing_data.loc[missing_data.gt(0)])

# Descriptive statistics (describe() defaults to the numeric columns
# whenever the frame contains any)
print(df.describe())


In [None]:
# Drop every row that has at least one missing value. dropna() returns a new
# frame, so `df` itself is left untouched.
# NOTE(review): df_cleaned is a snapshot taken at this point; columns added to
# `df` afterwards are not part of it. Dropping on *any* missing value can also
# discard a large share of the rows -- check df_cleaned.shape before relying on it.
df_cleaned = df.dropna()

# Alternative (intentionally not executed): instead of dropping rows, fill the
# gaps of a numeric column with that column's mean. 'column_name' below is a
# template placeholder -- indexing a column that does not exist raises KeyError
# and stops a "Run All" -- so substitute a real numeric column before enabling:
#     df['column_name'] = df['column_name'].fillna(df['column_name'].mean())

# Confirm that no missing values remain in the cleaned frame
print(df_cleaned.isnull().sum())


In [None]:
# Parse 'publish_time' once; errors='coerce' turns unparseable entries into NaT
# instead of raising.
publish_dates = pd.to_datetime(df['publish_time'], errors='coerce')
df['publish_time'] = publish_dates

# Publication year taken from the parsed dates (missing where the date is NaT)
df['year'] = publish_dates.dt.year


In [None]:
# Word count of each abstract.
# Missing abstracts are replaced by an empty string first so they count as
# 0 words; calling str() on NaN would produce the text 'nan' and report 1 word.
df['abstract_word_count'] = (
    df['abstract']
    .fillna('')
    .astype(str)    # tolerate any non-string entries, as str(x) did
    .str.split()    # no separator: split on runs of whitespace
    .str.len()
)


In [None]:
# Tally papers for each publication year, ordered chronologically.
# value_counts() leaves out missing years by default.
papers_per_year = (
    df['year']
    .value_counts()
    .sort_index()
)
print(papers_per_year)


In [None]:
# Bar chart of the yearly publication counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(papers_per_year.index, papers_per_year.values)
ax.set_title('Publications by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Publications')
plt.show()


In [None]:
# The ten journals with the most papers (value_counts() sorts by count,
# largest first, so head(10) keeps the top ten)
top_journals = df['journal'].value_counts().head(10)
print(top_journals)

# Draw those counts as a bar chart and label the Axes that pandas returns
journal_ax = top_journals.plot.bar(figsize=(10, 6))
journal_ax.set_title('Top 10 Journals Publishing COVID-19 Research')
journal_ax.set_xlabel('Journal')
journal_ax.set_ylabel('Number of Papers')
plt.show()


In [None]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
# NOTE(review): df_cleaned was produced by dropna() before 'year' and
# 'abstract_word_count' were added to `df`, so the export does not contain
# those derived columns -- confirm that is intended.
output_path = 'cleaned_data.csv'
df_cleaned.to_csv(output_path, index=False)
