# 1. Understanding the Data:

> Import necessary libraries (e.g., pandas, matplotlib, seaborn).

> Load the COVID-19 dataset.

> Display basic information about the dataset (e.g., data types, missing values).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the COVID-19 CSV (path is relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='covid_19.csv')
# Bare expression so the notebook renders a preview of the loaded frame.
df

In [None]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
df.info()

# 2. Data Cleaning:

> Handle missing values

> Convert data types if necessary (e.g., convert `ObservationDate` to `datetime64[ns]`).

> Check for duplicates

In [None]:
# Count the missing entries in each column (isnull is an alias of isna).
df.isnull().sum()

In [None]:
# Fill missing values with a placeholder that matches each column's type:
# numeric columns get 0, every other column gets 'Unknown'. A blanket
# fillna(0) would write the integer 0 into text columns, leaving them with
# mixed types and a bogus "0" category in the categorical summary.
# 'ObservationDate' is skipped so that gaps in it are not replaced by a
# placeholder value before the column is parsed with pd.to_datetime.
fill_values = {
    column: 0 if pd.api.types.is_numeric_dtype(df[column]) else 'Unknown'
    for column in df.columns
    if column != 'ObservationDate'
}
# A dict passed as `value` fills each listed column with its own placeholder.
df.fillna(value=fill_values, inplace=True)

In [None]:
# Parse the observation dates and store them back as a datetime column.
observation_dates = pd.to_datetime(df['ObservationDate'])
df['ObservationDate'] = observation_dates

In [None]:
# Count the rows that are exact repeats of an earlier row, then report the total.
duplicate_count = df.duplicated().sum()
print("Number of duplicates:", duplicate_count)

# 3. Descriptive Statistics:

> Generate descriptive statistics (mean, median, min, max) for numeric columns.

> Explore summary statistics for categorical columns.

In [None]:
# Heading first, then the default describe() table as the cell's displayed result.
numeric_heading = "\nDescriptive Statistics for Numeric Columns:"
print(numeric_heading)
df.describe()

In [None]:
# Heading first, then the describe() table restricted to object-dtype columns.
categorical_heading = "\nSummary Statistics for Categorical Columns:"
print(categorical_heading)
df.describe(include='object')

# 4. Time Series Analysis:

> Analyze the trends in confirmed, deaths, and recovered cases over time.

> Create time-based visualizations.

In [None]:
# Sum the three case counts over all rows that share an observation date;
# the result is indexed by 'ObservationDate'.
df_time = df.groupby('ObservationDate')[['Confirmed', 'Deaths', 'Recovered']].sum()

In [None]:
# Draw one line per case type on a shared figure, using the date-indexed totals.
plt.figure(figsize=(12, 6))
for series_name in ('Confirmed', 'Deaths', 'Recovered'):
    # The column name doubles as the legend label.
    sns.lineplot(data=df_time, x='ObservationDate', y=series_name, label=series_name)
plt.title('COVID-19 Time Series Analysis')
plt.xlabel('Date')
plt.ylabel('Count')
plt.show()

# 5. Geographical Analysis:

> Analyze the distribution of cases across different countries/regions.

> Create geographical visualizations (e.g., world map).

In [None]:
# Sum the three case counts over every row belonging to each country/region.
# NOTE(review): if the counts are cumulative per observation date, summing
# across dates overstates the totals -- confirm against the dataset.
case_columns = ['Confirmed', 'Deaths', 'Recovered']
df_country = df.groupby(by='Country/Region')[case_columns].sum()

In [None]:
# Bar chart of the summed confirmed cases for every country/region.
plt.figure(figsize=(12, 6))
sns.barplot(x=df_country.index, y='Confirmed', data=df_country)
# One bar is drawn per country/region, so rotate and shrink the tick labels
# to keep them from overlapping along the x-axis.
plt.xticks(rotation=90, fontsize=6)
plt.title('Confirmed Cases by Country/Region')
plt.xlabel('Country/Region')
plt.ylabel('Confirmed Cases')
# Make room for the rotated labels so they are not clipped at the bottom.
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of the summed deaths for every country/region.
plt.figure(figsize=(12, 6))
sns.barplot(x=df_country.index, y='Deaths', data=df_country)
# One bar is drawn per country/region, so rotate and shrink the tick labels
# to keep them from overlapping along the x-axis.
plt.xticks(rotation=90, fontsize=6)
plt.title('Deaths by Country/Region')
plt.xlabel('Country/Region')
plt.ylabel('Deaths')
# Make room for the rotated labels so they are not clipped at the bottom.
plt.tight_layout()
plt.show()

# 6. Correlation Analysis:

> Analyze the correlation between confirmed, deaths, and recovered cases.

> Plot the correlation matrix as a heatmap.

In [None]:
# Pairwise correlation matrix of the three case-count columns
# (Pearson is the default method of DataFrame.corr).
case_counts = df.loc[:, ['Confirmed', 'Deaths', 'Recovered']]
correlation = case_counts.corr(method='pearson')

In [None]:
# Render the correlation matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Between COVID-19 Variables')
plt.show()

# 7. Advanced Analysis:

> Calculate the total number of confirmed cases, deaths, and recoveries for each country/region.

> Identify the countries/regions with the highest number of confirmed cases, deaths, and recoveries.

> Visualize the distribution of deaths using a bar chart.

In [None]:
# Per-country/region sums of confirmed cases, deaths and recoveries.
# NOTE(review): if the counts are cumulative per observation date, summing
# across dates overstates the totals -- confirm against the dataset.
grouped_by_country = df.groupby('Country/Region')
total_cases = grouped_by_country[['Confirmed', 'Deaths', 'Recovered']].sum()
# Bare expression so the notebook displays the resulting table.
total_cases

In [None]:
# For each metric, keep the ten countries/regions with the largest totals.
top_confirmed, top_deaths, top_recovered = (
    total_cases.nlargest(n=10, columns=metric)
    for metric in ('Confirmed', 'Deaths', 'Recovered')
)

In [None]:
# Bar chart of the death totals for the top-ranked countries/regions,
# drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))
top_deaths['Deaths'].plot.bar(ax=ax)
ax.set_xlabel('Country/Region')
ax.set_ylabel('Number of Deaths')
ax.set_title('Countries/Regions by Number of Deaths')
plt.show()