# Covid-19 Vaccination EDA (India)

##### Requirements step (uncomment the code below to install any missing libraries if you are not using a Jupyter or Anaconda installation)

In [None]:
# !pip install numpy pandas matplotlib seaborn

#### Import the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#### Read the dataset

In [None]:
# Load the country-level vaccination CSV and preview its first rows
csv_path = '../data/processed/country_vaccinations.csv'
df = pd.read_csv(csv_path)
df.head(5)

In [None]:
# Dimensions of the full dataset as (rows, columns)
df.shape
# rows = 86512, cols = 15

In [None]:
# Column labels of the full (all-country) dataset
df.columns

In [None]:
# Keep only India's rows. The explicit copy makes dfi independent of df, so
# the column assignments made later in this notebook act on dfi itself
# rather than on a filtered slice (which raises SettingWithCopyWarning).
dfi = df[df['country'] == 'India'].copy()
dfi.shape

In [None]:
# Preview the first rows of the India subset
dfi.head()

#### Data Preprocessing

#### 1) Handling Null values

In [None]:
# Number of missing values in each column of the India subset
dfi.isnull().sum()

In [None]:
# Data type of each column in the India subset
dfi.dtypes

In [None]:
# nv = null value
# Count the missing entries per column, then keep only the columns that
# actually contain at least one missing entry.
missing_per_column = dfi.isna().sum()
has_missing = missing_per_column > 0
nv_dfi = missing_per_column[has_missing]
nv_dfi

In [None]:
# Replace missing values with the column median for every column listed in
# nv_dfi. nv_dfi is built from all columns, so non-numeric columns are skipped
# explicitly: a median is undefined for text and would raise instead of filling.
# NOTE(review): columns such as total_vaccinations look cumulative over time;
# a single median value may not be a sensible fill for them - confirm whether
# interpolation or forward-fill is more appropriate.
for col in nv_dfi.index:
    if pd.api.types.is_numeric_dtype(dfi[col]):
        dfi[col] = dfi[col].fillna(dfi[col].median())

In [None]:
# Re-count missing values per column to check the result of the fill step
dfi.isnull().sum()

#### 2) Handling Duplicates

In [None]:
# Number of rows that are exact duplicates of an earlier row
dfi.duplicated().sum()

#### 3) Check the data types

In [None]:
# Inspect column dtypes to spot columns that need conversion
dfi.dtypes

In [None]:
# Parse the 'date' column into datetimes and store the result as a new
# 'date1' column, then show the dtypes to confirm the conversion.
parsed_dates = pd.to_datetime(dfi['date'])
dfi['date1'] = parsed_dates
dfi.dtypes

In [None]:
# Print the earliest and latest dates present in the India data
earliest, latest = dfi['date1'].min(), dfi['date1'].max()
print(earliest)  # 15 Jan, 21
print(latest)  # 29 Mar, 22

In [None]:
# First ten values of the original 'date' column in ascending order
dfi['date'].sort_values().head(10)

#### Drop Redundant Features

In [None]:
# List the current columns to decide which ones are redundant
dfi.columns

In [None]:
# Look at sample rows to see what each column holds
dfi.head()

In [None]:
# Print the frequency of each distinct value in the source/vaccine columns
for column in ('source_name', 'source_website', 'vaccines'):
    print(dfi[column].value_counts())

In [None]:
# Remove the columns treated as redundant for the India-only analysis.
# The result is reassigned instead of dropping in place: dfi originates from
# a filtered selection of df, and in-place mutation of such a frame triggers
# SettingWithCopyWarning.
redundant_columns = ['country', 'iso_code', 'source_name',
                     'source_website', 'vaccines']
dfi = dfi.drop(columns=redundant_columns)
dfi.head()

In [None]:
# List the columns that remain in dfi
dfi.columns

In [None]:
# Derive calendar features from the parsed date column
date_accessor = dfi['date1'].dt
dfi['Year'] = date_accessor.year
dfi['Qtr'] = date_accessor.quarter
dfi['Month'] = date_accessor.month  # month number
dfi.columns

In [None]:
# Preview the first 15 rows, including the new Year/Qtr/Month columns
dfi.head(15)

In [None]:
# Plot total vaccinations and people vaccinated against the date
curves = (
    ('total_vaccinations', 'Total Vaccinations'),
    ('people_vaccinated', 'People vaccinated'),
)
for column, legend_label in curves:
    plt.plot(dfi['date1'], dfi[column], label=legend_label)
plt.legend()
plt.grid()
plt.show()

#### Qtr wise Total Vaccinations

In [None]:
# Columns available for the quarter-wise plot
dfi.columns

In [None]:
# Bar height is the mean of total_vaccinations within each quarter.
# Quarter numbers repeat every year, so the bars are split by Year; without
# the hue, a quarter that occurs in more than one year would be pooled into
# a single bar.
sns.barplot(data=dfi, x='Qtr', y='total_vaccinations', hue='Year')
plt.show()

In [None]:
# Plot both daily-vaccination columns over time; each curve is labelled with
# its column name.
for column in ('daily_vaccinations_raw', 'daily_vaccinations'):
    plt.plot(dfi['date1'], dfi[column], label=column)
plt.legend()
plt.grid()
plt.show()

In [None]:
# Same two daily-vaccination curves, drawn with seaborn's lineplot
dates = dfi['date1']
daily_columns = ['daily_vaccinations_raw', 'daily_vaccinations']
for column in daily_columns:
    sns.lineplot(x=dates, y=dfi[column], label=column)
plt.legend()
plt.grid()
plt.show()

In [None]:
# Columns available for the per-hundred plots
dfi.columns

In [None]:
# Draw the three per-hundred columns over time, each labelled with its
# column name.
per_hundred_columns = (
    'total_vaccinations_per_hundred',
    'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred',
)
for column in per_hundred_columns:
    sns.lineplot(x=dfi['date1'], y=dfi[column], label=column)
plt.legend()
plt.grid()
plt.show()

#### Month wise Total Vaccinations in decreasing order

In [None]:
# Columns available for the month-wise aggregation
dfi.columns

In [None]:
# Mean of total_vaccinations for each month number.
# NOTE(review): grouping is by month number only, so the same month from
# different years falls into one group - confirm this pooling is intended.
by_month = dfi.groupby('Month')
a1 = by_month['total_vaccinations'].mean()
a1

In [None]:
# Bar chart of the monthly means, with bars ordered from highest to lowest
months_high_to_low = a1.sort_values(ascending=False).index
sns.barplot(x=a1.index, y=a1.values, order=months_high_to_low)
plt.title('Month wise Mean of Total Vaccinations')
plt.show()

In [None]:
# Preview the first rows of dfi again
dfi.head()

In [None]:
# Joint plot of total vaccinations against people fully vaccinated
totals = dfi['total_vaccinations']
fully_vaccinated = dfi['people_fully_vaccinated']
sns.jointplot(x=totals, y=fully_vaccinated, color='orange')
plt.show()

In [None]:
# Scatter plot of total vaccinations against people fully vaccinated
sns.scatterplot(data=dfi, x='total_vaccinations', y='people_fully_vaccinated')
plt.show()

In [None]:
# Number of distinct countries in the full dataset, then rows per country
countries = df['country']
print(countries.nunique())
countries.value_counts()

###### Plot Types for Univariate, Bivariate and Multivariate Analysis

1. Univariate:
> countplot, boxplot, violinplot, kdeplot, histplot, distplot

2. Bivariate :
> barplot, countplot, boxplot, violinplot, lineplot, stripplot, swarmplot, heatmap, pairplot, scatterplot, jointplot

3. Multi-variate :
> pairplot, heatmap, scatterplot, boxplot, violinplot, stripplot, swarmplot

Spines are the border lines around the plotting area; they carry the axis tick marks. A plot with its spines removed has no visible boundary on those sides. The `seaborn.despine()` function removes spines from a plot; by default it removes the top and right ones.

In [None]:
# Load seaborn's "titanic" example dataset, used below for the despine demos,
# and preview its first rows.
titanic = sns.load_dataset("titanic")
titanic.head()

In [None]:
# Baseline strip plot of age by "who", coloured by "alive". The despine call
# below stays commented out, so no spines are removed in this cell.
sns.stripplot(x="age", y="who", hue="alive", data=titanic)
# sns.despine()
plt.show()

In [None]:
# Same strip plot, with despine() applied using its default arguments
ax = sns.stripplot(data=titanic, x="age", y="who", hue="alive")
sns.despine(ax=ax)
plt.show()

In [None]:
# Same strip plot; remove the top and bottom spines and keep the right one
ax = sns.stripplot(data=titanic, x="age", y="who", hue="alive")
sns.despine(ax=ax, bottom=True, right=False, top=True)
plt.show()

In [None]:
# Same strip plot with all four spines removed
ax = sns.stripplot(data=titanic, x="age", y="who", hue="alive")
sns.despine(ax=ax, left=True, bottom=True, right=True, top=True)
plt.show()

In [None]:
# Same strip plot; keep the top and right spines and pass a per-side offset
# to despine.
spine_offsets = {"top": 3, "bottom": 5, "right": 6, "left": 4}
ax = sns.stripplot(data=titanic, x="age", y="who", hue="alive")
sns.despine(ax=ax, offset=spine_offsets, top=False, right=False)
plt.show()

----

### End Of EDA

----