# Covid 19 Insights and Analytics

![][some-id]

[some-id]: https://images.unsplash.com/photo-1584483766114-2cea6facdf57?q=80&w=2070&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D "Python Logo"

# Data Collection

We collected publicly available COVID-19 datasets, as provided by the WHO (World Health Organization), covering confirmed cases, death counts, and daily updates. The datasets are fetched from GitHub repositories that are freely available to everyone. The main links are:
- [Link 1 - Confirmed Cases](https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv)
- [Link 2 - Deaths Globally](https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv)
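- [Link 3 - Twitter Data](https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv) — **note:** this URL is the same deaths CSV as Link 2; the notebook actually reads the tweets from the local file `data/covid19_tweets.csv`.
- [Link 4 - Global Vaccination](https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv) — **note:** this URL is the same deaths CSV as Link 2; the notebook actually reads the vaccination data from the local file `data/global.json`.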

In [3]:

# Initial Imports
import random
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import chart_studio.plotly as py
import chart_studio.tools as tls
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

import plotly.express as px
import folium
from IPython.display import IFrame

import pymongo
from pymongo import MongoClient
from sqlalchemy import create_engine

%matplotlib inline
warnings.filterwarnings("ignore")

from wordcloud import WordCloud, ImageColorGenerator
from plotly.offline import iplot
import nltk
from nltk.tokenize import RegexpTokenizer

import nltk
import re
import string
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Mongo DB connection

In [4]:
client = MongoClient("localhost",27017)

In [5]:
db = client.db_dap2
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'db_dap2')

In [7]:
#reading 3 files and storing them
dataset1 = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv")
dataset2 = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
dataset3 = pd.read_csv('data/covid19_tweets.csv')
dataset4 = pd.read_json('data/global.json')
dataset3.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone,False
1,Tom Basile 🇺🇸,"New York, NY","Husband, Father, Columnist & Commentator. Auth...",2009-04-16 20:06:23,2253,1677,24,True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,Twitter for Android,False
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],Twitter for Android,False
3,ethel mertz,Stuck in the Middle,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],Twitter for iPhone,False
4,DIPR-J&K,Jammu and Kashmir,🖊️Official Twitter handle of Department of Inf...,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,False


In [8]:
# Serialise each frame as a list of per-row dicts (orient="records").
dataset1_dict, dataset2_dict, dataset3_dict, dataset4_dict = (
    frame.to_dict(orient="records")
    for frame in (dataset1, dataset2, dataset3, dataset4)
)

In [None]:
db.collection1.insert_many(dataset1_dict)
db.collection2.insert_many(dataset2_dict)
db.collection3.insert_many(dataset3_dict)
db.collection4.insert_many(dataset4_dict)

In [None]:
print(list(db.collection3.find().limit(1)))

In [None]:
# Read every MongoDB collection back into its own DataFrame.
collection1 = db["collection1"]
r1 = collection1.find()
conf_cases_df = pd.DataFrame(r1)

collection2 = db["collection2"]
r2 = collection2.find()
death_cases_df = pd.DataFrame(r2)

collection3 = db["collection3"]
r3 = collection3.find()
twitter = pd.DataFrame(r3)

collection4 = db["collection4"]
# Fixed: this previously queried collection3, so vaccination_df was a second
# copy of the tweets instead of the vaccination documents.
r4 = collection4.find()
vaccination_df = pd.DataFrame(r4)

In [None]:
def drop_id(dataframes):
    """Remove the ``_id`` column from every DataFrame in ``dataframes``, in place.

    Frames that have no ``_id`` column are left untouched, so the cell can be
    re-run without raising ``KeyError``.

    Parameters
    ----------
    dataframes : iterable of pandas.DataFrame
        Frames to modify in place.

    Returns
    -------
    None
    """
    for frame in dataframes:
        # errors='ignore' makes the drop idempotent (safe on a second run).
        frame.drop(columns='_id', inplace=True, errors='ignore')
drop_id([conf_cases_df,
         death_cases_df,twitter
])

In [None]:
# Confirmed Cases of COVID 19
conf_cases_df.head(5)

In [None]:
# Death Cases of Covid19
death_cases_df.head(5)

In [None]:
# Twitter Tweets over coronavirus
twitter.head()

In [None]:
# Build the vaccination table from the semi-structured documents stored in MongoDB.
# Each document is expected to carry its attributes under a nested 'fields' key;
# a document without that key contributes None (False for HasVaccine) in every column.
# Connect to MongoDB
client = MongoClient('localhost', 27017)  # re-creates the client already opened earlier in the notebook
db = client['db_dap2']  # same database name as used earlier in the notebook
collection = db['collection4']  # collection the global.json records were inserted into

# One list per output column; all lists are appended to in lock-step below
country_names = []
regions = []
income_levels = []
country_codes = []
populations = []
has_vaccines = []
income_level_name = []
response_date = []

# Data retrieval from MongoDB: one pass over every document in the collection
cursor = collection.find()
for country_info in cursor:
    fields = country_info.get('fields', {})
    country_names.append(fields.get('countryName'))
    regions.append(fields.get('wbRegion'))
    income_levels.append(fields.get('wbIncomeLevelName'))
    country_codes.append(fields.get('wbCountryCode'))
    populations.append(fields.get('wbPopulation2019'))
    has_vaccines.append(fields.get('owidHasVaccine', False))  # a missing flag is treated as "no vaccine"
    # NOTE(review): same source key as income_levels above, so the IncomeLevel and
    # IncomeLevelName columns are identical - confirm whether a different key was intended.
    income_level_name.append(fields.get('wbIncomeLevelName'))
    response_date.append(fields.get('mostRecentResponseDate'))

# DataFrame creation: one row per document, one column per list
vaccination_df = pd.DataFrame({'CountryName': country_names,
                                'Region': regions,
                                'IncomeLevel': income_levels,
                                'CountryCode': country_codes,
                                'Population': populations,
                                'HasVaccine': has_vaccines,
                                'IncomeLevelName': income_level_name,
                                'ResponseDate': response_date})

vaccination_df

### Checking the timeline

In [None]:
# Lets grab the dates columns only for now. 

death_cases_df.columns[4:]

### Null Values Checkpoint

In [None]:
# Count the missing (NaN) values in every column of the deaths table
death_cases_df.isna().sum()

In [None]:
death_cases_df.dropna(subset=['Lat'],inplace=True)

In [None]:
conf_cases_df.dropna(subset=['Lat'],inplace=True)

In [None]:
conf_cases_df.isna().sum()

# Postgres Integration
Now we connect and save the Datasets to Postgres

In [None]:
import getpass
import os

import psycopg2

# Connection settings are read from the standard libpq environment variables,
# falling back to the values this notebook used before.
db_params = {
    "host": os.environ.get("PGHOST", "localhost"),
    "database": os.environ.get("PGDATABASE", "dap"),
    "user": os.environ.get("PGUSER", "postgres"),
    # The password must never be stored in the notebook: take it from the
    # environment, or prompt for it interactively when PGPASSWORD is not set.
    "password": os.environ.get("PGPASSWORD") or getpass.getpass("PostgreSQL password: "),
}
conn = psycopg2.connect(**db_params)

In [None]:
from urllib.parse import quote_plus

cursor = conn.cursor()
# Percent-encode the credentials: a user name or password containing '@', ':'
# or '/' would otherwise produce a malformed connection URL.
engine = create_engine(
    "postgresql://{user}:{password}@{host}/{database}".format(
        user=quote_plus(db_params["user"]),
        password=quote_plus(db_params["password"]),
        host=db_params["host"],
        database=db_params["database"],
    )
)

In [None]:
df1 = conf_cases_df.copy()
df2 = death_cases_df.copy()
df3 = twitter.copy()
df4 = vaccination_df.copy()

In [None]:
# Frames and their destination table names, paired by position
dataframes = [df1,df2,df3,df4]
table_names = ["c1","c2","c3","c4"]

# NOTE(review): no ObjectId conversion is performed in this cell; the frames are written as they are

# Write each frame to its PostgreSQL table through the SQLAlchemy engine.
# if_exists='replace' drops and recreates the table, so re-running this cell is safe.
for df, table_name in zip(dataframes, table_names):
    df.to_sql(table_name, engine, if_exists='replace', index=False)

# NOTE(review): `conn` is the separate psycopg2 connection, not the engine used
# by to_sql above, so this commit does not affect the rows just written.
conn.commit()
# The psycopg2 connection is intentionally left open here (close call disabled)
#conn.close()

In [None]:
### Fetching from postgres

In [None]:
dict1 = {}
def fetch(tables):
    """Load PostgreSQL tables into ``dict1`` as DataFrames.

    Tables are stored under descending integer keys in the order given: the
    first table under 4, the second under 3, and so on down to 1. At most four
    tables are loaded; any further names are ignored (as before).

    Parameters
    ----------
    tables : iterable of str
        Table names to read. They are interpolated into the SQL text, so they
        must be trusted identifiers, never user input.
    """
    # One connection for the whole batch, always closed afterwards
    # (previously a new connection was opened per table and never closed).
    conn = psycopg2.connect(**db_params)
    try:
        for key, table in zip(range(4, 0, -1), tables):
            query = (f"SELECT * FROM {table};")
            dict1[key] = pd.read_sql_query(query, conn)
    finally:
        conn.close()


fetch(['c1','c2','c3','c4'])

In [None]:
# Pull the four frames back out of dict1 (keys 4, 3, 2, 1).
conf_cases_df, death_cases_df, twitter, vaccination_df = (
    dict1[key] for key in (4, 3, 2, 1)
)

In [None]:
conf_cases_df.head(5)

In [None]:
death_cases_df.head(5)

# EDA - Visualizations -> Geographical Analytics Checkpoint

In [None]:
# folium map depicting the number of confirmed COVID cases all over the world
latest_col = '3/9/23'  # last date column used throughout this notebook

# Centre the map on the mean coordinates of the plotted rows
m = folium.Map(location=[conf_cases_df['Lat'].mean(), conf_cases_df['Long'].mean()], zoom_start=4)

for _, row in conf_cases_df.iterrows():
    folium.CircleMarker(
        location=[row['Lat'], row['Long']],
        # one unit of radius per million confirmed cases
        radius=row[latest_col] / 1000000,
        color='Green',
        fill=True,
        fill_opacity=0.15,
        # Fixed: the rows come from conf_cases_df, so the figure is confirmed
        # cases, not deaths as the popup previously claimed.
        popup=f"{row['Country/Region']}: {row[latest_col]} confirmed cases",
    ).add_to(m)

# File name kept unchanged so any existing reference to the saved map still resolves
m.save('covid_deaths_map.html')

IFrame(src='./covid_deaths_map.html', width=1700, height=600)

In [None]:
# Country wise color distributed death count representation over the world
fig = px.choropleth(death_cases_df, locations="Country/Region", locationmode = "country names",
                    color="3/9/23",
                    hover_name="Country/Region",
                    scope = 'world',
                    template = 'plotly_dark',
                    color_continuous_scale="Blues")
fig.show()

In [None]:
# Country wise color distributed death count representation over asia
fig = px.choropleth(death_cases_df, locations="Country/Region", locationmode = "country names",
                    color="3/9/23",
                    hover_name="Country/Region",
                    scope = 'asia',
                    template = 'plotly_dark',
                    color_continuous_scale="Viridis")
fig.show()

In [None]:
# Country wise color distributed death count representation over Europe

fig = px.choropleth(death_cases_df, locations="Country/Region", locationmode = "country names",
                    color="3/9/23",
                    hover_name="Country/Region",
                    scope = 'europe',
                    template = 'plotly_dark',
                    color_continuous_scale="Blues")
fig.show()

In [None]:
# Country wise color distributed death count representation over North America

fig = px.choropleth(death_cases_df, locations="Country/Region", locationmode = "country names",
                    color="3/9/23",
                    hover_name="Country/Region",
                    scope = 'north america',
                    template = 'plotly_dark',
                    color_continuous_scale="Greens")
fig.show()

In [None]:
# Country wise color distributed death count representation over South America

fig = px.choropleth(death_cases_df, locations="Country/Region", locationmode = "country names",
                    color="3/9/23",
                    hover_name="Country/Region",
                    scope = 'south america',
                    template = 'plotly_dark',
                    color_continuous_scale="Greens")
fig.show()

## Dataframe Manipulation
> Here we want to show the increase in deaths over time for each country with an animated Plotly chart. The data cannot be animated as-is, because the dates are stored as column names, while Plotly expects the date to be a value in a column of the dataframe. So we first need to move the dates into a column of their own.
>> We also need to keep the size of the data in mind. There are more than a thousand date columns, ranging from 2020 to 2023, so melting the dataframe over every single date would produce a very large dataset that is difficult to work with. Since our goal is only to show how deaths grow over time, skipping some of the dates is a reasonable trade-off.

In [None]:
# Checking the dates in the columns here once for idea
list(death_cases_df.columns[4:15]),len(list(death_cases_df.columns[4:]))

**So the count of date elements present in our dataframe is *1143***
> **That means 1143 days**

In [None]:
# Imported under an alias on purpose: a later cell runs `import datetime`,
# which rebinds the bare name `datetime` to the module. With the previous
# `from datetime import datetime`, re-running this cell after that one failed
# because the module has no `strptime` attribute.
import datetime as _dt

# Parse every date column header (m/d/yy) into a datetime object
date_objects = [_dt.datetime.strptime(date_str, '%m/%d/%y') for date_str in death_cases_df.columns[4:]]

In [None]:
len(date_objects),len(death_cases_df)

> ### Headnote for the upcoming task
To animate the scatter plot over time with the deaths accumulated up to each date, we need to reshape the dataset a little. We do not change the actual data; instead we make a copy of the dataframe and turn the date columns into rows.

In [None]:
# Earlier we built the date objects to find out how many dates we are working with. Since
# that is a lot of data, we cut the dates down by keeping only every 10th date (a 10-day gap).

In [None]:
len(date_objects[::10])

In [None]:
animation_death_df = pd.melt(death_cases_df, id_vars=['Province/State','Country/Region','Lat','Long'], var_name='Date', value_name='Deaths')

In [None]:
animation_death_df[animation_death_df['Country/Region']=='Afghanistan'].head(5)

In [None]:
columns_to_take = list(death_cases_df.columns[:4])
for i in list(death_cases_df.columns[4::]):
    columns_to_take.append(i)
columns_to_take[:10]

In [None]:
for_animation = death_cases_df[columns_to_take].copy()
for_animation['Total Death'] = death_cases_df['3/9/23']
for_animation.head(5)

In [None]:
# Here we remove all the columns except the columns that have the dates
new_columns_to_take = list(for_animation.columns)
new_columns_to_take = new_columns_to_take[4:]
new_columns_to_take[:10]

In [None]:
# for_animation = for_animation.drop('Province/State',axis=1)
agg_dict = {col:'sum' for col in new_columns_to_take}
agg_dict['Lat'] = 'first'
agg_dict['Long'] = 'first'
for_animation = for_animation.groupby('Country/Region').agg(agg_dict).reset_index()
for_animation.head()

In [None]:
new_columns_to_take = list(for_animation.columns[:1])
for i in list(for_animation.columns[1:-3:10]):
    new_columns_to_take.append(i)
new_columns_to_take.append('Lat')
new_columns_to_take.append('Long')
new_columns_to_take.append('Total Death')
print(new_columns_to_take[1:4],new_columns_to_take[-2:],len(new_columns_to_take))
for_animation = for_animation[new_columns_to_take]
for_animation.head(5)

In [None]:
# Wide -> long: one row per (country, date) with the cumulative deaths on that date
animation_death_df = pd.melt(for_animation, id_vars=['Country/Region','Lat','Long','Total Death'], var_name='Date', value_name='Deaths')
# The column headers are m/d/yy strings; pass the format explicitly so the
# parse never depends on pandas' format inference or day-first guessing.
animation_death_df['Date'] = pd.to_datetime(animation_death_df['Date'], format='%m/%d/%y').dt.date
animation_death_df = animation_death_df.sort_values(by=['Country/Region','Date'])

animation_death_df.head(5)

Well our Dataset is now ready! Lets head over to the animation code!
### Scatter plot over time showing the increasing death count.

In [None]:
px.scatter(data_frame=animation_death_df,
          x = 'Deaths',
          y = 'Total Death',
          size='Deaths',
          color='Country/Region',
          title='COVID19 Global Deaths Analytics 2020-2023',
          labels={'Deaths':'Total Deaths till Date',
                  'Lat':'Latitude'},
          log_x=False,log_y=True,
          hover_name = 'Country/Region',
          animation_frame='Date',
          range_x=[-500,1300000],
          range_y=[40000,2400000],
          size_max=20*3)

In [None]:
animation_death_df.head(5)

## Further EDA for Modelling
Ok now after some of the visualizations made above we now need to actually look through the datasets and form the connections for future tasks. Let's do that now

In [None]:
columns_confirmed = conf_cases_df.keys()[4:]
columns_deaths = death_cases_df.keys()[4:]

columns_confirmed, columns_deaths

In [None]:
conf_dates_df = conf_cases_df[columns_confirmed].copy()
death_dates_df = death_cases_df[columns_deaths].copy()
death_dates_df.head(5)

In [None]:
# Ok so as counted before we should have 1143 dates but lets check again
total_days = len(columns_confirmed)

In [None]:
conf_keys = conf_dates_df.keys()
death_keys = death_dates_df.keys()

global_cases = []
total_deaths = []
fatality_rate = []

for i in range(len(columns_confirmed)):
    conf_sum = conf_dates_df[conf_keys[i]].sum()
    death_sum = death_dates_df[death_keys[i]].sum()
    
    global_cases.append(conf_sum)
    total_deaths.append(death_sum)
    
    # fatality rate is calculated here
    fatality_rate.append(death_sum/conf_sum)


In [None]:
# ok now we check the increase in death on a daily basis and the average
def regular_increment(data):
    """Return the step-to-step increments of a sequence.

    The first element is passed through unchanged; every later element is
    replaced by its difference from the element before it.
    """
    increments = []
    for position in range(len(data)):
        if position == 0:
            increments.append(data[position])
        else:
            increments.append(data[position] - data[position - 1])
    return increments

def increment_average(data, span):
    """Return, for each position, the mean of the window starting there.

    The window covers ``data[i:i+span]``; near the end of the sequence, where
    fewer than ``span`` elements remain, it is cut off at the last element.
    """
    total = len(data)
    averages = []
    for start in range(total):
        # Never let the window run past the end of the data
        stop = min(start + span, total)
        averages.append(np.mean(data[start:stop]))
    return averages


In [None]:
# now lets grab the everyday increase in cases and the continuously increasing average to see how fast the increase is

# confirmed cases
span = 7
global_daily_increment = regular_increment(global_cases)
global_conf_avg = increment_average(global_cases, span)
global_daily_increment_avg = increment_average(global_daily_increment, span)

# deaths
global_daily_death = regular_increment(total_deaths)
global_death_avg = increment_average(total_deaths, span)
global_daily_death_avg = increment_average(global_daily_death, span)

In [None]:
no_of_days = np.array([i for i in range(len(conf_keys))]).reshape(-1, 1)
type(no_of_days[0][0])

In [None]:
global_cases = np.array(global_cases).reshape(-1, 1)
total_deaths = np.array(total_deaths).reshape(-1, 1)

In [None]:
future_target_days = 10
forcasting_days = np.arange(len(conf_keys) + future_target_days).reshape(-1, 1)
current_days = forcasting_days[:-future_target_days]

In [None]:
current_days,forcasting_days

In [None]:
import datetime

start = '1/22/2020'
start_date = datetime.datetime.strptime(start, '%m/%d/%Y')
forcasting_dates = [(start_date + datetime.timedelta(days=i)).strftime('%m/%d/%Y') for i in range(len(forcasting_days))]

In [None]:
# Lets check if we got the future days beyond the current timeline of our dataset has been created correctly or not
forcasting_dates[-10:]

In [None]:
def to_flatten(arr):
    """Return the first element of every row of ``arr`` as a plain list.

    ``arr`` must provide ``tolist()`` yielding indexable rows.
    """
    flattened = []
    for row in arr.tolist():
        flattened.append(row[0])
    return flattened

In [None]:
current_days = current_days.reshape(1, -1)[0]
current_days.shape

In [None]:
current_days = current_days.reshape(1, -1)[0]
sns.set_theme('paper')
plt.figure(figsize=(16, 8))
plt.plot(current_days, global_cases)
plt.plot(current_days, global_conf_avg, linestyle='dashed', color='orange')
plt.title('Coronavirus Cases Over Time', size=30)
plt.xlabel('Since Jan 2020', size=30)
plt.ylabel('No of Cases', size=30)
plt.legend(['Global Coronavirus Cases', 'Increment Average {} Days'.format(span)], prop={'size': 20})
plt.xticks(size=20)
plt.yticks(size=20)
plt.show()

plt.figure(figsize=(16, 8))
plt.plot(current_days, total_deaths)
plt.plot(current_days, global_death_avg, linestyle='dashed', color='orange')
plt.title('Coronavirus Deaths Over Time', size=30)
plt.xlabel('Since Jan 2020', size=30)
plt.ylabel('No of Cases', size=30)
plt.legend(['Global Coronavirus Deaths', 'Increment Average {} Days'.format(span)], prop={'size': 20})
plt.xticks(size=20)
plt.yticks(size=20)
plt.show()

In [None]:
import matplotlib.pyplot as plt

span = 7

# Set the style to 'classic' to remove Seaborn styling
plt.style.use('classic')

plt.figure(figsize=(16, 10))

# Bar plot for World Daily Increases in Confirmed Cases
plt.bar(current_days, global_daily_increment, color='black')
plt.plot(current_days, global_daily_increment_avg, color='yellow', linestyle='dashed',linewidth=2.5)
plt.title('Global Daily Increases in Confirmed Cases', size=30)
plt.xlabel('Since Jan 2020', size=30)
plt.ylabel('No. of Cases', size=30)
plt.legend(['Moving Average {} Days'.format(span), 'World Daily Increase in COVID-19 Cases'], prop={'size': 20})
plt.xticks(size=20)
plt.yticks(size=20)
plt.show()

# Bar plot for World Daily Increases in Confirmed Deaths
plt.figure(figsize=(16, 10))
plt.bar(current_days, global_daily_death, color='black')
plt.plot(current_days, global_daily_death_avg, color='yellow', linestyle='dashed',linewidth=2.5)
plt.title('Global Daily Increases in Confirmed Deaths', size=30)
plt.xlabel('Since Jan 2020', size=30)
plt.ylabel('No. of Cases', size=30)
plt.legend(['Moving Average {} Days'.format(span), 'World Daily Increase in COVID-19 Deaths'], prop={'size': 20})
plt.xticks(size=20)
plt.yticks(size=20)
plt.show()


In [None]:
def country_info(country):
    """Return the per-date confirmed and death totals for one country.

    Reads the notebook-level ``conf_cases_df``, ``death_cases_df``,
    ``conf_keys``, ``death_keys`` and ``total_days``.

    Parameters
    ----------
    country : str
        Value to match against the ``Country/Region`` column.

    Returns
    -------
    tuple of (list, list)
        ``(conf, deaths)``: one total per date column, summed over all rows
        (provinces/states) belonging to the country.
    """
    # Filter each frame once and total all date columns in a single pass,
    # instead of re-filtering the whole frame for every date.
    conf_rows = conf_cases_df[conf_cases_df['Country/Region'] == country]
    death_rows = death_cases_df[death_cases_df['Country/Region'] == country]
    conf = conf_rows[conf_keys[:total_days]].sum().tolist()
    deaths = death_rows[death_keys[:total_days]].sum().tolist()
    return conf, deaths
    
    
def country_visualizations(country):
    """Plot the confirmed cases and the daily increases for one country.

    Parameters
    ----------
    country : str
        Country name as it appears in the ``Country/Region`` column.
    """
    # The result must not be stored in a local called `country_info`: that
    # would make the name local to this function and the call itself would
    # raise UnboundLocalError.
    country_conf, country_deaths = country_info(country)

    # `regular_increment` is the per-day difference helper defined in this
    # notebook; the `daily_increase` name used here before is not defined.
    country_daily_increment = regular_increment(country_conf)
    country_daily_death = regular_increment(country_deaths)

    # NOTE(review): `country_plot` is not defined in the visible part of the
    # notebook - confirm it exists before calling this function.
    country_plot(current_days, country_conf, country_daily_increment, country_daily_death, country)

In [None]:
def plot_country_data(data, countries, graph_name):
    """Draw one comparison figure per entry of ``graph_name``.

    For the figure at position ``k``, one line is drawn per country using
    ``data[country][k]``, and the entry of ``graph_name`` is used as the title.
    """
    sns.set(style="darkgrid")
    label_size = 20
    for series_idx, figure_title in enumerate(graph_name):
        plt.figure(figsize=(12, 8))
        # One line per country, in the order given (the legend relies on it)
        for name in countries:
            country_series = data[name][series_idx]
            plt.plot(country_series)
        plt.title(figure_title, size=30)
        plt.xlabel('Since Jan 2020', size=label_size)
        plt.ylabel('No of Cases', size=label_size)
        plt.legend(countries, prop={'size': label_size})
        plt.xticks(size=label_size)
        plt.yticks(size=label_size)
        plt.show()

# Example usage:
compare_countries = ['India', 'US', 'Brazil', 'Russia', 'United Kingdom', 'France']
graph_names = ['Coronavirus Confirmed Cases', 'Coronavirus Confirmed Deaths']

country_data = {country: country_info(country) for country in compare_countries}

plot_country_data(country_data, compare_countries, graph_names)


In [None]:
unique_countries =  list(conf_cases_df['Country/Region'].unique())
unique_countries

In [None]:
import operator

# Parallel lists: position k in each list refers to the same country
country_conf_cases = []
country_death_cases = []
country_active_cases = []  # never filled in this cell
country_mortality_rate = []

# Latest totals per country; countries with zero confirmed cases are collected separately
no_cases = []
for i in unique_countries:
    cases_conf = conf_cases_df[conf_cases_df['Country/Region']==i]['3/9/23'].sum()
    cases_death = death_cases_df[death_cases_df['Country/Region']==i]['3/9/23'].sum()
    if cases_conf > 0:
        country_conf_cases.append(cases_conf)
        country_death_cases.append(cases_death)
    else:
        no_cases.append(i)

# Remove the zero-case countries so unique_countries lines up with the two lists above
for i in no_cases:
    unique_countries.remove(i)

# sort countries by the number of confirmed cases (largest first)
unique_countries = [k for k, v in sorted(zip(unique_countries, country_conf_cases), key=operator.itemgetter(1), reverse=True)]
# Only the names were re-ordered above, so recompute the totals in the new order
# and derive the mortality rate (deaths / confirmed) alongside them
for i in range(len(unique_countries)):
    country_conf_cases[i] = conf_cases_df[conf_cases_df['Country/Region']==unique_countries[i]]['3/9/23'].sum()
    country_death_cases[i] = (death_cases_df[death_cases_df['Country/Region']==unique_countries[i]]['3/9/23'].sum())
    country_mortality_rate.append(country_death_cases[i]/country_conf_cases[i])

In [None]:
# Keep the first ten countries individually and fold everything after them
# into a single 'Others' entry.
others_conf = np.sum(country_conf_cases[10:])
others_death = np.sum(country_death_cases[10:])

top_unique_countries = list(unique_countries[:10]) + ['Others']
top_conf_cases = list(country_conf_cases[:10]) + [others_conf]
top_death_cases = list(country_death_cases[:10]) + [others_death]

In [None]:
# One row per pie slice. The previous pd.DataFrame(names, cases) call passed
# the case counts as the *index* argument rather than as a data column.
conf_share_df = pd.DataFrame({'Country': top_unique_countries,
                              'Confirmed': top_conf_cases})
fig = px.pie(conf_share_df,
             names='Country',
             values='Confirmed',
             color_discrete_sequence=px.colors.sequential.Brwnyl_r,
             title="Covid19 Cases Across Countries")
fig.update_layout(title_x=0.5)

In [None]:
# One row per pie slice. The previous pd.DataFrame(names, deaths) call passed
# the death counts as the *index* argument rather than as a data column.
death_share_df = pd.DataFrame({'Country': top_unique_countries,
                               'Deaths': top_death_cases})
fig = px.pie(death_share_df,
             names='Country',
             values='Deaths',
             color_discrete_sequence=px.colors.sequential.Brwnyl_r,
             title="Covid19 Deaths across Countries")
fig.update_layout(title_x=0.5)

# More Global Json File Visualizations

In [None]:
vaccination_df

In [None]:
vaccination_df.rename(columns={'CountryName':'Country/Region'},inplace=True)

In [None]:
len(vaccination_df['Country/Region'].unique())

In [None]:
merged_df = pd.merge(vaccination_df, death_cases_df, on='Country/Region')
merged_df


In [None]:
# Bar height is seaborn's default aggregate (the mean) of the latest
# cumulative death column over the merged rows of each region.
plt.figure(figsize=(14, 8))
sns.barplot(x='Region', y='3/9/23', data=merged_df, palette='viridis')
# Fixed: the chart plots deaths, not population, so the title now says so.
plt.title('Death Count by Region')
plt.xlabel('Region')
plt.ylabel('Death Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Assuming your DataFrame is named df
# If not, replace df with your actual DataFrame variable

# Count the merged rows per (Region, HasVaccine) pair. fill_value=0 matters:
# a region with no rows for one status would otherwise get NaN, and that NaN
# would propagate into the running `bottom` and blank out the stacked bars.
vaccine_data = merged_df.groupby(['Region', 'HasVaccine']).size().unstack(fill_value=0)

# Set the figure size
plt.figure(figsize=(10, 6))

# One bar position per region
num_regions = len(vaccine_data)
bar_positions = np.arange(num_regions)

# Stack one layer per vaccine status; `bottom` is the height reached so far
bottom = np.zeros(num_regions)
for vaccine_status in vaccine_data.columns:
    heights = vaccine_data[vaccine_status].to_numpy()
    plt.bar(bar_positions, heights, label=str(vaccine_status), bottom=bottom)
    # Build a new array rather than adding in place, so the values already
    # handed to plt.bar are never modified afterwards.
    bottom = bottom + heights

# Add labels and title
plt.title('Vaccine Status by Region')
plt.xlabel('Region')
plt.ylabel('Count')

# Rotate x-axis labels for better readability
plt.xticks(bar_positions, vaccine_data.index, rotation=45, ha='right')

plt.legend()

# Show the plot
plt.show()


# Modelling

In [None]:
# For the models to identify the patterns let's modify the data accordingly
hop_days = 926
x_train_conf, x_test_conf, y_train_conf, y_test_conf = train_test_split(no_of_days[hop_days:-50], 
                                                                        global_cases[hop_days:-50],
                                                                        test_size=0.07,
                                                                        shuffle=False)

In [None]:
conf_cases_df.columns[926:-53]

OK, now we have the dataset divided into a training set and a test set. We now use a pipeline to train both models and use **named_steps** to access the transformed features. We then use the transformed features fetched through the pipeline to calculate the **Mean Squared Error (MSE)**.

# Model Training - Randomized Search CV

In [None]:
# First we transform our data for polynomial regression.
# The transformer is fitted on the training days only and then re-used, so the
# test and forecast features come from the same fitted transformer.
poly_reg = PolynomialFeatures(degree=3)
bayesian_reg_x_train_conf = poly_reg.fit_transform(x_train_conf)
bayesian_reg_x_test_conf = poly_reg.transform(x_test_conf)
bayesian_reg_forcasting_days = poly_reg.transform(forcasting_days)

# Candidate values for each BayesianRidge hyper-parameter
tol = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2]
alpha_1 = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
alpha_2 = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
lambda_1 = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
lambda_2 = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
normalize = [True, False]  # kept for compatibility; not part of the search grid

bayesian_grid = {'tol': tol, 'alpha_1': alpha_1, 'alpha_2' : alpha_2,
                 'lambda_1': lambda_1, 'lambda_2' : lambda_2}

bayesian_model = BayesianRidge(fit_intercept=True)
bayesian_search = RandomizedSearchCV(bayesian_model,
                                     bayesian_grid,
                                     scoring='neg_mean_squared_error',
                                     cv=3,
                                     return_train_score=True,
                                     n_jobs=-1,
                                     n_iter=40, verbose=1,
                                     # fixed seed: the sampled candidates (and so
                                     # best_params_) are the same on every run
                                     random_state=42)
# The target is a column vector of shape (n, 1); the estimator expects 1-D y
bayesian_search.fit(bayesian_reg_x_train_conf, y_train_conf.ravel())

In [None]:
bayesian_search.best_params_

### Model Prediction
OK, let's do the prediction now with the models we have created. First we do the estimation with the model that was found using RandomizedSearchCV.

In [None]:
best_model = bayesian_search.best_estimator_
test_bayesian_prediction = best_model.predict(bayesian_reg_x_test_conf)
bayesian_forcasting_pred = best_model.predict(bayesian_reg_forcasting_days)
print('MAE:', mean_absolute_error(test_bayesian_prediction, y_test_conf))
print('MSE:',mean_squared_error(test_bayesian_prediction, y_test_conf))

In [None]:
plt.plot(y_test_conf)
plt.plot(test_bayesian_prediction)
plt.legend(['Test Data', 'Bayesian Ridge Polynomial Predictions'])

In [None]:
svm_confirmed = SVR(shrinking=True, kernel='poly',gamma=0.01, epsilon=1,degree=3, C=0.1)
svm_confirmed.fit(x_train_conf, y_train_conf)
svm_pred = svm_confirmed.predict(forcasting_days)

svm_test_pred = svm_confirmed.predict(x_test_conf)
plt.plot(y_test_conf)
plt.plot(svm_test_pred)
plt.legend(['Test Data', 'SVM Predictions'])
print('MAE:', mean_absolute_error(svm_test_pred, y_test_conf))
print('MSE:',mean_squared_error(svm_test_pred, y_test_conf))

In [None]:
def plot_predictions(x, y, pred, algo_name, color):
    """Plot the observed series together with a model's forecast.

    ``x``/``y`` are drawn as a solid line; ``pred`` is drawn as a dashed line
    in ``color`` against the notebook-level ``forcasting_days``. ``algo_name``
    is the legend label of the forecast line.
    """
    heading_size, tick_size = 30, 20

    plt.figure(figsize=(12, 8))
    # Observed data first, forecast second: the legend labels follow this order
    plt.plot(x, y)
    plt.plot(forcasting_days, pred, color=color, linestyle='dashed')
    plt.legend(['Confirmed Cases', algo_name], prop={'size': tick_size})

    plt.title('Global Coronavirus Cases Over Time', size=heading_size)
    plt.xlabel('Since Jan 2020', size=heading_size)
    plt.ylabel('No. of Cases', size=heading_size)
    plt.xticks(size=tick_size)
    plt.yticks(size=tick_size)
    plt.show()

In [None]:
plot_predictions(current_days, global_cases, svm_pred, 'SVM Predictions', 'purple')

In [None]:
plot_predictions(current_days, global_cases, bayesian_forcasting_pred, 'Bayesian Ridge Regression Predictions', 'green')

In [None]:
svm_df = pd.DataFrame({'Date': forcasting_dates[-10:],
                       'SVM Predictions - No. of Global Confirmed Cases': np.round(svm_pred[-10:])})
svm_df.style.background_gradient(cmap='Greens')

In [None]:
bayesian_df = pd.DataFrame({'Date': forcasting_dates[-10:], 
                            'Bayesian Ridge Predictions - No of Confirmed Cases': np.round(bayesian_forcasting_pred[-10:])})
bayesian_df.style.background_gradient(cmap='Greens')

# Twitter Data Manipulation and Visualizations

In [None]:
# Now lets look at our twitter data
twitter_df = twitter[:50000].copy()

In [None]:
filtered_text = ",".join(review for review in twitter_df.text if 'COVID' not in review and 'https' not in review and 'Covid' not in review)
wordcloud = WordCloud(max_words=200, colormap='Set3', background_color="black",width=1600, height=800).generate(filtered_text)

plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
print('Count of covid19 tweets: {}\n'.format(twitter_df.shape[0]))
print('Unique user posts: {}\n'.format(twitter_df.user_name.nunique()))
print('Unique location posts: {}\n'.format(twitter_df.user_location.nunique()))
print('Users with more than 1 million followers(higher chances of spread): {}\n'.format(twitter_df[twitter_df['user_followers']>1000000].user_name.nunique()))

### Description of Data

In [None]:
twitter_df.describe()

In [None]:
plt.rcParams['figure.figsize'] = 8, 5
plt.style.use("fivethirtyeight")
# From here on, DataFrame.plot / Series.plot return plotly figures
pd.options.plotting.backend = "plotly"

# After reset_index(), column 'index' holds the column names and column 0 the
# number of missing values in each of them.
fig = twitter_df.isnull().sum().reset_index().plot(kind='bar', x=0, y='index',color='index',color_discrete_sequence=px.colors.sequential.Aggrnyl)
# Fixed the misspelt chart title ("Mising")
fig.update_layout(title='Missing Values Plot', xaxis_title='Count', yaxis_title='Column Names')
fig.show()

From the above data we can tell that user location and user description wouldn't be as useful in deciding the sentiments of the tweets so no need to handle them.

In [None]:
fig = px.box(twitter_df, y="user_followers", color="user_verified",
                   title="User Followers Distribution")
fig.show()

# Twitter Data Cleaning
Ok now do some cleaning of our unstructured tweet data

In [None]:
import string

# Patterns are raw strings (so `\s`, `\w`, `\d`, ... are not treated as invalid
# string escapes) and are compiled once instead of on every call.
# HTML-like tags, URLs, [bracketed] spans and any word containing a digit.
_NOISE_PATTERN = re.compile(r'<.*?>|https?://\S+|www\.\S+|\[.*?\]|\w*\d\w*')
# Any single ASCII punctuation character.
_PUNCTUATION_PATTERN = re.compile('[%s]' % re.escape(string.punctuation))
# Runs of whitespace.
_WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_text(text):
    """Normalise one tweet into lower-case text without noise or punctuation.

    Steps, in order: lower-case the text; remove tags, URLs, bracketed spans and
    words containing digits; remove ASCII punctuation; collapse whitespace runs to
    a single space and strip both ends.

    Args:
        text: The raw tweet as a string.

    Returns:
        The cleaned string (possibly empty).
    """
    lowered = text.lower()
    without_noise = _NOISE_PATTERN.sub('', lowered)
    without_punctuation = _PUNCTUATION_PATTERN.sub('', without_noise)
    return _WHITESPACE_PATTERN.sub(' ', without_punctuation).strip()


In [None]:
# Build the tokenizer once instead of once per tweet. clean_text lower-cases its
# input itself, so no separate .lower() call is needed here.
word_tokenizer = RegexpTokenizer(r'\w+')
# Replace each tweet with its cleaned word tokens joined by single spaces.
twitter_df['text'] = twitter_df['text'].apply(lambda tweet: ' '.join(word_tokenizer.tokenize(clean_text(tweet))))


In [None]:
# Number of comma-separated pieces in each `hashtags` string; missing values become 0.
hashtags_per_tweet = twitter_df['hashtags'].str.split(',').str.len()
twitter_df['hashtag_count'] = hashtags_per_tweet.fillna(0.0)

# Bar chart of the seven most frequent hashtag counts.
top_hashtag_counts = twitter_df['hashtag_count'].value_counts().reset_index().head(7)
fig = top_hashtag_counts.plot(kind='bar', x='hashtag_count', y='count', color='hashtag_count',
                              color_continuous_scale=px.colors.sequential.Viridis)
fig.update_layout(title='Hashtag Count Distribution', xaxis_title='Hashtag Counts', yaxis_title='')
fig.show()

### Let's explore the whole dataset now

In [None]:
# Histogram of the number of whitespace-separated words in each tweet.
words_per_tweet = twitter_df['text'].str.split().str.len()
fig = words_per_tweet.plot(kind='hist')
histogram_title = dict(text="Word Count Distribution", font=dict(size=35))
fig.update_layout(title=histogram_title, xaxis_title='Word Count', yaxis_title='')
fig.show()

In [None]:

def show_top_few_words(corpus, n=None):
    """Return the most frequent words of `corpus`, using CountVectorizer defaults.

    Args:
        corpus: Iterable of text documents.
        n: Number of (word, count) pairs to return; None returns all of them.

    Returns:
        A list of (word, total count) tuples sorted by count, highest first.
    """
    # Fit a single vectorizer and reuse it for both the counts and the vocabulary,
    # instead of fitting the whole corpus twice.
    vectorizer = CountVectorizer()
    counts_per_word = vectorizer.fit_transform(corpus).sum(axis=0)
    words_frequency = [(word, counts_per_word[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
    return sorted(words_frequency, key=lambda pair: pair[1], reverse=True)[:n]

# Fifteen most frequent words, as returned by show_top_few_words.
common_words = show_top_few_words(twitter_df['text'], 15)
df1 = pd.DataFrame(common_words, columns=['text', 'count'])

bar_title = 'Before Removing Stop Words'
fig = px.bar(
    df1,
    x='text',
    y='count',
    title=bar_title,
    labels={'count': 'Count'},
    color_discrete_sequence=px.colors.sequential.Viridis,
)
# Re-apply the title with a larger font.
fig.update_layout(title=dict(text=bar_title, font=dict(size=30)))
fig.show()


In [None]:
def get_top_n_words(corpus, n=None):
    """Return the most frequent words of `corpus` with English stop words removed.

    Args:
        corpus: Iterable of text documents.
        n: Number of (word, count) pairs to return; None returns all of them.

    Returns:
        A list of (word, total count) tuples sorted by count, highest first.
    """
    vectorizer = CountVectorizer(stop_words='english')
    vectorizer.fit(corpus)
    # Column sums of the document-term matrix give the total count per word.
    totals = vectorizer.transform(corpus).sum(axis=0)

    ranked = []
    for term, column in vectorizer.vocabulary_.items():
        ranked.append((term, totals[0, column]))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

# Fifteen most frequent words once English stop words are excluded.
common_words = get_top_n_words(twitter_df['text'], 15)

df1 = pd.DataFrame(common_words, columns=['text', 'count'])
bar_title = 'After Removing Stop Words'
fig = px.bar(
    df1,
    x='text',
    y='count',
    title=bar_title,
    labels={'count': 'Count'},
    color_discrete_sequence=px.colors.sequential.Viridis,
)
# Re-apply the title with a larger font.
fig.update_layout(title=dict(text=bar_title, font=dict(size=30)))
fig.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def with_bigram(corpus, n=None):
    """Return the most frequent bigrams of `corpus`, stop words included.

    Args:
        corpus: Iterable of text documents.
        n: Number of (bigram, count) pairs to return; None returns all of them.

    Returns:
        A list of (bigram, total count) tuples sorted by count, highest first.
    """
    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
    bigram_vectorizer.fit(corpus)
    document_term = bigram_vectorizer.transform(corpus)
    # Flatten the 1 x vocabulary column-sum matrix into a plain 1-D array of totals.
    totals = np.asarray(document_term.sum(axis=0)).ravel()

    pairs = [(bigram, totals[position])
             for bigram, position in bigram_vectorizer.vocabulary_.items()]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:n]

# Twenty most frequent bigrams, stop words included.
common_words = with_bigram(twitter_df['text'], 20)

df1 = pd.DataFrame(common_words, columns=['text', 'count'])
fig = px.bar(
    df1,
    y='text',
    x='count',
    color='count',
    color_continuous_scale=px.colors.sequential.Viridis,
)
# Axis label and title (with a larger font) applied in a single layout update.
fig.update_layout(
    yaxis_title='bigrams',
    title=dict(text="Top 20 bigrams before removing stop words", font=dict(size=30)),
)
fig.show()


In [None]:
def without_bigram(corpus, n=None):
    """Return the most frequent bigrams of `corpus` with English stop words removed.

    Args:
        corpus: Iterable of text documents.
        n: Number of (bigram, count) pairs to return; None returns all of them.

    Returns:
        A list of (bigram, total count) tuples sorted by count, highest first.
    """
    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    bigram_vectorizer.fit(corpus)
    # Column sums of the document-term matrix give the total count per bigram.
    totals = bigram_vectorizer.transform(corpus).sum(axis=0)

    ranked = []
    for bigram, column in bigram_vectorizer.vocabulary_.items():
        ranked.append((bigram, totals[0, column]))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

# Twenty most frequent bigrams once English stop words are excluded.
common_words = without_bigram(twitter_df['text'], 20)

df1 = pd.DataFrame(common_words, columns=['text', 'count'])
fig = px.bar(
    df1,
    y='text',
    x='count',
    color='count',
    color_continuous_scale=px.colors.sequential.Viridis,
)
# Axis label and title (with a larger font) applied in a single layout update.
fig.update_layout(
    yaxis_title='bigrams',
    title=dict(text="Top 20 bigrams after removing stop words", font=dict(size=30)),
)
fig.show()


Now we perform sentiment analysis on the tweets using a sentiment analysis model.

In [None]:
# Analyzer instance shared by every call to sentiment_score.
model = SentimentIntensityAnalyzer()


def sentiment_score(txt):
    """Return the 'compound' value of the analyzer's polarity scores for `txt`."""
    polarity = model.polarity_scores(txt)
    return polarity['compound']


# Score every tweet and store the result in a new column.
twitter_df["sentiment_score"] = twitter_df["text"].map(sentiment_score)

In [None]:
# Peek at the first five sentiment scores.
twitter_df['sentiment_score'].head(5)

In [None]:
# Violin plot of the sentiment scores, split by `user_verified`.
violin_title = "Sentiment Score Distribution"
fig = px.violin(twitter_df, y="sentiment_score", color="user_verified", title=violin_title)
# Re-apply the title with a larger font.
fig.update_layout(title=dict(text=violin_title, font=dict(size=30)))

fig.show()

The data looks quite evenly distributed

In [None]:
# Tweets whose sentiment score is above 0.5.
dummy_df = twitter_df[twitter_df['sentiment_score'] > 0.5]

# Ten most frequent `user_location` values among those tweets.
top_locations = dummy_df['user_location'].value_counts().reset_index().head(10)
fig = top_locations.plot(kind='bar', y='count', x='user_location', color='user_location')
# Title (spelling of "origin" corrected) and axis labels are set in one layout update.
fig.update_layout(
    title=dict(text="Most positive Tweets origin Countries", font=dict(size=30)),
    xaxis_title='Location',
    yaxis_title='',
)
fig.show()

In [None]:
# Preview the first rows rather than rendering the whole frame.
twitter_df.head()

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
# Seed NumPy's global random state so stochastic steps give a consistent output.
np.random.seed(0)
# TF-IDF vectorizer that ignores English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
# This matrix has been normalized under default settings
tfidf_matrix = vectorizer.fit_transform(twitter_df['text'])

In [None]:
# TruncatedSVD is stochastic, so NumPy's global random state is seeded first for a consistent output.
np.random.seed(0)
# Reduce the TF-IDF matrix to 100 components.
lsa_reducer = TruncatedSVD(n_components=100)
lsa_matrix = lsa_reducer.fit_transform(tfidf_matrix)

In [None]:
# Local import, following the in-cell imports already used for TruncatedSVD and KMeans.
from sklearn.preprocessing import normalize

# One seed for every stochastic step in this cell. It is passed explicitly to the
# estimators so the result does not depend on the state of NumPy's global generator.
CLUSTER_SEED = 0
np.random.seed(CLUSTER_SEED)

# Assuming twitter_df['text'] is a column containing text data
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(twitter_df['text'])

# Perform Truncated SVD
lsa_matrix = TruncatedSVD(n_components=100, random_state=CLUSTER_SEED).fit_transform(tfidf_matrix)

# Normalize the matrix: SVD output rows are not unit length, so rescale each row
# before the Euclidean-distance based KMeans step.
lsa_matrix = normalize(lsa_matrix)

# Apply KMeans clustering
clusters = KMeans(n_clusters=20, random_state=CLUSTER_SEED).fit_predict(lsa_matrix)


In [None]:
# Attach each tweet's cluster label to the frame.
twitter_df['cluster'] = clusters
# Clusters are stored as DataFrames for easier analysis: one frame per cluster label.
cluster_groups = []
for _cluster_label, cluster_frame in twitter_df.groupby('cluster'):
    cluster_groups.append(cluster_frame)

In [None]:
# Inspect the first rows of the first cluster group.
cluster_groups[0].head()

In [None]:
def cluster_to_image(df_cluster, max_words=10, tfidf_matrix=tfidf_matrix,
                     vectorizer=vectorizer):
    """Build a word cloud from the highest-ranked TF-IDF words of one cluster.

    Args:
        df_cluster: DataFrame holding the rows of a single cluster.
        max_words: Maximum number of top-ranked words passed to the word cloud.
        tfidf_matrix: TF-IDF matrix; defaults to the global bound when this
            function was defined.
        vectorizer: Fitted vectorizer supplying the feature names; defaults to
            the global bound when this function was defined.

    Returns:
        The object returned by `WordCloud.fit_words` for the selected words.
    """
    # NOTE(review): the frame's index labels are used as row positions of
    # tfidf_matrix — assumes a default 0..n-1 index; confirm against the loader.
    row_ids = df_cluster.index.values
    column_totals = np.asarray(tfidf_matrix[row_ids].sum(axis=0))[0]

    # Words are ranked by their summed TFIDF values.
    ranking = pd.DataFrame({'Word': vectorizer.get_feature_names_out(),
                            'Summed TFIDF': column_totals})
    ranking = ranking.sort_values('Summed TFIDF', ascending=False)

    # Keep the top words, dropping any whose summed score is exactly zero.
    words_to_score = {}
    for word, score in ranking[:max_words].values:
        if score != 0:
            words_to_score[word] = score

    # White background and the custom colour function are used for readability.
    cloud_generator = WordCloud(background_color='white', color_func=_color_func, random_state=1)
    return cloud_generator.fit_words(words_to_score)

def _color_func(*args, **kwargs):
    # This helper function will randomly assign one of 5 easy-to-read colors to each word.
    return np.random.choice(['black', 'blue', 'teal', 'purple', 'brown'])


In [None]:
def plot_wordcloud_grid(cluster_groups, num_rows=5, num_columns=4):
    """Plot one word cloud per cluster in a num_rows x num_columns subplot grid.

    Clusters fill the grid row by row. Grid cells left without a cluster are
    hidden; clusters beyond the grid capacity are not drawn.

    Args:
        cluster_groups: List of per-cluster DataFrames, each with a `cluster` column.
        num_rows: Number of subplot rows.
        num_columns: Number of subplot columns.
    """
    plt.style.use('classic')
    # squeeze=False keeps `axes` two-dimensional even when the grid has a single
    # row or column, so the same indexing works for every grid shape.
    _, axes = plt.subplots(num_rows, num_columns, figsize=(20, 15), squeeze=False)
    flat_axes = axes.ravel()  # row-major order: each row is filled left to right

    for ax, df_cluster in zip(flat_axes, cluster_groups):
        wordcloud_image = cluster_to_image(df_cluster)
        ax.imshow(wordcloud_image, interpolation="bilinear")
        # The title of each subplot contains the cluster id, as well as the cluster size.
        ax.set_title(f"Cluster {df_cluster.cluster.iloc[0]}: {df_cluster.shape[0]}")
        ax.set_xticks([])
        ax.set_yticks([])

    # Hide the grid cells that received no cluster instead of leaving empty frames.
    for ax in flat_axes[len(cluster_groups):]:
        ax.axis('off')

# Draw the word clouds for all cluster groups using the default 5x4 grid.
plot_wordcloud_grid(cluster_groups)

# Render the figure.
plt.show()