## Monkey Pox

In [1]:
## Import libraries

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
%matplotlib inline
import os


### Import Data from git

In [2]:
df_import = pd.read_csv('https://raw.githubusercontent.com/globaldothealth/monkeypox/main/latest.csv')
# df_import = pd.read_csv('monkeypox_data_outdated.csv.csv')
df_import

Unnamed: 0,ID,Status,Location,City,Country,Age,Gender,Date_onset,Date_confirmation,Symptoms,...,Travel_history_country,Genomics_Metadata,Confirmation_method,Source,Source_II,Date_entry,Date_last_modified,Source_III,Source_IV,Country_ISO3
0,1,confirmed,Guy's and St Thomas Hospital London,London,England,,,2022-04-29,2022-05-06,rash,...,Nigeria,West African Clade,RT-PCR,https://www.gov.uk/government/news/monkeypox-c...,https://www.who.int/emergencies/disease-outbre...,2022-05-18,2022-05-18,,,GBR
1,2,confirmed,Guy's and St Thomas Hospital London,London,England,,,2022-05-05,2022-05-12,rash,...,,West African Clade,RT-PCR,https://www.gov.uk/government/news/monkeypox-c...,,2022-05-18,2022-05-18,,,GBR
2,3,confirmed,London,London,England,,,2022-04-30,2022-05-13,vesicular rash,...,,West African Clade,RT-PCR,https://www.gov.uk/government/news/monkeypox-c...,,2022-05-18,2022-05-18,,,GBR
3,4,confirmed,London,London,England,,male,,2022-05-15,vesicular rash,...,,West African Clade,,https://www.gov.uk/government/news/monkeypox-c...,,2022-05-18,2022-05-18,,,GBR
4,5,confirmed,London,London,England,,male,,2022-05-15,vesicular rash,...,,West African Clade,,https://www.gov.uk/government/news/monkeypox-c...,,2022-05-18,2022-05-18,,,GBR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2160,2161,confirmed,Quebec,,Canada,,,,2022-06-14,,...,,,,https://montrealgazette.com/news/local-news/mo...,https://globalnews.ca/news/8919210/quebec-mont...,2022-06-14,2022-06-14,,,CAN
2161,2162,confirmed,Quebec,,Canada,,,,2022-06-14,,...,,,,https://montrealgazette.com/news/local-news/mo...,https://globalnews.ca/news/8919210/quebec-mont...,2022-06-14,2022-06-14,,,CAN
2162,2163,confirmed,Quebec,,Canada,,,,2022-06-14,,...,,,,https://montrealgazette.com/news/local-news/mo...,https://globalnews.ca/news/8919210/quebec-mont...,2022-06-14,2022-06-14,,,CAN
2163,2164,confirmed,Quebec,,Canada,,,,2022-06-14,,...,,,,https://montrealgazette.com/news/local-news/mo...,https://globalnews.ca/news/8919210/quebec-mont...,2022-06-14,2022-06-14,,,CAN


### Data Cleaning

In [3]:
df_import.columns

Index(['ID', 'Status', 'Location', 'City', 'Country', 'Age', 'Gender',
       'Date_onset', 'Date_confirmation', 'Symptoms', 'Hospitalised (Y/N/NA)',
       'Date_hospitalisation', 'Isolated (Y/N/NA)', 'Date_isolation',
       'Outcome', 'Contact_comment', 'Contact_ID', 'Contact_location',
       'Travel_history (Y/N/NA)', 'Travel_history_entry',
       'Travel_history_start', 'Travel_history_location',
       'Travel_history_country', 'Genomics_Metadata', 'Confirmation_method',
       'Source', 'Source_II', 'Date_entry', 'Date_last_modified', 'Source_III',
       'Source_IV', 'Country_ISO3'],
      dtype='object')

In [4]:
df = df_import[['ID','Date_confirmation', 'Location', 'City', 'Country','Status', 'Symptoms', 'Country_ISO3']]
df

Unnamed: 0,ID,Date_confirmation,Location,City,Country,Status,Symptoms,Country_ISO3
0,1,2022-05-06,Guy's and St Thomas Hospital London,London,England,confirmed,rash,GBR
1,2,2022-05-12,Guy's and St Thomas Hospital London,London,England,confirmed,rash,GBR
2,3,2022-05-13,London,London,England,confirmed,vesicular rash,GBR
3,4,2022-05-15,London,London,England,confirmed,vesicular rash,GBR
4,5,2022-05-15,London,London,England,confirmed,vesicular rash,GBR
...,...,...,...,...,...,...,...,...
2160,2161,2022-06-14,Quebec,,Canada,confirmed,,CAN
2161,2162,2022-06-14,Quebec,,Canada,confirmed,,CAN
2162,2163,2022-06-14,Quebec,,Canada,confirmed,,CAN
2163,2164,2022-06-14,Quebec,,Canada,confirmed,,CAN


In [5]:
# I want only value which is confirmed. In this dataset, only the confirmed case has date. So, I just drop na in the Date_confirmation column.
# only_con_df = df[df["Date_confirmation"].notna()] # old doesnt work anymore
only_con_df = df[df["Status"] == "confirmed"]

In [6]:
# remove SettingWithCopyWarning: from showing
pd.options.mode.chained_assignment = None

In [7]:
only_con_df['case'] = only_con_df.loc[:, 'Status']
only_con_df['case'] = only_con_df['case'].replace('confirmed', '1').astype('int')
only_con_df

Unnamed: 0,ID,Date_confirmation,Location,City,Country,Status,Symptoms,Country_ISO3,case
0,1,2022-05-06,Guy's and St Thomas Hospital London,London,England,confirmed,rash,GBR,1
1,2,2022-05-12,Guy's and St Thomas Hospital London,London,England,confirmed,rash,GBR,1
2,3,2022-05-13,London,London,England,confirmed,vesicular rash,GBR,1
3,4,2022-05-15,London,London,England,confirmed,vesicular rash,GBR,1
4,5,2022-05-15,London,London,England,confirmed,vesicular rash,GBR,1
...,...,...,...,...,...,...,...,...,...
2160,2161,2022-06-14,Quebec,,Canada,confirmed,,CAN,1
2161,2162,2022-06-14,Quebec,,Canada,confirmed,,CAN,1
2162,2163,2022-06-14,Quebec,,Canada,confirmed,,CAN,1
2163,2164,2022-06-14,Quebec,,Canada,confirmed,,CAN,1


In [8]:
only_con_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1806 entries, 0 to 2164
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ID                 1806 non-null   int64 
 1   Date_confirmation  1806 non-null   object
 2   Location           823 non-null    object
 3   City               404 non-null    object
 4   Country            1806 non-null   object
 5   Status             1806 non-null   object
 6   Symptoms           101 non-null    object
 7   Country_ISO3       1806 non-null   object
 8   case               1806 non-null   int32 
dtypes: int32(1), int64(1), object(7)
memory usage: 134.0+ KB


In [9]:
only_con_df['Date_confirmation'] = pd.to_datetime(only_con_df['Date_confirmation'])
only_con_df

Unnamed: 0,ID,Date_confirmation,Location,City,Country,Status,Symptoms,Country_ISO3,case
0,1,2022-05-06,Guy's and St Thomas Hospital London,London,England,confirmed,rash,GBR,1
1,2,2022-05-12,Guy's and St Thomas Hospital London,London,England,confirmed,rash,GBR,1
2,3,2022-05-13,London,London,England,confirmed,vesicular rash,GBR,1
3,4,2022-05-15,London,London,England,confirmed,vesicular rash,GBR,1
4,5,2022-05-15,London,London,England,confirmed,vesicular rash,GBR,1
...,...,...,...,...,...,...,...,...,...
2160,2161,2022-06-14,Quebec,,Canada,confirmed,,CAN,1
2161,2162,2022-06-14,Quebec,,Canada,confirmed,,CAN,1
2162,2163,2022-06-14,Quebec,,Canada,confirmed,,CAN,1
2163,2164,2022-06-14,Quebec,,Canada,confirmed,,CAN,1


In [10]:
# The current data set doesnt have total confirm case for each country.

Total_case = only_con_df.groupby('Country', as_index=False).agg(Date_confirmation=('Date_confirmation', 'last'),case=('case',
                                                           lambda x: sum(x == x.iloc[-1])))

Total_case

Unnamed: 0,Country,Date_confirmation,case
0,Argentina,2022-06-09,3
1,Australia,2022-06-09,8
2,Austria,2022-06-13,3
3,Belgium,2022-06-08,24
4,Brazil,2022-06-11,3
5,Canada,2022-06-14,158
6,Czech Republic,2022-06-02,6
7,Denmark,2022-06-10,4
8,England,2022-06-12,452
9,Finland,2022-06-09,3


In [11]:
# sum_case = []
# counter = 0


# for i in only_con_df["Country"]:
# 	if 'England' in i:
# 		counter += 1
# 	sum_case.append(counter)

In [12]:
# Count the symptoms for each country and find the similarity ratio

sym_df = df[df["Date_confirmation"].notna()]
sym_df['Symptoms'] = sym_df['Symptoms'].str.lower()



sym_df = (sym_df.set_index('Country')['Symptoms']
       .str.split(',', expand=True)
       .apply(lambda x: x.str.strip())
       .stack()
       .rename('Country')
       .reset_index(name='Symptoms'))


sym_df = (sym_df.set_index('Country')['Symptoms']
       .str.split(';', expand=True)
       .apply(lambda x: x.str.strip())
       .stack()
       .rename('Country')
       .reset_index(name='Symptoms'))


sym_df = (sym_df.set_index('Country')['Symptoms']
       .str.split('and', expand=True)
       .apply(lambda x: x.str.strip())
       .stack()
       .rename('Country')
       .reset_index(name='Symptoms'))
sym_df


Unnamed: 0,Country,level_1,Symptoms
0,England,0,rash
1,England,0,rash
2,England,0,vesicular rash
3,England,0,vesicular rash
4,England,0,vesicular rash
...,...,...,...
161,Argentina,0,muscle pain
162,Argentina,0,back pain
163,Argentina,0,vasicular rashes
164,Brazil,0,skin lesions


In [13]:
count_sym_df = sym_df.groupby(['Symptoms','Country']).size().reset_index(name='Symptoms_count')
count_sym_df = count_sym_df.sort_values(by=['Symptoms','Symptoms_count'])
count_sym_df

Unnamed: 0,Symptoms,Country,Symptoms_count
0,an elevated temperature,Germany,1
1,back pain,Argentina,1
2,blisters,Finland,1
3,blisters,Italy,1
4,chills,Czech Republic,1
5,chills,United States,1
6,cough,Germany,1
7,cough,United States,1
8,enlarged lymph nodes,Brazil,1
9,fatigue,Czech Republic,1


In [14]:
to_merge = df[['Country', 'Country_ISO3']]
to_merge

Unnamed: 0,Country,Country_ISO3
0,England,GBR
1,England,GBR
2,England,GBR
3,England,GBR
4,England,GBR
...,...,...
2160,Canada,CAN
2161,Canada,CAN
2162,Canada,CAN
2163,Canada,CAN


In [15]:
to_merge = to_merge.drop_duplicates(['Country','Country_ISO3'])[['Country','Country_ISO3']]
to_merge.head()


Unnamed: 0,Country,Country_ISO3
0,England,GBR
7,Portugal,PRT
27,Spain,ESP
35,United States,USA
38,Canada,CAN


In [16]:
count_sym_df = pd.merge(left=count_sym_df, right=to_merge, how="left", on="Country")
count_sym_df

Unnamed: 0,Symptoms,Country,Symptoms_count,Country_ISO3
0,an elevated temperature,Germany,1,DEU
1,back pain,Argentina,1,ARG
2,blisters,Finland,1,FIN
3,blisters,Italy,1,ITA
4,chills,Czech Republic,1,CZE
5,chills,United States,1,USA
6,cough,Germany,1,DEU
7,cough,United States,1,USA
8,enlarged lymph nodes,Brazil,1,BRA
9,fatigue,Czech Republic,1,CZE


In [17]:
count_sym_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 0 to 50
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Symptoms        51 non-null     object
 1   Country         51 non-null     object
 2   Symptoms_count  51 non-null     int64 
 3   Country_ISO3    51 non-null     object
dtypes: int64(1), object(3)
memory usage: 2.0+ KB


In [18]:
sort_df = only_con_df.sort_values(by=['Country','Date_confirmation'])
sort_df

Unnamed: 0,ID,Date_confirmation,Location,City,Country,Status,Symptoms,Country_ISO3,case
193,194,2022-05-27,,Buenos Aires,Argentina,confirmed,"pustules, fever",ARG,1
446,447,2022-05-27,Buenos Aires,,Argentina,confirmed,ulcerative lesions,ARG,1
1814,1815,2022-06-09,Buenos Aires,,Argentina,confirmed,"headache, muscle pain, back pain, vasicular ra...",ARG,1
106,107,2022-05-20,New South Wales,Sydney,Australia,confirmed,,AUS,1
107,108,2022-05-20,The Alfred Hospital,Melbourne,Australia,confirmed,genital rash,AUS,1
...,...,...,...,...,...,...,...,...,...
1868,1869,2022-06-12,,,Venezuela,confirmed,,VEN,1
350,351,2022-05-26,,,Wales,confirmed,,GBR,1
1041,1042,2022-06-03,,,Wales,confirmed,,GBR,1
1230,1231,2022-06-06,,,Wales,confirmed,,GBR,1


In [19]:
sort_df['sum_case'] = sort_df.groupby((sort_df['Country'] != sort_df['Country'].shift(1)).cumsum()).cumcount()+1
sort_df

Unnamed: 0,ID,Date_confirmation,Location,City,Country,Status,Symptoms,Country_ISO3,case,sum_case
193,194,2022-05-27,,Buenos Aires,Argentina,confirmed,"pustules, fever",ARG,1,1
446,447,2022-05-27,Buenos Aires,,Argentina,confirmed,ulcerative lesions,ARG,1,2
1814,1815,2022-06-09,Buenos Aires,,Argentina,confirmed,"headache, muscle pain, back pain, vasicular ra...",ARG,1,3
106,107,2022-05-20,New South Wales,Sydney,Australia,confirmed,,AUS,1,1
107,108,2022-05-20,The Alfred Hospital,Melbourne,Australia,confirmed,genital rash,AUS,1,2
...,...,...,...,...,...,...,...,...,...,...
1868,1869,2022-06-12,,,Venezuela,confirmed,,VEN,1,1
350,351,2022-05-26,,,Wales,confirmed,,GBR,1,1
1041,1042,2022-06-03,,,Wales,confirmed,,GBR,1,2
1230,1231,2022-06-06,,,Wales,confirmed,,GBR,1,3


### Visualization

In [20]:
from datetime import date

date = date.today()

In [21]:
fig1 = px.bar(Total_case.sort_values('case', ascending=True), x='Country', y='case', text='case')

fig1.update_layout(
    width=1920,
    height=800,
    title_text=f'Monkey Pox: {date}'
)

fig1.show()

In [22]:
fig2 = px.scatter(sort_df, x='Date_confirmation',y='sum_case', color='Country')

fig2.update_layout(
    width=1920,
    height=800,
    title_text=f'Monkey Pox: {date}'
)

fig2.show()

In [23]:
fig3 = px.line(sort_df, x='Date_confirmation',y='sum_case', color='Country')

fig3.update_layout(
    width=1920,
    height=800,
    title_text=f'Monkey Pox: {date}'
)

fig3.show()

In [24]:
fig4 = px.line(sort_df[sort_df['Country'] == 'England'], x='Date_confirmation',y='sum_case', color='Country')

fig4.update_layout(
    width=1920,
    height=800,
    title_text=f'Monkey Pox UK: {date}'
)
fig4.show()

In [25]:
fig5 = px.scatter(count_sym_df.sort_values('Symptoms_count', ascending=False), x='Country', y='Symptoms', text='Symptoms')

fig5.update_traces(textposition='bottom center')

fig5.update_layout(
    width=1920,
    height=550,
    title_text=f'Monkey Pox Symptoms Per Country: {date}'
)

fig5.show()

In [26]:
fig6 = px.bar(count_sym_df, x="Symptoms", y="Symptoms_count", text='Country_ISO3',
                color='Country',
                labels={'Symptoms_count':'Symptoms Occurrences'})

fig6.update_layout(
    width=1920,
    height=800,
    title_text=f'Monkey Pox Count Symptoms Per Country: {date}'
)

fig6.show()

### Export

In [27]:
fig1.write_image(f"Graph/fig1.png")
fig2.write_image(f"Graph/fig2.png")
fig3.write_image(f"Graph/fig3.png")
fig4.write_image(f"Graph/fig4.png")
fig5.write_image(f"Graph/fig5.png")
fig6.write_image(f"Graph/fig6.png")

In [28]:
for i in range(1 , 7):
   print(i)

1
2
3
4
5
6


In [29]:
filename = str(date.year)+str(date.month)+str(date.day)
sort_df.to_csv(str('Export_dataset/Monkeypox_confirmed_' + filename + '.csv'))

In [30]:
count_sym_df.to_csv(str('Export_dataset/Monkeypox_total_symptoms_'+ filename + '.csv'))

In [31]:
Total_case.to_csv(str('Export_dataset/Monkeypox_total_last_case_'+ filename + '.csv'))

In [32]:
# for i in range(1 , 7):
#    f'fig{i}.write_image',(f"Graph/fig{i}.png")

In [33]:
# for i in range(1 , 7):
#     with open(f"Graph/fig{i}.png", "w") as file:
#             fig1.write_image(f"fig{i}")

In [34]:
# with open(f"Graph/fig6.png", "w") as file:
#         fig6.write_image(f"fig6")