## Monkey Pox

In [228]:
# ## Import libraries

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
%matplotlib inline
import seaborn as sns
import missingno as msno

# # import os

### Import Data from git

In [229]:
# df_import = pd.read_csv('https://raw.githubusercontent.com/globaldothealth/monkeypox/main/latest.csv')
df_import = pd.read_csv('monkeypox_data.csv')
df_import

Unnamed: 0,ID,Status,Location,City,Country,Age,Gender,Date_onset,Date_confirmation,Symptoms,...,Travel_history_location,Travel_history_country,Genomics_Metadata,Confirmation_method,Source,Source_II,Date_entry,Date_last_modified,Source_III,Country_ISO3
0,1,confirmed,Guy's and St Thomas Hospital London,London,England,,,2022-04-29,2022-05-06,rash,...,Lagos and Delta States,Nigeria,West African Clade,RT-PCR,https://www.gov.uk/government/news/monkeypox-c...,https://www.who.int/emergencies/disease-outbre...,2022-05-18,2022-05-18,,GBR
1,2,confirmed,Guy's and St Thomas Hospital London,London,England,,,2022-05-05,2022-05-12,rash,...,,,West African Clade,RT-PCR,https://www.gov.uk/government/news/monkeypox-c...,,2022-05-18,2022-05-18,,GBR
2,3,confirmed,London,London,England,,,2022-04-30,2022-05-13,vesicular rash,...,,,West African Clade,RT-PCR,https://www.gov.uk/government/news/monkeypox-c...,,2022-05-18,2022-05-18,,GBR
3,4,confirmed,London,London,England,,male,,2022-05-15,vesicular rash,...,,,West African Clade,,https://www.gov.uk/government/news/monkeypox-c...,,2022-05-18,2022-05-18,,GBR
4,5,confirmed,London,London,England,,male,,2022-05-15,vesicular rash,...,,,West African Clade,,https://www.gov.uk/government/news/monkeypox-c...,,2022-05-18,2022-05-18,,GBR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,1001,confirmed,,,Slovenia,,,,2022-06-02,,...,,,,,https://www.nijz.si/sl/okuzbe-z-virusom-opicji...,https://english.sta.si/3044830/third-case-of-m...,2022-06-02,2022-06-02,,SVN
1001,1002,suspected,,,Uruguay,,,,,,...,,,,,https://www.gub.uy/ministerio-salud-publica/co...,,2022-06-02,2022-06-02,,URY
1002,1003,suspected,,,Uruguay,,,,,,...,,,,,https://www.gub.uy/ministerio-salud-publica/co...,,2022-06-02,2022-06-02,,URY
1003,1004,suspected,,,Uruguay,,,,,,...,,,,,https://www.gub.uy/ministerio-salud-publica/co...,,2022-06-02,2022-06-02,,URY


### Data Cleaning

In [230]:
df_import.columns

Index(['ID', 'Status', 'Location', 'City', 'Country', 'Age', 'Gender',
       'Date_onset', 'Date_confirmation', 'Symptoms', 'Hospitalised (Y/N/NA)',
       'Date_hospitalisation', 'Isolated (Y/N/NA)', 'Date_isolation',
       'Outcome', 'Contact_comment', 'Contact_ID', 'Contact_location',
       'Travel_history (Y/N/NA)', 'Travel_history_entry',
       'Travel_history_start', 'Travel_history_location',
       'Travel_history_country', 'Genomics_Metadata', 'Confirmation_method',
       'Source', 'Source_II', 'Date_entry', 'Date_last_modified', 'Source_III',
       'Country_ISO3'],
      dtype='object')

In [231]:
df = df_import[['ID','Date_confirmation', 'Location', 'City', 'Country','Status', 'Symptoms']]
df

Unnamed: 0,ID,Date_confirmation,Location,City,Country,Status,Symptoms
0,1,2022-05-06,Guy's and St Thomas Hospital London,London,England,confirmed,rash
1,2,2022-05-12,Guy's and St Thomas Hospital London,London,England,confirmed,rash
2,3,2022-05-13,London,London,England,confirmed,vesicular rash
3,4,2022-05-15,London,London,England,confirmed,vesicular rash
4,5,2022-05-15,London,London,England,confirmed,vesicular rash
...,...,...,...,...,...,...,...
1000,1001,2022-06-02,,,Slovenia,confirmed,
1001,1002,,,,Uruguay,suspected,
1002,1003,,,,Uruguay,suspected,
1003,1004,,,,Uruguay,suspected,


In [232]:
# I want only value which is confirmed. In this dataset, only the confirmed case has date. So, I just drop na in the Date_confirmation column.
only_con_df = df[df["Date_confirmation"].notna()]

In [233]:
# remove SettingWithCopyWarning: from showing
pd.options.mode.chained_assignment = None

In [234]:
only_con_df['case'] = only_con_df.loc[:, 'Status']
only_con_df['case'] = only_con_df['case'].replace('confirmed', '1').astype('int')
only_con_df

Unnamed: 0,ID,Date_confirmation,Location,City,Country,Status,Symptoms,case
0,1,2022-05-06,Guy's and St Thomas Hospital London,London,England,confirmed,rash,1
1,2,2022-05-12,Guy's and St Thomas Hospital London,London,England,confirmed,rash,1
2,3,2022-05-13,London,London,England,confirmed,vesicular rash,1
3,4,2022-05-15,London,London,England,confirmed,vesicular rash,1
4,5,2022-05-15,London,London,England,confirmed,vesicular rash,1
...,...,...,...,...,...,...,...,...
993,994,2022-06-02,Los Angeles County,,United States,confirmed,,1
994,995,2022-06-02,Philadelphia; Pennsylvania,Philadelphia,United States,confirmed,,1
995,996,2022-06-02,Chicago; Illinois,Chicago,United States,confirmed,,1
998,999,2022-06-02,Hospital District of Helsinki and Uusimaa,Helsinki,Finland,confirmed,,1


In [235]:
only_con_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 782 entries, 0 to 1000
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ID                 782 non-null    int64 
 1   Date_confirmation  782 non-null    object
 2   Location           306 non-null    object
 3   City               244 non-null    object
 4   Country            782 non-null    object
 5   Status             782 non-null    object
 6   Symptoms           90 non-null     object
 7   case               782 non-null    int32 
dtypes: int32(1), int64(1), object(6)
memory usage: 51.9+ KB


In [236]:
only_con_df['Date_confirmation'] = pd.to_datetime(only_con_df['Date_confirmation'])
only_con_df

Unnamed: 0,ID,Date_confirmation,Location,City,Country,Status,Symptoms,case
0,1,2022-05-06,Guy's and St Thomas Hospital London,London,England,confirmed,rash,1
1,2,2022-05-12,Guy's and St Thomas Hospital London,London,England,confirmed,rash,1
2,3,2022-05-13,London,London,England,confirmed,vesicular rash,1
3,4,2022-05-15,London,London,England,confirmed,vesicular rash,1
4,5,2022-05-15,London,London,England,confirmed,vesicular rash,1
...,...,...,...,...,...,...,...,...
993,994,2022-06-02,Los Angeles County,,United States,confirmed,,1
994,995,2022-06-02,Philadelphia; Pennsylvania,Philadelphia,United States,confirmed,,1
995,996,2022-06-02,Chicago; Illinois,Chicago,United States,confirmed,,1
998,999,2022-06-02,Hospital District of Helsinki and Uusimaa,Helsinki,Finland,confirmed,,1


In [237]:
Total_case = only_con_df.groupby('Country', as_index=False).agg(Date_confirmation=('Date_confirmation', 'last'),case=('case',
                                                           lambda x: sum(x == x.iloc[-1])))

Total_case

Unnamed: 0,Country,Date_confirmation,case
0,Argentina,2022-05-27,2
1,Australia,2022-06-02,3
2,Austria,2022-05-23,1
3,Belgium,2022-06-01,14
4,Canada,2022-06-02,57
5,Czech Republic,2022-06-02,6
6,Denmark,2022-05-24,2
7,England,2022-06-02,199
8,Finland,2022-06-02,2
9,France,2022-06-01,33


In [238]:
# sum_case = []
# counter = 0


# for i in only_con_df["Country"]:
# 	if 'England' in i:
# 		counter += 1
# 	sum_case.append(counter)

In [239]:
sym_df = df[df["Date_confirmation"].notna()]

sym_df = (sym_df.set_index('Country')['Symptoms']
       .str.split(';', expand=True)
       .stack()
       .rename('Country')
       .reset_index(name='Symptoms'))
print(sym_df)

count_sym_df = sym_df.groupby(['Country','Symptoms']).size().reset_index(name='Symptoms_count')
count_sym_df

            Country  level_1        Symptoms
0           England        0            rash
1           England        0            rash
2           England        0  vesicular rash
3           England        0  vesicular rash
4           England        0  vesicular rash
..              ...      ...             ...
119  Czech Republic        0           Fever
120  Czech Republic        1          chills
121  Czech Republic        2         fatigue
122  Czech Republic        3        headache
123  Czech Republic        4    skin lesions

[124 rows x 3 columns]


Unnamed: 0,Country,Symptoms,Symptoms_count
0,Argentina,fever,1
1,Argentina,pustules,1
2,Argentina,ulcerative lesions,1
3,Australia,genital rash,1
4,Austria,lesions,1
5,Austria,fever,1
6,Belgium,inguinal adenopathy,1
7,Belgium,perianal papules,1
8,Canada,fever,17
9,Canada,genital ulcer lesions,27


In [240]:
count_sym_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Country         35 non-null     object
 1   Symptoms        35 non-null     object
 2   Symptoms_count  35 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 968.0+ bytes


In [241]:
sort_df = only_con_df.sort_values(by=['Country','Date_confirmation'])
sort_df

Unnamed: 0,ID,Date_confirmation,Location,City,Country,Status,Symptoms,case
193,194,2022-05-27,,Buenos Aires,Argentina,confirmed,pustules; fever,1
446,447,2022-05-27,Buenos Aires,,Argentina,confirmed,ulcerative lesions,1
106,107,2022-05-20,Sydney,Sydney,Australia,confirmed,,1
107,108,2022-05-20,The Alfred Hospital,Melbourne,Australia,confirmed,genital rash,1
950,951,2022-06-02,New South Wales,Sydney,Australia,confirmed,,1
...,...,...,...,...,...,...,...,...
778,779,2022-05-31,New York,,United States,confirmed,,1
993,994,2022-06-02,Los Angeles County,,United States,confirmed,,1
994,995,2022-06-02,Philadelphia; Pennsylvania,Philadelphia,United States,confirmed,,1
995,996,2022-06-02,Chicago; Illinois,Chicago,United States,confirmed,,1


In [242]:
sort_df['sum_case'] = sort_df.groupby((sort_df['Country'] != sort_df['Country'].shift(1)).cumsum()).cumcount()+1
sort_df

Unnamed: 0,ID,Date_confirmation,Location,City,Country,Status,Symptoms,case,sum_case
193,194,2022-05-27,,Buenos Aires,Argentina,confirmed,pustules; fever,1,1
446,447,2022-05-27,Buenos Aires,,Argentina,confirmed,ulcerative lesions,1,2
106,107,2022-05-20,Sydney,Sydney,Australia,confirmed,,1,1
107,108,2022-05-20,The Alfred Hospital,Melbourne,Australia,confirmed,genital rash,1,2
950,951,2022-06-02,New South Wales,Sydney,Australia,confirmed,,1,3
...,...,...,...,...,...,...,...,...,...
778,779,2022-05-31,New York,,United States,confirmed,,1,19
993,994,2022-06-02,Los Angeles County,,United States,confirmed,,1,20
994,995,2022-06-02,Philadelphia; Pennsylvania,Philadelphia,United States,confirmed,,1,21
995,996,2022-06-02,Chicago; Illinois,Chicago,United States,confirmed,,1,22


### Visualization

In [243]:
fig = px.bar(Total_case.sort_values('case', ascending=False), x='Country', y='case', text='case')
fig.show()

In [244]:
fig2 = px.scatter(sort_df, x='Date_confirmation',y='sum_case', color='Country')
fig2.show()

In [245]:
fig2_2 = px.line(sort_df, x='Date_confirmation',y='sum_case', color='Country')
fig2_2.show()

In [246]:
fig3 = px.line(sort_df[sort_df['Country'] == 'England'], x='Date_confirmation',y='sum_case', color='Country')
fig3.show()

In [247]:
fig4 = px.scatter(count_sym_df.sort_values('Symptoms_count', ascending=False), x='Country', y='Symptoms')
fig4.show()

In [248]:
fig5 = px.bar(count_sym_df, x="Symptoms", y="Symptoms_count",
                color='Country',
                labels={'Symptoms_count':'Symptoms Occurrences'}, height=800)
fig5.show()