In [1]:
#download the required built-in modules
import pandas as pd
import numpy as np
import csv
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine

## Data Extraction and Transformation

### File-1 (df_1.csv) - Patricia

In [2]:
# Extract the file df_1.csv
disease_outbreaks_df = pd.read_csv(r'./Resources/df_1.csv')
disease_outbreaks_df

Unnamed: 0.1,Unnamed: 0,Rank,Epidemics/pandemics,Disease,Death toll,Global population lost,Regional population lost,Date,Location
0,0,1,Black Death,Bubonic plague,75–200 million,17–54%[a],30–60% of European population[4],1346–1353,"Europe, Asia, and North Africa"
1,1,2,Spanish flu,Influenza A/H1N1,17–100 million,1–5.4%[5][6],–,1918–1920,Worldwide
2,2,3,Plague of Justinian,Bubonic plague,15–100 million,7–56%[a],25–60% of European population[7],541–549,"North Africa, Europe and West Asia"
3,3,4,HIV/AIDS global pandemic,HIV/AIDS,40.1 million (as of 2021),[b],–,1981–present,Worldwide
4,4,5,COVID-19 pandemic,COVID-19,7–28 million (as of November 2022),0.1–0.4%[2],–,2019[c]–present,Worldwide
5,5,6,Third plague pandemic,Bubonic plague,12–15 million,[b],–,1855–1960,Worldwide
6,6,7,Cocoliztli epidemic of 1545–1548,Cocoliztli,5–15 million,1–3%[a],27–80% of Mexican population[9],1545–1548,Mexico
7,7,8,Antonine Plague,Smallpox or measles,5–10 million,3–6%[3],25–33% of Roman population[10],165–180 (possibly up to 190),Roman Empire
8,8,9,1520 Mexico smallpox epidemic,Smallpox,5–8 million,1–2%[a],23–37% of Mexican population[9],1519–1520,Mexico
9,9,10,1918–1922 Russia typhus epidemic,Typhus,2–3 million,0.1–0.16%[6][d],1–1.6% of Russian population[11],1918–1922,Russia


In [3]:
disease_outbreaks_df.columns

Index(['Unnamed: 0', 'Rank', 'Epidemics/pandemics', 'Disease', 'Death toll',
       'Global population lost', 'Regional population lost', 'Date',
       'Location'],
      dtype='object')

In [4]:
disease_outbreaks_df = disease_outbreaks_df[['Rank', 'Epidemics/pandemics', 'Disease', 'Death toll',
       'Global population lost', 'Regional population lost', 'Date',
       'Location']]
disease_outbreaks_df

Unnamed: 0,Rank,Epidemics/pandemics,Disease,Death toll,Global population lost,Regional population lost,Date,Location
0,1,Black Death,Bubonic plague,75–200 million,17–54%[a],30–60% of European population[4],1346–1353,"Europe, Asia, and North Africa"
1,2,Spanish flu,Influenza A/H1N1,17–100 million,1–5.4%[5][6],–,1918–1920,Worldwide
2,3,Plague of Justinian,Bubonic plague,15–100 million,7–56%[a],25–60% of European population[7],541–549,"North Africa, Europe and West Asia"
3,4,HIV/AIDS global pandemic,HIV/AIDS,40.1 million (as of 2021),[b],–,1981–present,Worldwide
4,5,COVID-19 pandemic,COVID-19,7–28 million (as of November 2022),0.1–0.4%[2],–,2019[c]–present,Worldwide
5,6,Third plague pandemic,Bubonic plague,12–15 million,[b],–,1855–1960,Worldwide
6,7,Cocoliztli epidemic of 1545–1548,Cocoliztli,5–15 million,1–3%[a],27–80% of Mexican population[9],1545–1548,Mexico
7,8,Antonine Plague,Smallpox or measles,5–10 million,3–6%[3],25–33% of Roman population[10],165–180 (possibly up to 190),Roman Empire
8,9,1520 Mexico smallpox epidemic,Smallpox,5–8 million,1–2%[a],23–37% of Mexican population[9],1519–1520,Mexico
9,10,1918–1922 Russia typhus epidemic,Typhus,2–3 million,0.1–0.16%[6][d],1–1.6% of Russian population[11],1918–1922,Russia


In [5]:
def replace_millions(string):
    million_index = string.index("million")
    new_string = string[:million_index]
    new_string = new_string.strip()
    return new_string

def get_min_max(alist):
    if len(alist) == 1:
        return alist * 2
    return alist

In [6]:
disease_outbreaks_df_copy = disease_outbreaks_df.copy()
# remove from the millions downwards
disease_outbreaks_df_copy['Death toll'] = disease_outbreaks_df_copy['Death toll'].apply(replace_millions)
# split the column by the dash
disease_outbreaks_df_copy['Death toll'] = disease_outbreaks_df_copy['Death toll'].str.split("–")
# get the max and the min
disease_outbreaks_df_copy['Death toll'] = disease_outbreaks_df_copy['Death toll'].apply(get_min_max)
# transform them to a dataframe
death_droll = disease_outbreaks_df_copy['Death toll'].apply(pd.Series)
death_droll.columns = ["minimum_dealth_toll", "maximum_death_toll"]
death_droll

Unnamed: 0,minimum_dealth_toll,maximum_death_toll
0,75.0,200.0
1,17.0,100.0
2,15.0,100.0
3,40.1,40.1
4,7.0,28.0
5,12.0,15.0
6,5.0,15.0
7,5.0,10.0
8,5.0,8.0
9,2.0,3.0


In [7]:
cleaned_df = pd.concat([disease_outbreaks_df_copy, death_droll], axis=1)
cleaned_df=cleaned_df.drop("Death toll", axis=1)
cleaned_df

Unnamed: 0,Rank,Epidemics/pandemics,Disease,Global population lost,Regional population lost,Date,Location,minimum_dealth_toll,maximum_death_toll
0,1,Black Death,Bubonic plague,17–54%[a],30–60% of European population[4],1346–1353,"Europe, Asia, and North Africa",75.0,200.0
1,2,Spanish flu,Influenza A/H1N1,1–5.4%[5][6],–,1918–1920,Worldwide,17.0,100.0
2,3,Plague of Justinian,Bubonic plague,7–56%[a],25–60% of European population[7],541–549,"North Africa, Europe and West Asia",15.0,100.0
3,4,HIV/AIDS global pandemic,HIV/AIDS,[b],–,1981–present,Worldwide,40.1,40.1
4,5,COVID-19 pandemic,COVID-19,0.1–0.4%[2],–,2019[c]–present,Worldwide,7.0,28.0
5,6,Third plague pandemic,Bubonic plague,[b],–,1855–1960,Worldwide,12.0,15.0
6,7,Cocoliztli epidemic of 1545–1548,Cocoliztli,1–3%[a],27–80% of Mexican population[9],1545–1548,Mexico,5.0,15.0
7,8,Antonine Plague,Smallpox or measles,3–6%[3],25–33% of Roman population[10],165–180 (possibly up to 190),Roman Empire,5.0,10.0
8,9,1520 Mexico smallpox epidemic,Smallpox,1–2%[a],23–37% of Mexican population[9],1519–1520,Mexico,5.0,8.0
9,10,1918–1922 Russia typhus epidemic,Typhus,0.1–0.16%[6][d],1–1.6% of Russian population[11],1918–1922,Russia,2.0,3.0


In [8]:
def average(lst):
    return sum(lst)/len(lst)

In [9]:
lst = [3,4]
avg = average(lst)
avg

3.5

In [10]:
cleaned_df['minimum_dealth_toll'] = pd.to_numeric(cleaned_df['minimum_dealth_toll'])
cleaned_df['maximum_death_toll'] = pd.to_numeric(cleaned_df['maximum_death_toll'])

In [11]:
cleaned_df['average_death_toll'] = cleaned_df[['minimum_dealth_toll', 'maximum_death_toll']].mean(axis=1)
cleaned_df

Unnamed: 0,Rank,Epidemics/pandemics,Disease,Global population lost,Regional population lost,Date,Location,minimum_dealth_toll,maximum_death_toll,average_death_toll
0,1,Black Death,Bubonic plague,17–54%[a],30–60% of European population[4],1346–1353,"Europe, Asia, and North Africa",75.0,200.0,137.5
1,2,Spanish flu,Influenza A/H1N1,1–5.4%[5][6],–,1918–1920,Worldwide,17.0,100.0,58.5
2,3,Plague of Justinian,Bubonic plague,7–56%[a],25–60% of European population[7],541–549,"North Africa, Europe and West Asia",15.0,100.0,57.5
3,4,HIV/AIDS global pandemic,HIV/AIDS,[b],–,1981–present,Worldwide,40.1,40.1,40.1
4,5,COVID-19 pandemic,COVID-19,0.1–0.4%[2],–,2019[c]–present,Worldwide,7.0,28.0,17.5
5,6,Third plague pandemic,Bubonic plague,[b],–,1855–1960,Worldwide,12.0,15.0,13.5
6,7,Cocoliztli epidemic of 1545–1548,Cocoliztli,1–3%[a],27–80% of Mexican population[9],1545–1548,Mexico,5.0,15.0,10.0
7,8,Antonine Plague,Smallpox or measles,3–6%[3],25–33% of Roman population[10],165–180 (possibly up to 190),Roman Empire,5.0,10.0,7.5
8,9,1520 Mexico smallpox epidemic,Smallpox,1–2%[a],23–37% of Mexican population[9],1519–1520,Mexico,5.0,8.0,6.5
9,10,1918–1922 Russia typhus epidemic,Typhus,0.1–0.16%[6][d],1–1.6% of Russian population[11],1918–1922,Russia,2.0,3.0,2.5


In [12]:
def replace_perc(string):
    try:
        perct_index = string.index("%")
        new_string = string[:perct_index]
        new_string = new_string.strip()
        return new_string
    except:
        return "unknown"

def get_min_max(alist):
    if len(alist) == 1:
        return alist * 2
    return alist

In [13]:
replace_perc('17–54%[a]')

'17–54'

In [14]:
result = cleaned_df.dtypes

print("Output:")
print(result)

Output:
Rank                          int64
Epidemics/pandemics          object
Disease                      object
Global population lost       object
Regional population lost     object
Date                         object
Location                     object
minimum_dealth_toll         float64
maximum_death_toll          float64
average_death_toll          float64
dtype: object


In [15]:
cleaned_df['Global population lost']=cleaned_df['Global population lost'].map(str)

In [16]:
result = cleaned_df.dtypes

print("Output:")
print(result)

Output:
Rank                          int64
Epidemics/pandemics          object
Disease                      object
Global population lost       object
Regional population lost     object
Date                         object
Location                     object
minimum_dealth_toll         float64
maximum_death_toll          float64
average_death_toll          float64
dtype: object


In [17]:
cleaned_df_copy = cleaned_df.copy()
cleaned_df_copy['Global population lost'] = cleaned_df_copy['Global population lost'].apply(replace_perc)
# split the column by the dash
cleaned_df_copy['Global population lost'] = cleaned_df_copy['Global population lost'].str.split("–")
# get the max and the min
cleaned_df_copy['Global population lost'] = cleaned_df_copy['Global population lost'].apply(get_min_max)
# transform them to a dataframe
pop_lost = cleaned_df_copy['Global population lost'].apply(pd.Series)
pop_lost.columns = ["minimum_population_lost", "maximum_population_lost"]
pop_lost

Unnamed: 0,minimum_population_lost,maximum_population_lost
0,17,54
1,1,5.4
2,7,56
3,unknown,unknown
4,0.1,0.4
5,unknown,unknown
6,1,3
7,3,6
8,1,2
9,0.1,0.16


In [18]:
cleaned_df2 = pd.concat([cleaned_df_copy, pop_lost], axis=1)
cleaned_df2=cleaned_df2.drop("Global population lost", axis=1)
cleaned_df2

Unnamed: 0,Rank,Epidemics/pandemics,Disease,Regional population lost,Date,Location,minimum_dealth_toll,maximum_death_toll,average_death_toll,minimum_population_lost,maximum_population_lost
0,1,Black Death,Bubonic plague,30–60% of European population[4],1346–1353,"Europe, Asia, and North Africa",75.0,200.0,137.5,17,54
1,2,Spanish flu,Influenza A/H1N1,–,1918–1920,Worldwide,17.0,100.0,58.5,1,5.4
2,3,Plague of Justinian,Bubonic plague,25–60% of European population[7],541–549,"North Africa, Europe and West Asia",15.0,100.0,57.5,7,56
3,4,HIV/AIDS global pandemic,HIV/AIDS,–,1981–present,Worldwide,40.1,40.1,40.1,unknown,unknown
4,5,COVID-19 pandemic,COVID-19,–,2019[c]–present,Worldwide,7.0,28.0,17.5,0.1,0.4
5,6,Third plague pandemic,Bubonic plague,–,1855–1960,Worldwide,12.0,15.0,13.5,unknown,unknown
6,7,Cocoliztli epidemic of 1545–1548,Cocoliztli,27–80% of Mexican population[9],1545–1548,Mexico,5.0,15.0,10.0,1,3
7,8,Antonine Plague,Smallpox or measles,25–33% of Roman population[10],165–180 (possibly up to 190),Roman Empire,5.0,10.0,7.5,3,6
8,9,1520 Mexico smallpox epidemic,Smallpox,23–37% of Mexican population[9],1519–1520,Mexico,5.0,8.0,6.5,1,2
9,10,1918–1922 Russia typhus epidemic,Typhus,1–1.6% of Russian population[11],1918–1922,Russia,2.0,3.0,2.5,0.1,0.16


In [19]:
def replace_perc(string):
    try:
        perct_index = string.index("%")
        new_string = string[:perct_index]
        new_string = new_string.strip()
        return new_string
    except:
        return "unknown"

def get_min_max(alist):
    if len(alist) == 1:
        return alist * 2
    return alist

In [20]:
cleaned_df_copy2 = cleaned_df2.copy()
cleaned_df_copy2['Regional population lost'] = cleaned_df_copy2['Regional population lost'].apply(replace_perc)
# split the column by the dash
cleaned_df_copy2['Regional population lost'] = cleaned_df_copy2['Regional population lost'].str.split("–")
# get the max and the min
cleaned_df_copy2['Regional population lost'] = cleaned_df_copy2['Regional population lost'].apply(get_min_max)
# transform them to a dataframe
pop_lost2 = cleaned_df_copy2['Regional population lost'].apply(pd.Series)
pop_lost2.columns = ["minimum_reg_population_lost", "maximum_reg_population_lost"]
pop_lost2

Unnamed: 0,minimum_reg_population_lost,maximum_reg_population_lost
0,30,60
1,unknown,unknown
2,25,60
3,unknown,unknown
4,unknown,unknown
5,unknown,unknown
6,27,80
7,25,33
8,23,37
9,1,1.6


In [21]:
cleaned_df3 = pd.concat([cleaned_df_copy2, pop_lost2], axis=1)
cleaned_df3=cleaned_df3.drop("Regional population lost", axis=1)
cleaned_df3

Unnamed: 0,Rank,Epidemics/pandemics,Disease,Date,Location,minimum_dealth_toll,maximum_death_toll,average_death_toll,minimum_population_lost,maximum_population_lost,minimum_reg_population_lost,maximum_reg_population_lost
0,1,Black Death,Bubonic plague,1346–1353,"Europe, Asia, and North Africa",75.0,200.0,137.5,17,54,30,60
1,2,Spanish flu,Influenza A/H1N1,1918–1920,Worldwide,17.0,100.0,58.5,1,5.4,unknown,unknown
2,3,Plague of Justinian,Bubonic plague,541–549,"North Africa, Europe and West Asia",15.0,100.0,57.5,7,56,25,60
3,4,HIV/AIDS global pandemic,HIV/AIDS,1981–present,Worldwide,40.1,40.1,40.1,unknown,unknown,unknown,unknown
4,5,COVID-19 pandemic,COVID-19,2019[c]–present,Worldwide,7.0,28.0,17.5,0.1,0.4,unknown,unknown
5,6,Third plague pandemic,Bubonic plague,1855–1960,Worldwide,12.0,15.0,13.5,unknown,unknown,unknown,unknown
6,7,Cocoliztli epidemic of 1545–1548,Cocoliztli,1545–1548,Mexico,5.0,15.0,10.0,1,3,27,80
7,8,Antonine Plague,Smallpox or measles,165–180 (possibly up to 190),Roman Empire,5.0,10.0,7.5,3,6,25,33
8,9,1520 Mexico smallpox epidemic,Smallpox,1519–1520,Mexico,5.0,8.0,6.5,1,2,23,37
9,10,1918–1922 Russia typhus epidemic,Typhus,1918–1922,Russia,2.0,3.0,2.5,0.1,0.16,1,1.6


In [22]:
cleaned_df3['Date'] = cleaned_df3['Date'].str.replace('present','2022')
cleaned_df3['Date'] = cleaned_df3['Date'].str.replace('[c]','')

cleaned_df

  cleaned_df3['Date'] = cleaned_df3['Date'].str.replace('[c]','')


Unnamed: 0,Rank,Epidemics/pandemics,Disease,Global population lost,Regional population lost,Date,Location,minimum_dealth_toll,maximum_death_toll,average_death_toll
0,1,Black Death,Bubonic plague,17–54%[a],30–60% of European population[4],1346–1353,"Europe, Asia, and North Africa",75.0,200.0,137.5
1,2,Spanish flu,Influenza A/H1N1,1–5.4%[5][6],–,1918–1920,Worldwide,17.0,100.0,58.5
2,3,Plague of Justinian,Bubonic plague,7–56%[a],25–60% of European population[7],541–549,"North Africa, Europe and West Asia",15.0,100.0,57.5
3,4,HIV/AIDS global pandemic,HIV/AIDS,[b],–,1981–present,Worldwide,40.1,40.1,40.1
4,5,COVID-19 pandemic,COVID-19,0.1–0.4%[2],–,2019[c]–present,Worldwide,7.0,28.0,17.5
5,6,Third plague pandemic,Bubonic plague,[b],–,1855–1960,Worldwide,12.0,15.0,13.5
6,7,Cocoliztli epidemic of 1545–1548,Cocoliztli,1–3%[a],27–80% of Mexican population[9],1545–1548,Mexico,5.0,15.0,10.0
7,8,Antonine Plague,Smallpox or measles,3–6%[3],25–33% of Roman population[10],165–180 (possibly up to 190),Roman Empire,5.0,10.0,7.5
8,9,1520 Mexico smallpox epidemic,Smallpox,1–2%[a],23–37% of Mexican population[9],1519–1520,Mexico,5.0,8.0,6.5
9,10,1918–1922 Russia typhus epidemic,Typhus,0.1–0.16%[6][d],1–1.6% of Russian population[11],1918–1922,Russia,2.0,3.0,2.5


In [23]:
def replace_comments(string):
    try:
        perct_index = string.index("(")
        new_string = string[:perct_index]
        new_string = new_string.strip()
        return new_string
    except:
        return string
    


    

def get_min_max(alist):
    if len(alist) == 1:
        return alist * 2
    return alist

In [24]:
cleaned_df3

Unnamed: 0,Rank,Epidemics/pandemics,Disease,Date,Location,minimum_dealth_toll,maximum_death_toll,average_death_toll,minimum_population_lost,maximum_population_lost,minimum_reg_population_lost,maximum_reg_population_lost
0,1,Black Death,Bubonic plague,1346–1353,"Europe, Asia, and North Africa",75.0,200.0,137.5,17,54,30,60
1,2,Spanish flu,Influenza A/H1N1,1918–1920,Worldwide,17.0,100.0,58.5,1,5.4,unknown,unknown
2,3,Plague of Justinian,Bubonic plague,541–549,"North Africa, Europe and West Asia",15.0,100.0,57.5,7,56,25,60
3,4,HIV/AIDS global pandemic,HIV/AIDS,1981–2022,Worldwide,40.1,40.1,40.1,unknown,unknown,unknown,unknown
4,5,COVID-19 pandemic,COVID-19,2019[]–2022,Worldwide,7.0,28.0,17.5,0.1,0.4,unknown,unknown
5,6,Third plague pandemic,Bubonic plague,1855–1960,Worldwide,12.0,15.0,13.5,unknown,unknown,unknown,unknown
6,7,Cocoliztli epidemic of 1545–1548,Cocoliztli,1545–1548,Mexico,5.0,15.0,10.0,1,3,27,80
7,8,Antonine Plague,Smallpox or measles,165–180 (possibly up to 190),Roman Empire,5.0,10.0,7.5,3,6,25,33
8,9,1520 Mexico smallpox epidemic,Smallpox,1519–1520,Mexico,5.0,8.0,6.5,1,2,23,37
9,10,1918–1922 Russia typhus epidemic,Typhus,1918–1922,Russia,2.0,3.0,2.5,0.1,0.16,1,1.6


In [25]:
cleaned_df3['Date'] = cleaned_df3['Date'].str.replace('−','–')

In [26]:
cleaned_df_copy3 = cleaned_df3.copy()
cleaned_df_copy3['Date'] = cleaned_df_copy3['Date'].apply(replace_comments)
# split the column by the dash
cleaned_df_copy3['Date'] = cleaned_df_copy3['Date'].str.split("–")
# get the max and the min
cleaned_df_copy3['Date'] = cleaned_df_copy3['Date'].apply(get_min_max)
# transform them to a dataframe
years = cleaned_df_copy3['Date'].apply(pd.Series)
years.columns = ["start_date", "finish_date"]
years

Unnamed: 0,start_date,finish_date
0,1346,1353
1,1918,1920
2,541,549
3,1981,2022
4,2019[],2022
5,1855,1960
6,1545,1548
7,165,180
8,1519,1520
9,1918,1922


In [27]:
years["start_date"] = years["start_date"].str.replace("\W", "")
years["finish_date"] = years["finish_date"].str.replace("\W", "")
years

  years["start_date"] = years["start_date"].str.replace("\W", "")
  years["finish_date"] = years["finish_date"].str.replace("\W", "")


Unnamed: 0,start_date,finish_date
0,1346,1353
1,1918,1920
2,541,549
3,1981,2022
4,2019,2022
5,1855,1960
6,1545,1548
7,165,180
8,1519,1520
9,1918,1922


In [28]:
cleaned_df4 = pd.concat([cleaned_df_copy3, years], axis=1)
cleaned_df4=cleaned_df4.drop("Date", axis=1)
cleaned_df4

Unnamed: 0,Rank,Epidemics/pandemics,Disease,Location,minimum_dealth_toll,maximum_death_toll,average_death_toll,minimum_population_lost,maximum_population_lost,minimum_reg_population_lost,maximum_reg_population_lost,start_date,finish_date
0,1,Black Death,Bubonic plague,"Europe, Asia, and North Africa",75.0,200.0,137.5,17,54,30,60,1346,1353
1,2,Spanish flu,Influenza A/H1N1,Worldwide,17.0,100.0,58.5,1,5.4,unknown,unknown,1918,1920
2,3,Plague of Justinian,Bubonic plague,"North Africa, Europe and West Asia",15.0,100.0,57.5,7,56,25,60,541,549
3,4,HIV/AIDS global pandemic,HIV/AIDS,Worldwide,40.1,40.1,40.1,unknown,unknown,unknown,unknown,1981,2022
4,5,COVID-19 pandemic,COVID-19,Worldwide,7.0,28.0,17.5,0.1,0.4,unknown,unknown,2019,2022
5,6,Third plague pandemic,Bubonic plague,Worldwide,12.0,15.0,13.5,unknown,unknown,unknown,unknown,1855,1960
6,7,Cocoliztli epidemic of 1545–1548,Cocoliztli,Mexico,5.0,15.0,10.0,1,3,27,80,1545,1548
7,8,Antonine Plague,Smallpox or measles,Roman Empire,5.0,10.0,7.5,3,6,25,33,165,180
8,9,1520 Mexico smallpox epidemic,Smallpox,Mexico,5.0,8.0,6.5,1,2,23,37,1519,1520
9,10,1918–1922 Russia typhus epidemic,Typhus,Russia,2.0,3.0,2.5,0.1,0.16,1,1.6,1918,1922


In [29]:
result = cleaned_df4.dtypes

print("Output:")
print(result)

Output:
Rank                             int64
Epidemics/pandemics             object
Disease                         object
Location                        object
minimum_dealth_toll            float64
maximum_death_toll             float64
average_death_toll             float64
minimum_population_lost         object
maximum_population_lost         object
minimum_reg_population_lost     object
maximum_reg_population_lost     object
start_date                      object
finish_date                     object
dtype: object


In [30]:
cleaned_df4["start_date"] = pd.to_numeric(cleaned_df4["start_date"])
cleaned_df4["finish_date"] = pd.to_numeric(cleaned_df4["finish_date"])

In [31]:
cleaned_df4['Duration (years)'] = cleaned_df4["finish_date"] - cleaned_df4["start_date"]

In [32]:
cleaned_df4

Unnamed: 0,Rank,Epidemics/pandemics,Disease,Location,minimum_dealth_toll,maximum_death_toll,average_death_toll,minimum_population_lost,maximum_population_lost,minimum_reg_population_lost,maximum_reg_population_lost,start_date,finish_date,Duration (years)
0,1,Black Death,Bubonic plague,"Europe, Asia, and North Africa",75.0,200.0,137.5,17,54,30,60,1346,1353,7
1,2,Spanish flu,Influenza A/H1N1,Worldwide,17.0,100.0,58.5,1,5.4,unknown,unknown,1918,1920,2
2,3,Plague of Justinian,Bubonic plague,"North Africa, Europe and West Asia",15.0,100.0,57.5,7,56,25,60,541,549,8
3,4,HIV/AIDS global pandemic,HIV/AIDS,Worldwide,40.1,40.1,40.1,unknown,unknown,unknown,unknown,1981,2022,41
4,5,COVID-19 pandemic,COVID-19,Worldwide,7.0,28.0,17.5,0.1,0.4,unknown,unknown,2019,2022,3
5,6,Third plague pandemic,Bubonic plague,Worldwide,12.0,15.0,13.5,unknown,unknown,unknown,unknown,1855,1960,105
6,7,Cocoliztli epidemic of 1545–1548,Cocoliztli,Mexico,5.0,15.0,10.0,1,3,27,80,1545,1548,3
7,8,Antonine Plague,Smallpox or measles,Roman Empire,5.0,10.0,7.5,3,6,25,33,165,180,15
8,9,1520 Mexico smallpox epidemic,Smallpox,Mexico,5.0,8.0,6.5,1,2,23,37,1519,1520,1
9,10,1918–1922 Russia typhus epidemic,Typhus,Russia,2.0,3.0,2.5,0.1,0.16,1,1.6,1918,1922,4


In [33]:
cleaned_df4 = cleaned_df4.rename(columns={'minimum_dealth_toll': 'Minimum Death Toll', 'maximum_death_toll': 'Maximum Death Toll',
                                        'average_death_toll':'Average Death Toll','minimum_reg_population_lost':'Minimum Regional Population Lost',
                                        'maximum_reg_population_lost':'Maximum Regional Population Lost' ,
                                        'start_date':'Year Pandemic Started','finish_date':'Year Pandemic Finished' })

In [34]:
disease_df = cleaned_df4
disease_df

Unnamed: 0,Rank,Epidemics/pandemics,Disease,Location,Minimum Death Toll,Maximum Death Toll,Average Death Toll,minimum_population_lost,maximum_population_lost,Minimum Regional Population Lost,Maximum Regional Population Lost,Year Pandemic Started,Year Pandemic Finished,Duration (years)
0,1,Black Death,Bubonic plague,"Europe, Asia, and North Africa",75.0,200.0,137.5,17,54,30,60,1346,1353,7
1,2,Spanish flu,Influenza A/H1N1,Worldwide,17.0,100.0,58.5,1,5.4,unknown,unknown,1918,1920,2
2,3,Plague of Justinian,Bubonic plague,"North Africa, Europe and West Asia",15.0,100.0,57.5,7,56,25,60,541,549,8
3,4,HIV/AIDS global pandemic,HIV/AIDS,Worldwide,40.1,40.1,40.1,unknown,unknown,unknown,unknown,1981,2022,41
4,5,COVID-19 pandemic,COVID-19,Worldwide,7.0,28.0,17.5,0.1,0.4,unknown,unknown,2019,2022,3
5,6,Third plague pandemic,Bubonic plague,Worldwide,12.0,15.0,13.5,unknown,unknown,unknown,unknown,1855,1960,105
6,7,Cocoliztli epidemic of 1545–1548,Cocoliztli,Mexico,5.0,15.0,10.0,1,3,27,80,1545,1548,3
7,8,Antonine Plague,Smallpox or measles,Roman Empire,5.0,10.0,7.5,3,6,25,33,165,180,15
8,9,1520 Mexico smallpox epidemic,Smallpox,Mexico,5.0,8.0,6.5,1,2,23,37,1519,1520,1
9,10,1918–1922 Russia typhus epidemic,Typhus,Russia,2.0,3.0,2.5,0.1,0.16,1,1.6,1918,1922,4


In [35]:
major_outbreaks = disease_df[["Rank", "Epidemics/pandemics", "minimum_population_lost", "maximum_population_lost", "Minimum Regional Population Lost","Maximum Regional Population Lost", "Duration (years)"]]
major_outbreaks = major_outbreaks.rename(columns={'Rank': 'rank_id',
                                                  'Epidemics/pandemics': 'events',
                                                  'minimum_population_lost': 'min_global_population_lost_percent',
                                                  'maximum_population_lost': 'max_global_population_lost_percent',
                                                  'Minimum Regional Population Lost': 'min_regional_population_lost_percent',
                                                  'Maximum Regional Population Lost': 'max_regional_population_lost_percent',
                                                  'Duration (years)': 'duration_years'})

major_outbreaks

Unnamed: 0,rank_id,events,min_global_population_lost_percent,max_global_population_lost_percent,min_regional_population_lost_percent,max_regional_population_lost_percent,duration_years
0,1,Black Death,17,54,30,60,7
1,2,Spanish flu,1,5.4,unknown,unknown,2
2,3,Plague of Justinian,7,56,25,60,8
3,4,HIV/AIDS global pandemic,unknown,unknown,unknown,unknown,41
4,5,COVID-19 pandemic,0.1,0.4,unknown,unknown,3
5,6,Third plague pandemic,unknown,unknown,unknown,unknown,105
6,7,Cocoliztli epidemic of 1545–1548,1,3,27,80,3
7,8,Antonine Plague,3,6,25,33,15
8,9,1520 Mexico smallpox epidemic,1,2,23,37,1
9,10,1918–1922 Russia typhus epidemic,0.1,0.16,1,1.6,4


### File-2 (df_2.csv) - Nandhini

In [36]:
#download the required csv file
df_2 = pd.read_csv("./Resources/df_2.csv")
df_2

Unnamed: 0.1,Unnamed: 0,Event,Date,Location,Disease,Death toll (estimate),Ref.
0,0,1350 BC plague of Megiddo,1350 BC,"Megiddo, land of Canaan","Amarna letters EA 244, Biridiya, mayor of Megi...",Unknown,[25]
1,1,Plague of Athens,429–426 BC,"Greece, Libya, Egypt, Ethiopia","Unknown, possibly typhus, typhoid fever or vir...","75,000–100,000",[26][27][28][29]
2,2,412 BC epidemic,412 BC,"Greece (Northern Greece, Roman Republic)","Unknown, possibly influenza",Unknown,[30]
3,3,Antonine Plague,165–180 (possibly up to 190),Roman Empire,"Unknown, possibly smallpox",5–10 million,[31][32]
4,4,Jian'an Plague,217,Han Dynasty,"Unknown, possibly typhoid fever or viral hemor...",Unknown,[33][34]
...,...,...,...,...,...,...,...
248,248,2020 Nigeria yellow fever epidemic,2020,Nigeria,Yellow fever,296 (as of 31 December 2020),[306]
249,249,2021 India black fungus epidemic,2021–present,India,Black fungus / COVID-19 associated mucormycosis,4332,[307]
250,250,2022 hepatitis of unknown origin in children,2021–present,Worldwide,Hepatitis by Adenovirus variant AF41 (Unconfir...,18,[308][309][310]
251,251,2022 monkeypox outbreak,2022–present,Worldwide,Monkeypox virus,136,[311][312][313][314]


In [37]:
# Select the required columns
df_2 = df_2[["Event", "Date", "Location", "Disease", "Death toll (estimate)"]]
df_2

Unnamed: 0,Event,Date,Location,Disease,Death toll (estimate)
0,1350 BC plague of Megiddo,1350 BC,"Megiddo, land of Canaan","Amarna letters EA 244, Biridiya, mayor of Megi...",Unknown
1,Plague of Athens,429–426 BC,"Greece, Libya, Egypt, Ethiopia","Unknown, possibly typhus, typhoid fever or vir...","75,000–100,000"
2,412 BC epidemic,412 BC,"Greece (Northern Greece, Roman Republic)","Unknown, possibly influenza",Unknown
3,Antonine Plague,165–180 (possibly up to 190),Roman Empire,"Unknown, possibly smallpox",5–10 million
4,Jian'an Plague,217,Han Dynasty,"Unknown, possibly typhoid fever or viral hemor...",Unknown
...,...,...,...,...,...
248,2020 Nigeria yellow fever epidemic,2020,Nigeria,Yellow fever,296 (as of 31 December 2020)
249,2021 India black fungus epidemic,2021–present,India,Black fungus / COVID-19 associated mucormycosis,4332
250,2022 hepatitis of unknown origin in children,2021–present,Worldwide,Hepatitis by Adenovirus variant AF41 (Unconfir...,18
251,2022 monkeypox outbreak,2022–present,Worldwide,Monkeypox virus,136


In [38]:
# Copy the Pandemic names from the file1, so the data on both the files match. 
# This will help creating primary/foreign keys in sql.
df2 = pd.merge(df_2,disease_outbreaks_df[["Date","Epidemics/pandemics"]],on="Date",how="left")
df2

Unnamed: 0,Event,Date,Location,Disease,Death toll (estimate),Epidemics/pandemics
0,1350 BC plague of Megiddo,1350 BC,"Megiddo, land of Canaan","Amarna letters EA 244, Biridiya, mayor of Megi...",Unknown,
1,Plague of Athens,429–426 BC,"Greece, Libya, Egypt, Ethiopia","Unknown, possibly typhus, typhoid fever or vir...","75,000–100,000",
2,412 BC epidemic,412 BC,"Greece (Northern Greece, Roman Republic)","Unknown, possibly influenza",Unknown,
3,Antonine Plague,165–180 (possibly up to 190),Roman Empire,"Unknown, possibly smallpox",5–10 million,Antonine Plague
4,Jian'an Plague,217,Han Dynasty,"Unknown, possibly typhoid fever or viral hemor...",Unknown,
...,...,...,...,...,...,...
248,2020 Nigeria yellow fever epidemic,2020,Nigeria,Yellow fever,296 (as of 31 December 2020),
249,2021 India black fungus epidemic,2021–present,India,Black fungus / COVID-19 associated mucormycosis,4332,
250,2022 hepatitis of unknown origin in children,2021–present,Worldwide,Hepatitis by Adenovirus variant AF41 (Unconfir...,18,
251,2022 monkeypox outbreak,2022–present,Worldwide,Monkeypox virus,136,


In [39]:
df2["Event"] = np.where(df2["Epidemics/pandemics"].isnull(), df2["Event"], df2["Epidemics/pandemics"])
df2

Unnamed: 0,Event,Date,Location,Disease,Death toll (estimate),Epidemics/pandemics
0,1350 BC plague of Megiddo,1350 BC,"Megiddo, land of Canaan","Amarna letters EA 244, Biridiya, mayor of Megi...",Unknown,
1,Plague of Athens,429–426 BC,"Greece, Libya, Egypt, Ethiopia","Unknown, possibly typhus, typhoid fever or vir...","75,000–100,000",
2,412 BC epidemic,412 BC,"Greece (Northern Greece, Roman Republic)","Unknown, possibly influenza",Unknown,
3,Antonine Plague,165–180 (possibly up to 190),Roman Empire,"Unknown, possibly smallpox",5–10 million,Antonine Plague
4,Jian'an Plague,217,Han Dynasty,"Unknown, possibly typhoid fever or viral hemor...",Unknown,
...,...,...,...,...,...,...
248,2020 Nigeria yellow fever epidemic,2020,Nigeria,Yellow fever,296 (as of 31 December 2020),
249,2021 India black fungus epidemic,2021–present,India,Black fungus / COVID-19 associated mucormycosis,4332,
250,2022 hepatitis of unknown origin in children,2021–present,Worldwide,Hepatitis by Adenovirus variant AF41 (Unconfir...,18,
251,2022 monkeypox outbreak,2022–present,Worldwide,Monkeypox virus,136,


In [40]:
df2 = df2.drop("Epidemics/pandemics",axis=1)
df2 = df2.drop_duplicates(["Event"],keep="last")
df2 = df2.reset_index(drop=True)
df2

Unnamed: 0,Event,Date,Location,Disease,Death toll (estimate)
0,1350 BC plague of Megiddo,1350 BC,"Megiddo, land of Canaan","Amarna letters EA 244, Biridiya, mayor of Megi...",Unknown
1,Plague of Athens,429–426 BC,"Greece, Libya, Egypt, Ethiopia","Unknown, possibly typhus, typhoid fever or vir...","75,000–100,000"
2,412 BC epidemic,412 BC,"Greece (Northern Greece, Roman Republic)","Unknown, possibly influenza",Unknown
3,Antonine Plague,165–180 (possibly up to 190),Roman Empire,"Unknown, possibly smallpox",5–10 million
4,Jian'an Plague,217,Han Dynasty,"Unknown, possibly typhoid fever or viral hemor...",Unknown
...,...,...,...,...,...
247,2020 Nigeria yellow fever epidemic,2020,Nigeria,Yellow fever,296 (as of 31 December 2020)
248,2021 India black fungus epidemic,2021–present,India,Black fungus / COVID-19 associated mucormycosis,4332
249,2022 hepatitis of unknown origin in children,2021–present,Worldwide,Hepatitis by Adenovirus variant AF41 (Unconfir...,18
250,2022 monkeypox outbreak,2022–present,Worldwide,Monkeypox virus,136


**Clean-up the Date column as follows:**

    - Split the column into 2: Start Date and End Date
    
    - Create a column to capture the BC and AD details
    
    - Remove the unnecessary text
    
    - Replace the value "present" with 9999. This helps with dataload as this is a numeric field.

In [41]:
# Define the functions as required

# Function to capture start and end values
def get_start_end(alist):
    if len(alist) == 1:
        return alist * 2
    return alist

# Function to capture the BC and AD details
def get_bc_ad(dlist):
    if len(dlist) > 1:
        if dlist[1] != "BC":
            dlist[1] = "AD"
            return dlist
        else:
            return dlist
    else:
        dlist.append("AD")
        return dlist

In [42]:
# Split the values in Date column by hyphen
df2["Date"] = df2['Date'].str.replace('−','–')
df2["Date"] = df2["Date"].str.split("–")

# Capture the start and end values using the function get_start_end
df2["Date"] = df2["Date"].apply(get_start_end)

# Store both the start and end values into a list
epidemic_date = df2["Date"].apply(pd.Series)
epidemic_date.columns = ["Start_Date", "End_Date"]
epidemic_date

Unnamed: 0,Start_Date,End_Date
0,1350 BC,1350 BC
1,429,426 BC
2,412 BC,412 BC
3,165,180 (possibly up to 190)
4,217,217
...,...,...
247,2020,2020
248,2021,present
249,2021,present
250,2022,present


In [43]:
# Capture only the year values from Start Date, removing all other text
epidemic_date["Start_Date"] = epidemic_date["Start_Date"].str.split(" ").str[0]

# Capture only the year values from End Date
epidemic_date["End_Date"] = epidemic_date["End_Date"].str.split(" ",1)

# Capture the BC and AD values, using the function get_bc_ad function defined above
epidemic_date["End_Date"] = epidemic_date["End_Date"].apply(get_bc_ad)

# Store the End date and BC/AD details in a list
epidemic_period = epidemic_date["End_Date"].apply(pd.Series)
epidemic_period.columns = ["End_Date", "BC_AD"]

# Replace the string "present" with 9999
epidemic_period["End_Date"] = epidemic_period["End_Date"].replace("present",9999)

epidemic_period

Unnamed: 0,End_Date,BC_AD
0,1350,BC
1,426,BC
2,412,BC
3,180,AD
4,217,AD
...,...,...
247,2020,AD
248,9999,AD
249,9999,AD
250,9999,AD


In [44]:
# Drop the unnecessary columns from the lists created above and merge as required
epidemic_date = epidemic_date.drop("End_Date",axis=1)

epidemic_date_cleaned = pd.concat([epidemic_date,epidemic_period],axis=1)

# Create a dataframe to capture the clean date values
df2_cleaned = pd.concat([df2, epidemic_date_cleaned],axis=1)

# Drop the original Date column
df2_cleaned = df2_cleaned.drop("Date",axis=1)
df2_cleaned

Unnamed: 0,Event,Location,Disease,Death toll (estimate),Start_Date,End_Date,BC_AD
0,1350 BC plague of Megiddo,"Megiddo, land of Canaan","Amarna letters EA 244, Biridiya, mayor of Megi...",Unknown,1350,1350,BC
1,Plague of Athens,"Greece, Libya, Egypt, Ethiopia","Unknown, possibly typhus, typhoid fever or vir...","75,000–100,000",429,426,BC
2,412 BC epidemic,"Greece (Northern Greece, Roman Republic)","Unknown, possibly influenza",Unknown,412,412,BC
3,Antonine Plague,Roman Empire,"Unknown, possibly smallpox",5–10 million,165,180,AD
4,Jian'an Plague,Han Dynasty,"Unknown, possibly typhoid fever or viral hemor...",Unknown,217,217,AD
...,...,...,...,...,...,...,...
247,2020 Nigeria yellow fever epidemic,Nigeria,Yellow fever,296 (as of 31 December 2020),2020,2020,AD
248,2021 India black fungus epidemic,India,Black fungus / COVID-19 associated mucormycosis,4332,2021,9999,AD
249,2022 hepatitis of unknown origin in children,Worldwide,Hepatitis by Adenovirus variant AF41 (Unconfir...,18,2021,9999,AD
250,2022 monkeypox outbreak,Worldwide,Monkeypox virus,136,2022,9999,AD


**Clean-up the "Death toll (estimate)" column as follows:**

    - Capture all data in this column into a new column as Comments, so the existing comments in this column aren't lost

    - Remove the word million and capture million in numbers (eg., 2 million captured as 2,000,000)

    - Remove the commas and other formatting from the numbers

    - Split the column to capture the Minimum and Maximum values separately

    - Replace the string "Unknown" by blanks. This helps with dataload as this is an integer field.

In [45]:
# Rename the Death toll column (this is done so comments aren't lost in ETL)
df2_cleaned.rename(columns = {"Death toll (estimate)":"Comments Death toll (estimate)"}, inplace = True)
df2_cleaned

Unnamed: 0,Event,Location,Disease,Comments Death toll (estimate),Start_Date,End_Date,BC_AD
0,1350 BC plague of Megiddo,"Megiddo, land of Canaan","Amarna letters EA 244, Biridiya, mayor of Megi...",Unknown,1350,1350,BC
1,Plague of Athens,"Greece, Libya, Egypt, Ethiopia","Unknown, possibly typhus, typhoid fever or vir...","75,000–100,000",429,426,BC
2,412 BC epidemic,"Greece (Northern Greece, Roman Republic)","Unknown, possibly influenza",Unknown,412,412,BC
3,Antonine Plague,Roman Empire,"Unknown, possibly smallpox",5–10 million,165,180,AD
4,Jian'an Plague,Han Dynasty,"Unknown, possibly typhoid fever or viral hemor...",Unknown,217,217,AD
...,...,...,...,...,...,...,...
247,2020 Nigeria yellow fever epidemic,Nigeria,Yellow fever,296 (as of 31 December 2020),2020,2020,AD
248,2021 India black fungus epidemic,India,Black fungus / COVID-19 associated mucormycosis,4332,2021,9999,AD
249,2022 hepatitis of unknown origin in children,Worldwide,Hepatitis by Adenovirus variant AF41 (Unconfir...,18,2021,9999,AD
250,2022 monkeypox outbreak,Worldwide,Monkeypox virus,136,2022,9999,AD


In [46]:
# Define the functions as required

# Function to replace the word million with actual value and to remove commas from the numbers
def replace_millions(string):
    
    # find the word million; if not pressent, remove the commas from the number and return it as such    
    try:
        million_index = string.index("million")
    except:
        return (string.replace(",",""))

    # if million is found, check if the string has one or 2 values (using hyphen)
    else:
        try:
            # if there is a hyphen in the string, it has 2 values. so muliply both the values with 1000000 
            hyphen_index = string.index("–")
        except:
            # if there is no hyphen, then multiply this one value with 1000000
            new_string = string[:million_index]
            new_string = new_string.strip()
            try:
                new_string = int(new_string) * 1000000
            except:
                new_string = float(new_string) * 1000000
            return str(new_string)

        else:
            string1 = string.split("–",1)
            string1[0] = string1[0].strip()
            try:
                string1[0] = int(string1[0]) * 1000000
            except:
                string1[0] = float(string1[0]) * 1000000
                
            
            million_index = string1[1].index("million")
            string1[1] = string1[1][:million_index]
            string1[1] = string1[1].strip()
            try:
                string1[1] = int(string1[1]) * 1000000
            except:
                string1[1] = float(string1[1]) * 1000000
            return (str(string1[0])+"–"+str(string1[1]))

In [47]:
# use the function replace_millions to capture million in numbers
df2_cleaned["Death toll"] = df2_cleaned["Comments Death toll (estimate)"].apply(replace_millions)
df2_cleaned.head(15)

Unnamed: 0,Event,Location,Disease,Comments Death toll (estimate),Start_Date,End_Date,BC_AD,Death toll
0,1350 BC plague of Megiddo,"Megiddo, land of Canaan","Amarna letters EA 244, Biridiya, mayor of Megi...",Unknown,1350,1350,BC,Unknown
1,Plague of Athens,"Greece, Libya, Egypt, Ethiopia","Unknown, possibly typhus, typhoid fever or vir...","75,000–100,000",429,426,BC,75000–100000
2,412 BC epidemic,"Greece (Northern Greece, Roman Republic)","Unknown, possibly influenza",Unknown,412,412,BC,Unknown
3,Antonine Plague,Roman Empire,"Unknown, possibly smallpox",5–10 million,165,180,AD,5000000–10000000
4,Jian'an Plague,Han Dynasty,"Unknown, possibly typhoid fever or viral hemor...",Unknown,217,217,AD,Unknown
5,Plague of Cyprian,Europe,"Unknown, possibly smallpox",Unknown,250,266,AD,Unknown
6,Plague of Justinian,Europe and West Asia,Bubonic plague,15–100 million,541,549,AD,15000000–100000000
7,Roman Plague of 590 (part of first plague pand...,"Rome, Byzantine Empire",Bubonic plague,Unknown,590,590,AD,Unknown
8,Plague of Sheroe (part of First plague pandemic),Bilad al-Sham,Bubonic plague,"25,000+",627,628,AD,25000+
9,Plague of Amwas (part of first plague pandemic),"Byzantine Empire, West Asia, Africa",Bubonic plague,"25,000+",638,639,AD,25000+


In [48]:
# Split the column to capture the Minimum and Maximum values separately
df2_cleaned["Death toll"] = df2_cleaned["Death toll"].str.split("–",1)

# Capture the min and max values using the function get_start_end
df2_cleaned["Death toll"] = df2_cleaned["Death toll"].apply(get_start_end)

# Store both the min and max values into a list
death_toll = df2_cleaned["Death toll"].apply(pd.Series)
death_toll.columns = ["Min_Death", "Max_Death"]
death_toll.head(15)

Unnamed: 0,Min_Death,Max_Death
0,Unknown,Unknown
1,75000,100000
2,Unknown,Unknown
3,5000000,10000000
4,Unknown,Unknown
5,Unknown,Unknown
6,15000000,100000000
7,Unknown,Unknown
8,25000+,25000+
9,25000+,25000+


In [49]:
# Remove the unneccesary text from the data and replace "Unknown" with null
death_toll["Min_Death_Estimate"]=(death_toll["Min_Death"].str.extract('(\d+)')
                                                         .astype('Int64'))

death_toll["Max_Death_Estimate"]=(death_toll["Max_Death"].str.extract('(\d+)')
                                                         .astype('Int64'))
death_toll.head(15)

Unnamed: 0,Min_Death,Max_Death,Min_Death_Estimate,Max_Death_Estimate
0,Unknown,Unknown,,
1,75000,100000,75000.0,100000.0
2,Unknown,Unknown,,
3,5000000,10000000,5000000.0,10000000.0
4,Unknown,Unknown,,
5,Unknown,Unknown,,
6,15000000,100000000,15000000.0,100000000.0
7,Unknown,Unknown,,
8,25000+,25000+,25000.0,25000.0
9,25000+,25000+,25000.0,25000.0


In [50]:
# merge the cleaned dataframe to the main dataframe
events_details = pd.concat([df2_cleaned, death_toll],axis=1)

# Drop the unnecessary columns
events_details = events_details.drop(columns=["Death toll","Min_Death","Max_Death"],axis=1)

events_details

Unnamed: 0,Event,Location,Disease,Comments Death toll (estimate),Start_Date,End_Date,BC_AD,Min_Death_Estimate,Max_Death_Estimate
0,1350 BC plague of Megiddo,"Megiddo, land of Canaan","Amarna letters EA 244, Biridiya, mayor of Megi...",Unknown,1350,1350,BC,,
1,Plague of Athens,"Greece, Libya, Egypt, Ethiopia","Unknown, possibly typhus, typhoid fever or vir...","75,000–100,000",429,426,BC,75000,100000
2,412 BC epidemic,"Greece (Northern Greece, Roman Republic)","Unknown, possibly influenza",Unknown,412,412,BC,,
3,Antonine Plague,Roman Empire,"Unknown, possibly smallpox",5–10 million,165,180,AD,5000000,10000000
4,Jian'an Plague,Han Dynasty,"Unknown, possibly typhoid fever or viral hemor...",Unknown,217,217,AD,,
...,...,...,...,...,...,...,...,...,...
247,2020 Nigeria yellow fever epidemic,Nigeria,Yellow fever,296 (as of 31 December 2020),2020,2020,AD,296,296
248,2021 India black fungus epidemic,India,Black fungus / COVID-19 associated mucormycosis,4332,2021,9999,AD,4332,4332
249,2022 hepatitis of unknown origin in children,Worldwide,Hepatitis by Adenovirus variant AF41 (Unconfir...,18,2021,9999,AD,18,18
250,2022 monkeypox outbreak,Worldwide,Monkeypox virus,136,2022,9999,AD,136,136


In [51]:
events_details.rename(columns = {"Comments Death toll (estimate)": "comments_death_toll",
                                 "Event": "events",
                                 "Location": "location",
                                 "Disease": "disease",
                                 "Start_Date": "start_year",
                                 "End_Date": "end_year",
                                 "BC_AD": "bc_ad",
                                 "Min_Death_Estimate": "min_death_toll",
                                 "Max_Death_Estimate": "max_death_toll"}, inplace = True)
events_details

Unnamed: 0,events,location,disease,comments_death_toll,start_year,end_year,bc_ad,min_death_toll,max_death_toll
0,1350 BC plague of Megiddo,"Megiddo, land of Canaan","Amarna letters EA 244, Biridiya, mayor of Megi...",Unknown,1350,1350,BC,,
1,Plague of Athens,"Greece, Libya, Egypt, Ethiopia","Unknown, possibly typhus, typhoid fever or vir...","75,000–100,000",429,426,BC,75000,100000
2,412 BC epidemic,"Greece (Northern Greece, Roman Republic)","Unknown, possibly influenza",Unknown,412,412,BC,,
3,Antonine Plague,Roman Empire,"Unknown, possibly smallpox",5–10 million,165,180,AD,5000000,10000000
4,Jian'an Plague,Han Dynasty,"Unknown, possibly typhoid fever or viral hemor...",Unknown,217,217,AD,,
...,...,...,...,...,...,...,...,...,...
247,2020 Nigeria yellow fever epidemic,Nigeria,Yellow fever,296 (as of 31 December 2020),2020,2020,AD,296,296
248,2021 India black fungus epidemic,India,Black fungus / COVID-19 associated mucormycosis,4332,2021,9999,AD,4332,4332
249,2022 hepatitis of unknown origin in children,Worldwide,Hepatitis by Adenovirus variant AF41 (Unconfir...,18,2021,9999,AD,18,18
250,2022 monkeypox outbreak,Worldwide,Monkeypox virus,136,2022,9999,AD,136,136


### File-3 (df_4.csv) - Salma

In [52]:
#Store CSV into Dataframe
csv_file = "./Resources/df_4.csv"
disaster_category_df = pd.read_csv(csv_file)
#Show df
disaster_category_df

Unnamed: 0.1,Unnamed: 0,0,1
0,0,vteNatural disasters – list by death tollGeolo...,vteNatural disasters – list by death tollGeolo...
1,1,vteNatural disasters – list by death toll,vteNatural disasters – list by death toll
2,2,Geological,Mass wasting Landslide Avalanche Mudflow Debri...
3,3,Mass wasting,Landslide Avalanche Mudflow Debris flow
4,4,Earthquake (List),Seismic hazard Seismic risk Soil liquefaction
5,5,Volcano eruption,Pyroclastic flow Lahar Volcanic ash
6,6,Natural erosion,Sinkhole
7,7,Hydrological,Flood (List) Coastal flood Flash flood Storm s...
8,8,Flood (List),Coastal flood Flash flood Storm surge
9,9,Other,Tsunami Megatsunami Limnic eruption


In [53]:
#Filter the dataframe for specific columns.
New_disaster_category_df = disaster_category_df[['Unnamed: 0', '0', '1']].copy()

#Print results
New_disaster_category_df

Unnamed: 0.1,Unnamed: 0,0,1
0,0,vteNatural disasters – list by death tollGeolo...,vteNatural disasters – list by death tollGeolo...
1,1,vteNatural disasters – list by death toll,vteNatural disasters – list by death toll
2,2,Geological,Mass wasting Landslide Avalanche Mudflow Debri...
3,3,Mass wasting,Landslide Avalanche Mudflow Debris flow
4,4,Earthquake (List),Seismic hazard Seismic risk Soil liquefaction
5,5,Volcano eruption,Pyroclastic flow Lahar Volcanic ash
6,6,Natural erosion,Sinkhole
7,7,Hydrological,Flood (List) Coastal flood Flash flood Storm s...
8,8,Flood (List),Coastal flood Flash flood Storm surge
9,9,Other,Tsunami Megatsunami Limnic eruption


In [54]:
#Renaming the columns
New_disaster_category_df = New_disaster_category_df.rename(columns={"Unnamed: 0": "Death Toll Rank",
                                                          "0": "Natural Disaster Category",
                                                          "1": "Natural Disaster Subcategory"})


New_disaster_category_df

Unnamed: 0,Death Toll Rank,Natural Disaster Category,Natural Disaster Subcategory
0,0,vteNatural disasters – list by death tollGeolo...,vteNatural disasters – list by death tollGeolo...
1,1,vteNatural disasters – list by death toll,vteNatural disasters – list by death toll
2,2,Geological,Mass wasting Landslide Avalanche Mudflow Debri...
3,3,Mass wasting,Landslide Avalanche Mudflow Debris flow
4,4,Earthquake (List),Seismic hazard Seismic risk Soil liquefaction
5,5,Volcano eruption,Pyroclastic flow Lahar Volcanic ash
6,6,Natural erosion,Sinkhole
7,7,Hydrological,Flood (List) Coastal flood Flash flood Storm s...
8,8,Flood (List),Coastal flood Flash flood Storm surge
9,9,Other,Tsunami Megatsunami Limnic eruption


In [55]:
#Dropping the first 2 rows

New_disaster_category_df.drop([0,1], axis=0, inplace=True)

New_disaster_category_df

Unnamed: 0,Death Toll Rank,Natural Disaster Category,Natural Disaster Subcategory
2,2,Geological,Mass wasting Landslide Avalanche Mudflow Debri...
3,3,Mass wasting,Landslide Avalanche Mudflow Debris flow
4,4,Earthquake (List),Seismic hazard Seismic risk Soil liquefaction
5,5,Volcano eruption,Pyroclastic flow Lahar Volcanic ash
6,6,Natural erosion,Sinkhole
7,7,Hydrological,Flood (List) Coastal flood Flash flood Storm s...
8,8,Flood (List),Coastal flood Flash flood Storm surge
9,9,Other,Tsunami Megatsunami Limnic eruption
10,10,Meteorological,Temperature Blizzard Cold wave Ice storm Hail ...
11,11,Temperature,Blizzard Cold wave Ice storm Hail Heat wave


In [56]:
New_disaster_category_df.columns

Index(['Death Toll Rank', 'Natural Disaster Category',
       'Natural Disaster Subcategory'],
      dtype='object')

In [57]:
New_disaster_category_df["Natural Disaster Subcategory"].iloc[0]

'Mass wasting Landslide Avalanche Mudflow Debris flow Earthquake (List) Seismic hazard Seismic risk Soil liquefaction Volcano eruption Pyroclastic flow Lahar Volcanic ash Natural erosion Sinkhole'

In [58]:
import re

def extract_words(string):
    return re.findall('(?:^|(?<= ))[A-Z][^A-Z]*', string)
# re.findall("[A-Z][^A-Z]*", text)

In [59]:
# dropping null value columns to avoid errors
New_disaster_category_df.dropna(inplace = True)

# new data frame with split value columns
new = New_disaster_category_df["Natural Disaster Subcategory"].apply(extract_words)

# df display
New_disaster_category_df

Unnamed: 0,Death Toll Rank,Natural Disaster Category,Natural Disaster Subcategory
2,2,Geological,Mass wasting Landslide Avalanche Mudflow Debri...
3,3,Mass wasting,Landslide Avalanche Mudflow Debris flow
4,4,Earthquake (List),Seismic hazard Seismic risk Soil liquefaction
5,5,Volcano eruption,Pyroclastic flow Lahar Volcanic ash
6,6,Natural erosion,Sinkhole
7,7,Hydrological,Flood (List) Coastal flood Flash flood Storm s...
8,8,Flood (List),Coastal flood Flash flood Storm surge
9,9,Other,Tsunami Megatsunami Limnic eruption
10,10,Meteorological,Temperature Blizzard Cold wave Ice storm Hail ...
11,11,Temperature,Blizzard Cold wave Ice storm Hail Heat wave


In [60]:
new

2     [Mass wasting , Landslide , Avalanche , Mudflo...
3       [Landslide , Avalanche , Mudflow , Debris flow]
4     [Seismic hazard , Seismic risk , Soil liquefac...
5             [Pyroclastic flow , Lahar , Volcanic ash]
6                                            [Sinkhole]
7     [Flood (, Coastal flood , Flash flood , Storm ...
8           [Coastal flood , Flash flood , Storm surge]
9             [Tsunami , Megatsunami , Limnic eruption]
10    [Temperature , Blizzard , Cold wave , Ice stor...
11    [Blizzard , Cold wave , Ice storm , Hail , Hea...
12                                        [Megadrought]
13          [Thunderstorm , Tornado , Tropical cyclone]
14                           [Wildfire , Firestorm , A]
15    [Potentially hazardous object , Impact event ,...
16    [Accidents, Rail , Maritime , Shipwreck , Air ...
17    [Transport , Rail , Maritime , Shipwreck , Air...
18    [Rail , Maritime , Shipwreck , Air , Spaceflight]
19    [Structural failures and collapses , Bridg

In [61]:
New1_disaster_category_df = pd.concat([New_disaster_category_df, new.apply(pd.Series)], axis=1)

In [62]:
New1_disaster_category_df

Unnamed: 0,Death Toll Rank,Natural Disaster Category,Natural Disaster Subcategory,0,1,2,3,4,5,6,...,24,25,26,27,28,29,30,31,32,33
2,2,Geological,Mass wasting Landslide Avalanche Mudflow Debri...,Mass wasting,Landslide,Avalanche,Mudflow,Debris flow,Earthquake (,Seismic hazard,...,,,,,,,,,,
3,3,Mass wasting,Landslide Avalanche Mudflow Debris flow,Landslide,Avalanche,Mudflow,Debris flow,,,,...,,,,,,,,,,
4,4,Earthquake (List),Seismic hazard Seismic risk Soil liquefaction,Seismic hazard,Seismic risk,Soil liquefaction,,,,,...,,,,,,,,,,
5,5,Volcano eruption,Pyroclastic flow Lahar Volcanic ash,Pyroclastic flow,Lahar,Volcanic ash,,,,,...,,,,,,,,,,
6,6,Natural erosion,Sinkhole,Sinkhole,,,,,,,...,,,,,,,,,,
7,7,Hydrological,Flood (List) Coastal flood Flash flood Storm s...,Flood (,Coastal flood,Flash flood,Storm surge,Other,Tsunami,Megatsunami,...,,,,,,,,,,
8,8,Flood (List),Coastal flood Flash flood Storm surge,Coastal flood,Flash flood,Storm surge,,,,,...,,,,,,,,,,
9,9,Other,Tsunami Megatsunami Limnic eruption,Tsunami,Megatsunami,Limnic eruption,,,,,...,,,,,,,,,,
10,10,Meteorological,Temperature Blizzard Cold wave Ice storm Hail ...,Temperature,Blizzard,Cold wave,Ice storm,Hail,Heat wave,Drought,...,,,,,,,,,,
11,11,Temperature,Blizzard Cold wave Ice storm Hail Heat wave,Blizzard,Cold wave,Ice storm,Hail,Heat wave,,,...,,,,,,,,,,


In [63]:
New1_disaster_category_df.drop('Natural Disaster Subcategory', inplace=True, axis=1)

In [64]:
New1_disaster_category_df

Unnamed: 0,Death Toll Rank,Natural Disaster Category,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
2,2,Geological,Mass wasting,Landslide,Avalanche,Mudflow,Debris flow,Earthquake (,Seismic hazard,Seismic risk,...,,,,,,,,,,
3,3,Mass wasting,Landslide,Avalanche,Mudflow,Debris flow,,,,,...,,,,,,,,,,
4,4,Earthquake (List),Seismic hazard,Seismic risk,Soil liquefaction,,,,,,...,,,,,,,,,,
5,5,Volcano eruption,Pyroclastic flow,Lahar,Volcanic ash,,,,,,...,,,,,,,,,,
6,6,Natural erosion,Sinkhole,,,,,,,,...,,,,,,,,,,
7,7,Hydrological,Flood (,Coastal flood,Flash flood,Storm surge,Other,Tsunami,Megatsunami,Limnic eruption,...,,,,,,,,,,
8,8,Flood (List),Coastal flood,Flash flood,Storm surge,,,,,,...,,,,,,,,,,
9,9,Other,Tsunami,Megatsunami,Limnic eruption,,,,,,...,,,,,,,,,,
10,10,Meteorological,Temperature,Blizzard,Cold wave,Ice storm,Hail,Heat wave,Drought,Megadrought,...,,,,,,,,,,
11,11,Temperature,Blizzard,Cold wave,Ice storm,Hail,Heat wave,,,,...,,,,,,,,,,


In [65]:
natural_disasters = New1_disaster_category_df
natural_disasters.rename(columns = {"Death Toll Rank": "death_toll_rank_id",
                                    "Natural Disaster Category": "natural_disaster_category",
                                    0: 'sc0', 
                                    1: 'sc1', 
                                    2 : 'sc2', 
                                    3 : 'sc3', 
                                    4 : 'sc4', 
                                    5 : 'sc5', 
                                    6 : 'sc6', 
                                    7 : 'sc7', 
                                    8 : 'sc8', 
                                    9 : 'sc9', 
                                    10 : 'sc10', 
                                    11 : 'sc11', 
                                    12 : 'sc12', 
                                    13 : 'sc13', 
                                    14 : 'sc14', 
                                    15 : 'sc15', 
                                    16 : 'sc16', 
                                    17 : 'sc17', 
                                    18 : 'sc18', 
                                    19 : 'sc19', 
                                    20 : 'sc20', 
                                    21 : 'sc21', 
                                    22 : 'sc22', 
                                    23 : 'sc23', 
                                    24 : 'sc24', 
                                    25 : 'sc25', 
                                    26 : 'sc26', 
                                    27 : 'sc27', 
                                    28 : 'sc28', 
                                    29 : 'sc29', 
                                    30 : 'sc30', 
                                    31 : 'sc31', 
                                    32 : 'sc32', 
                                    33 : 'sc33'}, inplace=True)
natural_disasters

Unnamed: 0,death_toll_rank_id,natural_disaster_category,sc0,sc1,sc2,sc3,sc4,sc5,sc6,sc7,...,sc24,sc25,sc26,sc27,sc28,sc29,sc30,sc31,sc32,sc33
2,2,Geological,Mass wasting,Landslide,Avalanche,Mudflow,Debris flow,Earthquake (,Seismic hazard,Seismic risk,...,,,,,,,,,,
3,3,Mass wasting,Landslide,Avalanche,Mudflow,Debris flow,,,,,...,,,,,,,,,,
4,4,Earthquake (List),Seismic hazard,Seismic risk,Soil liquefaction,,,,,,...,,,,,,,,,,
5,5,Volcano eruption,Pyroclastic flow,Lahar,Volcanic ash,,,,,,...,,,,,,,,,,
6,6,Natural erosion,Sinkhole,,,,,,,,...,,,,,,,,,,
7,7,Hydrological,Flood (,Coastal flood,Flash flood,Storm surge,Other,Tsunami,Megatsunami,Limnic eruption,...,,,,,,,,,,
8,8,Flood (List),Coastal flood,Flash flood,Storm surge,,,,,,...,,,,,,,,,,
9,9,Other,Tsunami,Megatsunami,Limnic eruption,,,,,,...,,,,,,,,,,
10,10,Meteorological,Temperature,Blizzard,Cold wave,Ice storm,Hail,Heat wave,Drought,Megadrought,...,,,,,,,,,,
11,11,Temperature,Blizzard,Cold wave,Ice storm,Hail,Heat wave,,,,...,,,,,,,,,,


### File-4 (df_16.csv) - Razvan

In [66]:
df_16 =pd.read_csv('./Resources/df_16.csv')
df_16

Unnamed: 0.1,Unnamed: 0,0,1
0,0,Ancient,Hittite plague (c. 1320–1300 BC) Plague of Ath...
1,1,Post-classical,Plague of Justinian (541–542) Roman Plague (59...
2,2,Early modern,16th century Influenza pandemic (1510) Mexican...
3,3,16th century,Influenza pandemic (1510) Mexican smallpox (15...
4,4,17th century,Maltese plague (1623) Italian plague (1629–163...
5,5,18th century,Great Northern War plague (1710–1712) Great Pl...
6,6,Modern,19th century Ottoman plague (1812–1819) Maltes...
7,7,19th century,Ottoman plague (1812–1819) Maltese plague (181...
8,8,20th century,San Francisco plague (1900–1904) Manchurian pl...
9,9,21st century,SARS (2002–2004) Midwest monkeypox (2003) Bird...


In [67]:
df16 = df_16.rename({'0': 'Period', '1': 'Epidemic'}, axis=1)
df16

Unnamed: 0.1,Unnamed: 0,Period,Epidemic
0,0,Ancient,Hittite plague (c. 1320–1300 BC) Plague of Ath...
1,1,Post-classical,Plague of Justinian (541–542) Roman Plague (59...
2,2,Early modern,16th century Influenza pandemic (1510) Mexican...
3,3,16th century,Influenza pandemic (1510) Mexican smallpox (15...
4,4,17th century,Maltese plague (1623) Italian plague (1629–163...
5,5,18th century,Great Northern War plague (1710–1712) Great Pl...
6,6,Modern,19th century Ottoman plague (1812–1819) Maltes...
7,7,19th century,Ottoman plague (1812–1819) Maltese plague (181...
8,8,20th century,San Francisco plague (1900–1904) Manchurian pl...
9,9,21st century,SARS (2002–2004) Midwest monkeypox (2003) Bird...


In [68]:
df16["Epidemic"][0]

'Hittite plague (c.\u20091320–1300 BC) Plague of Athens (429–426 BC) Antonine Plague (165–180 AD) Plague of Cyprian (250–266)'

In [69]:
new_columns_df = df16["Epidemic"].str.split(")", expand=True)

In [70]:
new_columns_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
0,Hittite plague (c. 1320–1300 BC,Plague of Athens (429–426 BC,Antonine Plague (165–180 AD,Plague of Cyprian (250–266,,,,,,,...,,,,,,,,,,
1,Plague of Justinian (541–542,Roman Plague (590,Plague of Sheroe (627–628,Plague of Amwas (638–639,Plague of 664 (664–689,Japanese smallpox (735–737,Black Death (1346–1353,Sweating sickness (1485–1551,,,...,,,,,,,,,,
2,16th century Influenza pandemic (1510,Mexican smallpox (1520,Influenza pandemic (1557–1559,London plague (1563–1564,Maltese plague (1592–1593,London plague (1592–1593,Atlantic plague (1596–1602,17th century Maltese plague (1623,Italian plague (1629–1631,Massachusetts smallpox (1633,...,,,,,,,,,,
3,Influenza pandemic (1510,Mexican smallpox (1520,Influenza pandemic (1557–1559,London plague (1563–1564,Maltese plague (1592–1593,London plague (1592–1593,Atlantic plague (1596–1602,,,,...,,,,,,,,,,
4,Maltese plague (1623,Italian plague (1629–1631,Massachusetts smallpox (1633,Great Plague in the late Ming dynasty (1633–1644,Great Plague of Seville (1647–1652,Maltese plague (1655,Naples Plague (1656,Great Plague of London (1665–1666,Maltese plague (1675–1676,Great Plague of Vienna (1679,...,,,,,,,,,,
5,Great Northern War plague (1710–1712,Great Plague of Marseille (1720–1722,Great Plague of 1738 (1738,Russian plague (1770–1772,Persian Plague (1772,North American smallpox (1780–1782,Philadelphia yellow fever (1793–1798,,,,...,,,,,,,,,,
6,19th century Ottoman plague (1812–1819,Maltese plague (1813–1814,Caragea's plague (1813,Groningen epidemic (1829,Great Plains smallpox (1837–1838,Typhus (1847–1848,Copenhagen cholera (1853,Stockholm cholera (1853,Broad Street cholera (1854,Guam smallpox (1856,...,Philippine measles (2019–present,Pacific NW measles (2019,New York measles (2019,Kuala Koh measles (2019,Tonga measles (2019,DRC measles (2019–2020,New Zealand measles (2019–2020,Singaporean dengue (2020,Uganda Ebola outbreak (2022,
7,Ottoman plague (1812–1819,Maltese plague (1813–1814,Caragea's plague (1813,Groningen epidemic (1829,Great Plains smallpox (1837–1838,Typhus (1847–1848,Copenhagen cholera (1853,Stockholm cholera (1853,Broad Street cholera (1854,Guam smallpox (1856,...,,,,,,,,,,
8,San Francisco plague (1900–1904,Manchurian plague (1910–1911,LA pneumonic plague (1924,Croydon typhoid (1937,NYC smallpox (1947,Wrocław smallpox (1963,Yugoslav smallpox (1972,London flu (1972–1973,Indian smallpox (1974,Surat plague (1994,...,,,,,,,,,,
9,SARS (2002–2004,Midwest monkeypox (2003,Bird flu (2003–2005,Singaporean dengue (2005,Indian dengue (2006,Chikungunya outbreaks (2006,Pakistani dengue (2006,Iraqi cholera (2007,Zimbabwean cholera (2008–2009,Madagascar plague (2008–2017,...,,,,,,,,,,


In [71]:
#new_columns_df.to_csv("Epidemics.csv", index = False)

In [72]:
period_df = pd.DataFrame().assign(Period=df16['Period'])
print(period_df)

           Period
0         Ancient
1  Post-classical
2    Early modern
3    16th century
4    17th century
5    18th century
6          Modern
7    19th century
8    20th century
9    21st century


In [73]:
#period_df.to_csv("Historic_periods.csv", index = False)

In [74]:
table = pd.concat([period_df, new_columns_df], join='outer', axis = 1, ignore_index=False,
          keys=None, levels=None, names=None, verify_integrity=False,
          copy=True)
table

Unnamed: 0,Period,0,1,2,3,4,5,6,7,8,...,62,63,64,65,66,67,68,69,70,71
0,Ancient,Hittite plague (c. 1320–1300 BC,Plague of Athens (429–426 BC,Antonine Plague (165–180 AD,Plague of Cyprian (250–266,,,,,,...,,,,,,,,,,
1,Post-classical,Plague of Justinian (541–542,Roman Plague (590,Plague of Sheroe (627–628,Plague of Amwas (638–639,Plague of 664 (664–689,Japanese smallpox (735–737,Black Death (1346–1353,Sweating sickness (1485–1551,,...,,,,,,,,,,
2,Early modern,16th century Influenza pandemic (1510,Mexican smallpox (1520,Influenza pandemic (1557–1559,London plague (1563–1564,Maltese plague (1592–1593,London plague (1592–1593,Atlantic plague (1596–1602,17th century Maltese plague (1623,Italian plague (1629–1631,...,,,,,,,,,,
3,16th century,Influenza pandemic (1510,Mexican smallpox (1520,Influenza pandemic (1557–1559,London plague (1563–1564,Maltese plague (1592–1593,London plague (1592–1593,Atlantic plague (1596–1602,,,...,,,,,,,,,,
4,17th century,Maltese plague (1623,Italian plague (1629–1631,Massachusetts smallpox (1633,Great Plague in the late Ming dynasty (1633–1644,Great Plague of Seville (1647–1652,Maltese plague (1655,Naples Plague (1656,Great Plague of London (1665–1666,Maltese plague (1675–1676,...,,,,,,,,,,
5,18th century,Great Northern War plague (1710–1712,Great Plague of Marseille (1720–1722,Great Plague of 1738 (1738,Russian plague (1770–1772,Persian Plague (1772,North American smallpox (1780–1782,Philadelphia yellow fever (1793–1798,,,...,,,,,,,,,,
6,Modern,19th century Ottoman plague (1812–1819,Maltese plague (1813–1814,Caragea's plague (1813,Groningen epidemic (1829,Great Plains smallpox (1837–1838,Typhus (1847–1848,Copenhagen cholera (1853,Stockholm cholera (1853,Broad Street cholera (1854,...,Philippine measles (2019–present,Pacific NW measles (2019,New York measles (2019,Kuala Koh measles (2019,Tonga measles (2019,DRC measles (2019–2020,New Zealand measles (2019–2020,Singaporean dengue (2020,Uganda Ebola outbreak (2022,
7,19th century,Ottoman plague (1812–1819,Maltese plague (1813–1814,Caragea's plague (1813,Groningen epidemic (1829,Great Plains smallpox (1837–1838,Typhus (1847–1848,Copenhagen cholera (1853,Stockholm cholera (1853,Broad Street cholera (1854,...,,,,,,,,,,
8,20th century,San Francisco plague (1900–1904,Manchurian plague (1910–1911,LA pneumonic plague (1924,Croydon typhoid (1937,NYC smallpox (1947,Wrocław smallpox (1963,Yugoslav smallpox (1972,London flu (1972–1973,Indian smallpox (1974,...,,,,,,,,,,
9,21st century,SARS (2002–2004,Midwest monkeypox (2003,Bird flu (2003–2005,Singaporean dengue (2005,Indian dengue (2006,Chikungunya outbreaks (2006,Pakistani dengue (2006,Iraqi cholera (2007,Zimbabwean cholera (2008–2009,...,,,,,,,,,,


In [75]:
dataframes = []

for index, row in table.iterrows():
    row = row.dropna()
    data = {"Epidemic": []}
    data["Epidemic"] = [r.strip() for r in row[1:] if r!=""]
    df = pd.DataFrame(data)
    df["Period"] = row["Period"]
    dataframes.append(df)
    
dataframe = pd.concat(dataframes, ignore_index=True)
dataframe

Unnamed: 0,Epidemic,Period
0,Hittite plague (c. 1320–1300 BC,Ancient
1,Plague of Athens (429–426 BC,Ancient
2,Antonine Plague (165–180 AD,Ancient
3,Plague of Cyprian (250–266,Ancient
4,Plague of Justinian (541–542,Post-classical
...,...,...
197,Tonga measles (2019,21st century
198,DRC measles (2019–2020,21st century
199,New Zealand measles (2019–2020,21st century
200,Singaporean dengue (2020,21st century


In [76]:
#dataframe.to_csv("Final_table_Razvan.csv", index = False)

In [77]:
#table.to_csv("Epidemics based on historical periods.csv", index = False)

In [78]:
periods = dataframe
periods.rename(columns = {"Epidemic": "epidemic",
                             "Period": "period"}, inplace=True)
periods = periods.drop_duplicates(["epidemic"],keep="last")
periods = periods.reset_index(drop=True)
periods

Unnamed: 0,epidemic,period
0,Hittite plague (c. 1320–1300 BC,Ancient
1,Plague of Athens (429–426 BC,Ancient
2,Antonine Plague (165–180 AD,Ancient
3,Plague of Cyprian (250–266,Ancient
4,Plague of Justinian (541–542,Post-classical
...,...,...
108,Tonga measles (2019,21st century
109,DRC measles (2019–2020,21st century
110,New Zealand measles (2019–2020,21st century
111,Singaporean dengue (2020,21st century


### File-5 (df_24.csv) - Stephen

In [79]:
# Read in the data
df = pd.read_csv('./Resources/df_24.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,vteHistory of medicine,vteHistory of medicine.1,vteHistory of medicine.2
0,0,Timeline of medicine and medical technology,Timeline of medicine and medical technology,Timeline of medicine and medical technology
1,1,Histories of basic sciences,Anatomy Biochemistry Biology Biotechnology Che...,
2,2,Histories of medical specialties,Alternative medicine Anesthesia General Neurax...,
3,3,Medicine in ancient societies,Prehistory Babylon Byzantinia Ancient Egypt Eg...,
4,4,History of methods in medicine,Antibiotics Timeline Blood transfusion Humoris...,


In [80]:
def read_rows(filename):
    with open(filename, 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            # skip the header
            if row[0].isnumeric():
                yield row


def extract_name(name_string):
    result_list = []
    med_list = name_string.split(' ')

    # Use a for loop to iterate over the list
    temp = ""
    for word in med_list:
        if word[0].isupper():
            # Add the word to the list
            if temp:
                result_list.append(temp)

            temp = word
        else:
            temp += " " + word

    result_list.append(temp)

    return result_list


def main():
    results = []
    for row in read_rows('Resources/df_24.csv'):
        title = row[1]

        name_string = row[2]

        result = []
        result.append(title)

        result += extract_name(name_string)
        results.append(result)

        # print(result)

    return results

results = main()

In [81]:
results
del results[0]

In [82]:
del results[-1]
results

[['Histories of basic sciences',
  'Anatomy',
  'Biochemistry',
  'Biology',
  'Biotechnology',
  'Chemistry',
  'Embryology',
  'Genetics',
  'Immunology',
  'Timeline',
  'Medical diagnosis',
  'Microbiology',
  'Molecular biology',
  'Neuroscience',
  'Nutrition',
  'Pathology',
  'Pharmacology',
  'Physiology',
  'Virology',
  'Viruses'],
 ['Histories of medical specialties',
  'Alternative medicine',
  'Anesthesia',
  'General',
  'Neuraxial',
  'Cancer',
  'Cardiology (invasive and interventional)',
  'Dental treatments',
  'Dermatology',
  'Emergency medicine',
  'CPR',
  'Endocrinology',
  'Neurology',
  'Psychiatry',
  'Timeline',
  'Psychiatric institutions',
  'Psychosurgery',
  'Surgery',
  'Trauma and orthopaedics'],
 ['Medicine in ancient societies',
  'Prehistory',
  'Babylon',
  'Byzantinia',
  'Ancient',
  'Egypt',
  'Egyptian medical papyri',
  'Ancient',
  'Greece',
  'Ancient',
  'Iran',
  'Ancient',
  'Rome',
  'Medieval',
  'Islam',
  'Medieval',
  'Western',
  'E

In [83]:
df = pd.DataFrame (results).transpose()
df.columns = ['Histories of basic sciences', 'Histories of medical specialties', 'Medicine in ancient societies','History of methods in medicine','Disasters and plagues']
print (df)

    Histories of basic sciences          Histories of medical specialties  \
0   Histories of basic sciences          Histories of medical specialties   
1                       Anatomy                      Alternative medicine   
2                  Biochemistry                                Anesthesia   
3                       Biology                                   General   
4                 Biotechnology                                 Neuraxial   
5                     Chemistry                                    Cancer   
6                    Embryology  Cardiology (invasive and interventional)   
7                      Genetics                         Dental treatments   
8                    Immunology                               Dermatology   
9                      Timeline                        Emergency medicine   
10            Medical diagnosis                                       CPR   
11                 Microbiology                             Endocrinology   

In [84]:
df.iloc[0]
df

Unnamed: 0,Histories of basic sciences,Histories of medical specialties,Medicine in ancient societies,History of methods in medicine,Disasters and plagues
0,Histories of basic sciences,Histories of medical specialties,Medicine in ancient societies,History of methods in medicine,Disasters and plagues
1,Anatomy,Alternative medicine,Prehistory,Antibiotics,Black
2,Biochemistry,Anesthesia,Babylon,Timeline,Death
3,Biology,General,Byzantinia,Blood transfusion,List of epidemics
4,Biotechnology,Neuraxial,Ancient,Humorism,Malaria
5,Chemistry,Cancer,Egypt,Neuroimaging,Pandemics
6,Embryology,Cardiology (invasive and interventional),Egyptian medical papyri,Radiation therapy,Plague
7,Genetics,Dental treatments,Ancient,Tracheal intubation,Poliomyelitis
8,Immunology,Dermatology,Greece,Vaccines,Smallpox
9,Timeline,Emergency medicine,Ancient,Timeline,Syphilis


In [85]:
df.iloc[1:, :]

Unnamed: 0,Histories of basic sciences,Histories of medical specialties,Medicine in ancient societies,History of methods in medicine,Disasters and plagues
1,Anatomy,Alternative medicine,Prehistory,Antibiotics,Black
2,Biochemistry,Anesthesia,Babylon,Timeline,Death
3,Biology,General,Byzantinia,Blood transfusion,List of epidemics
4,Biotechnology,Neuraxial,Ancient,Humorism,Malaria
5,Chemistry,Cancer,Egypt,Neuroimaging,Pandemics
6,Embryology,Cardiology (invasive and interventional),Egyptian medical papyri,Radiation therapy,Plague
7,Genetics,Dental treatments,Ancient,Tracheal intubation,Poliomyelitis
8,Immunology,Dermatology,Greece,Vaccines,Smallpox
9,Timeline,Emergency medicine,Ancient,Timeline,Syphilis
10,Medical diagnosis,CPR,Iran,Wound care,Tuberculosis


In [86]:
medical_technology = df
medical_technology.rename(columns = {"Histories of basic sciences": "histories_of_basic_sciences",
                                     "Histories of medical specialties": "histories_of_medical_specialties",
                                    "Medicine in ancient societies": "medicine_in_ancient_societies",
                                    "History of methods in medicine": "history_of_methods_in_medicine",
                                    "Disasters and plagues": "disasters_and_plagues"}, inplace=True)
medical_technology

Unnamed: 0,histories_of_basic_sciences,histories_of_medical_specialties,medicine_in_ancient_societies,history_of_methods_in_medicine,disasters_and_plagues
0,Histories of basic sciences,Histories of medical specialties,Medicine in ancient societies,History of methods in medicine,Disasters and plagues
1,Anatomy,Alternative medicine,Prehistory,Antibiotics,Black
2,Biochemistry,Anesthesia,Babylon,Timeline,Death
3,Biology,General,Byzantinia,Blood transfusion,List of epidemics
4,Biotechnology,Neuraxial,Ancient,Humorism,Malaria
5,Chemistry,Cancer,Egypt,Neuroimaging,Pandemics
6,Embryology,Cardiology (invasive and interventional),Egyptian medical papyri,Radiation therapy,Plague
7,Genetics,Dental treatments,Ancient,Tracheal intubation,Poliomyelitis
8,Immunology,Dermatology,Greece,Vaccines,Smallpox
9,Timeline,Emergency medicine,Ancient,Timeline,Syphilis


# Data Load

In [87]:
# Create a connection string for the Epidemics Database
load_dotenv()

protocol = os.getenv("PROTOCOL")
username = os.getenv("USER_NAME")
password = os.getenv("PASSWORD")
port = os.getenv("PORT")
hostname = os.getenv("HOST")
database = os.getenv("DATABASE")

con_str = f"{protocol}://{username}:{password}@{hostname}:{port}/{database}"

In [88]:
engine = create_engine(con_str)

In [89]:
engine.table_names()

  engine.table_names()


['events_details',
 'major_outbreaks',
 'periods',
 'natural_disasters',
 'medical_technology']

In [90]:
# Table 1 from file df_2
events_details.to_sql(name="events_details", con=engine, if_exists="append", index=False)

252

In [91]:
# Table 2 from file df_1
major_outbreaks.to_sql(name="major_outbreaks", con=engine, if_exists="append", index=False)

19

In [92]:
# Table 3 from file df_16
periods.to_sql(name="periods", con=engine, if_exists="append", index=False)

113

In [93]:
# Table 4 from file df_4
natural_disasters.to_sql(name="natural_disasters", con=engine, if_exists="append", index=False)

20

In [94]:
# Table 5 from file df_24
medical_technology.to_sql(name="medical_technology", con=engine, if_exists="append", index=False)

20