
# Construction of Time Series Data

The data for the natural disaster occurrence can be found at [emdat.be](https://www.emdat.be). We have extracted a univariate time series data for yearly occurrence

In [7]:
# Ignore deprecated warning
import warnings
warnings.filterwarnings("ignore")

# Data manipulation
import pandas as pd
import numpy as np

# Pyspark modules
from pyspark.sql import SparkSession

In [8]:
# Create spark session
spark = SparkSession.builder.appName("project").getOrCreate()

# Load raw data

In [10]:
# load data
base_path = ' '
df = spark.read.csv(base_path + '/natural-disaster/data/emdat_public_cleaned.csv', inferSchema = True, header = True) 
df.createOrReplaceTempView("raw_table") # create table for sql query

# print data size and display sample
print("Data size:", (df.count(), len(df.columns)))
cols = ['year', 'continent', 'disaster_type', 'no_occurrence']
df.select(cols).show()

Data size: (15901, 20)
+----+---------+-------------------+-------------+
|year|continent|      disaster_type|no_occurrence|
+----+---------+-------------------+-------------+
|1900|   Africa|            Drought|          7.0|
|1900|     Asia|            Drought|          7.0|
|1902| Americas|         Earthquake|         10.0|
|1902| Americas|  Volcanic activity|         10.0|
|1902| Americas|  Volcanic activity|         10.0|
|1903| Americas|Mass movement (dry)|         12.0|
|1903|   Africa|  Volcanic activity|         12.0|
|1904|     Asia|              Storm|          4.0|
|1905| Americas|Mass movement (dry)|          8.0|
|1905|     Asia|         Earthquake|          8.0|
|1906| Americas|         Earthquake|         13.0|
|1906| Americas|         Earthquake|         13.0|
|1906|   Europe|              Flood|         13.0|
|1906|   Europe|              Flood|         13.0|
|1906|     Asia|              Storm|         13.0|
|1907|     Asia|         Earthquake|          3.0|
|1907|  

In [21]:
df_sel = spark.sql("select year, count(*) as tot_occurrence from raw_table group by 1")
df_sel = df_sel.toPandas()
df_sel

Unnamed: 0,year,tot_occurrence
0,1959,33
1,1990,303
2,1903,12
3,1975,67
4,1977,141
...,...,...
117,1929,5
118,1928,17
119,1933,11
120,2021,101


In [22]:
# save data
df_sel.to_csv('../data/ts_yearly.csv', index = False)