##  Importing necessary datasets and packages


In [1]:
#importing the required packages

import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import re

pd.set_option('display.precision', 2)

In [2]:
# Load the raw earthquake records into a DataFrame, then swap the header
# names that came with the file for short snake_case ones.
columns = [
    'date',
    'time',
    'latitude',
    'longitude',
    'magnitude',
    'remarks',
    'epicenter',
]
main_df = pd.read_csv('Datasets/earthquake_raw.csv')
# Assigning to .columns raises if the file does not have exactly seven columns.
main_df.columns = columns

## Data Preprocessing


In [3]:
# Normalise epicenter names: drop leading/trailing whitespace and lower-case
# them so the same place is always spelled the same way.
# The vectorised .str accessor is used instead of a per-row lambda so that a
# missing epicenter stays missing rather than raising on a non-string value.
main_df['epicenter'] = main_df['epicenter'].str.strip().str.lower()

In [4]:
# Convert the coordinate and magnitude columns to numeric dtypes.
# pd.to_numeric raises on any value it cannot parse, so bad rows surface here.
for numeric_col in ('latitude', 'longitude', 'magnitude'):
    main_df[numeric_col] = pd.to_numeric(main_df[numeric_col])



In [5]:
def seperate(date):
    """
    Pull the A.D. part out of a date string that carries both the B.S. and A.D. dates.

    Everything that follows the literal marker ``A.D:`` (up to the end of the
    line) is returned exactly as found, without trimming. A string that does
    not contain the marker is handed back unchanged.
    """
    found = re.search(r'A\.D:(.*)', date)
    return found.group(1) if found else date

def extract_time(timestamp):
    """
    Return the local HH:MM portion of a 'Local:HH:MM UTC:HH:MM' timestamp string.

    The UTC part must be present for the pattern to match. When the string
    does not contain the expected pattern it is returned unchanged.
    """
    hit = re.search(r'Local:(\d{2}:\d{2}) UTC:\d{2}:\d{2}', timestamp)
    if hit is None:
        return timestamp
    return hit.group(1)
    

In [6]:
# Reduce the raw text columns to the parts we need (the A.D. date and the
# local HH:MM time), then parse the date text into datetime values.
# 'time' stays a plain string column.
ad_dates = main_df['date'].apply(seperate)
local_times = main_df['time'].apply(extract_time)

main_df['time'] = local_times
main_df['date'] = pd.to_datetime(ad_dates)


In [7]:
# Export the cleaned data to a CSV file for further analysis.
# index=False leaves the row index out of the written file.

main_df.to_csv("Datasets/earthquake_clean.csv", index=False)

## Exploratory Data Analysis


In [8]:
# Check for missing values: one boolean per column, True if that column
# contains at least one null.
main_df.isna().any()

date         False
time         False
latitude     False
longitude    False
magnitude    False
remarks      False
epicenter    False
dtype: bool

In [9]:
# Check for duplicates: True if any row is an exact repeat of an earlier row.
main_df.duplicated().any()

False

In [10]:
# Check the data types: prints the row count plus each column's non-null
# count and dtype.
main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740 entries, 0 to 739
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       740 non-null    datetime64[ns]
 1   time       740 non-null    object        
 2   latitude   740 non-null    float64       
 3   longitude  740 non-null    float64       
 4   magnitude  740 non-null    float64       
 5   remarks    740 non-null    object        
 6   epicenter  740 non-null    object        
dtypes: datetime64[ns](1), float64(3), object(3)
memory usage: 40.6+ KB


In [11]:
# Basic descriptive statistics (count, mean, std, min, quartiles, max) for
# the magnitude column.
main_df['magnitude'].describe()

count    740.00
mean       4.40
std        0.47
min        4.00
25%        4.10
50%        4.20
75%        4.60
max        7.60
Name: magnitude, dtype: float64

In [12]:
# 740 earthquakes were felt over a period of 8 years; the average magnitude is 4.4 with a standard deviation of 0.47.
# The median magnitude is 4.2 and the third quartile is 4.6.

In [13]:
# Let's check how many earthquakes have a magnitude above 5.

In [30]:
# Boolean mask of the rows whose magnitude is strictly greater than 5.
# It is deliberately not called `filter`, so Python's built-in filter()
# stays usable in every later cell of the notebook.
over_five_mask = main_df['magnitude'] > 5
over_five = main_df[over_five_mask]
# Number of earthquakes with magnitude above 5.
len(over_five)

69

In [52]:
# Count the magnitude > 5 earthquakes in each calendar year.
quake_year = over_five['date'].dt.year
over_five_grouped = over_five.groupby(quake_year).size()
over_five_grouped

date
2015    42
2016     4
2017     2
2019     3
2020     2
2021     5
2022     7
2023     4
dtype: int64

In [50]:
# Categorise every earthquake by magnitude into four half-open bands:
#   type1: [4.0, 4.5)    type2: [4.5, 5.0)
#   type3: [5.0, 5.5)    type4: [5.5, inf)
# The work is done on a copy so main_df keeps its original columns.

categorised_eq = main_df.copy()

magnitude_bins = [4, 4.5, 5, 5.5, float('inf')]
magnitude_labels = ['type1', 'type2', 'type3', 'type4']
# right=False closes each bin on the left, so a magnitude of exactly 4.5 is
# type2 and exactly 5 is type3. Magnitudes below 4 fall outside every bin and
# get a missing type.
categorised_eq['type'] = pd.cut(
    categorised_eq['magnitude'],
    bins=magnitude_bins,
    labels=magnitude_labels,
    right=False,
)
# 'type' is categorical. observed=False is spelled out so that types with no
# rows are still reported (with a count of 0) and the result does not depend
# on the implicit default for categorical groupers.
ceq = categorised_eq.groupby('type', observed=False).size()

In [76]:
# Frequency of each magnitude type per calendar year, flattened into a frame
# with one row per (year, type) pair and the count in 'frequency'.
# 'type' is categorical. observed=False is spelled out so that year/type pairs
# with no earthquakes are kept as zero-count rows and the result does not
# depend on the implicit default for categorical groupers.
freq_by_year = (
    categorised_eq
    .groupby([categorised_eq['date'].dt.year, 'type'], observed=False)
    .size()
    .reset_index(name='frequency')
)


<h1>Visualization</h1>

In [None]:
#visualizing earthquake frequency by year

In [66]:
# Legend text for each magnitude type, keyed by the internal type label.
new_names = dict(
    type1='<4.5',
    type2='>=4.5 and <5',
    type3='>=5 and <5.5',
    type4='>=5.5',
)



In [67]:
def _relabel_trace(trace):
    """Replace a trace's internal type label with its readable magnitude range."""
    label = new_names[trace.name]
    trace.update(
        name=label,
        legendgroup=label,
        # The hover text embeds the old label too, so rewrite it there as well.
        hovertemplate=trace.hovertemplate.replace(trace.name, label),
    )


# One bar per year, split (stacked) by magnitude type.
fig = px.bar(
    freq_by_year,
    x='date',
    y='frequency',
    color='type',
    title='Stacked bar chart of earthquake type by year',
)
fig.update_layout(
    barmode='stack',
    title_x=0.5,
    margin=dict(l=50, r=50, t=25, b=50),
)

# Relabel every trace; the call is the cell's last expression, so its return
# value is what the notebook displays.
fig.for_each_trace(_relabel_trace)

In [68]:
# Density heat map of every earthquake location in main_df.
# NOTE(review): 'stamen-terrain' tiles come from a third-party tile host —
# confirm they still load in the target environment without an access token.
fig = px.density_mapbox(
    main_df,
    # where each event is
    lat='latitude',
    lon='longitude',
    radius=5,
    # map view
    mapbox_style='stamen-terrain',
    center={'lat': 28.23, 'lon': 83.64},
    zoom=5.5,
    # presentation
    title="Earthquake Density Map in Nepal(2015 and after)",
    labels={'z': 'Earthquake Density'},
    color_continuous_scale='Viridis',
    opacity=0.8,
)


In [75]:
# Give the colourbar a title and descriptive tick labels, centre the figure
# title, and tighten the margins around the map.
fig.update_layout(
    coloraxis_colorbar=dict(
        title="Earthquake Density",
        # Positions on the colour axis that receive a label; adjust as needed.
        # NOTE(review): these assume the density values span 0-1 — confirm
        # against the rendered colourbar.
        tickvals=[0, 0.25, 0.5, 0.75, 1],
        # One label per tick value, in ascending order so that the wording
        # rises together with the density.
        ticktext=["Low", "Moderately low", "Moderately high", "High", "Very High"],
    ),
    title_x=0.5,
    margin=dict(l=20, r=0, t=30, b=0),
)

In [72]:
# Render the figure currently bound to `fig`.
fig.show()