## Exploring and Cleaning the Dataset

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import hvplot.pandas
import geoviews as gv
import geopandas as gpd
from scipy.stats import linregress
import seaborn as sns

In [None]:
# Load the raw earthquake CSV into a DataFrame, using the first CSV column as the index
primary_df = pd.read_csv(
    "Resources/Worldwide-Earthquake-database.csv",
    index_col=0,
)

# Preview the first few rows
primary_df.head()

In [None]:
# Total number of rows and columns of the raw DataFrame, as a (rows, columns) tuple
primary_df.shape

In [None]:
# Show the column names of the raw DataFrame
primary_df.columns

In [None]:
# Print a per-column summary of the raw DataFrame (includes the non-null count of each column)
primary_df.info()

In [None]:
# Keep only the columns used in the analysis. The trailing .copy() gives a
# brand new DataFrame that is independent of primary_df.
worldwide_earthquake_df = primary_df.loc[
    :,
    [
        'FLAG_TSUNAMI', 'YEAR', 'MONTH', 'DAY', 'HOUR',
        'FOCAL_DEPTH', 'EQ_PRIMARY', 'INTENSITY',
        'COUNTRY', 'LATITUDE', 'LONGITUDE',
        'TOTAL_DEATHS', 'TOTAL_INJURIES',
    ],
].copy()

In [None]:
# Show the first 10 rows of the narrowed-down data
worldwide_earthquake_df.head(n=10)

In [None]:
# Show the last 10 rows of the narrowed-down data
worldwide_earthquake_df.tail(n=10)

In [None]:
# Total number of rows and columns of the narrowed-down DataFrame, as a (rows, columns) tuple
worldwide_earthquake_df.shape

In [None]:
# Print a per-column summary of the narrowed-down DataFrame (includes the non-null count of each column)
worldwide_earthquake_df.info()

#### Handling null values:
##### We dropped the columns that have the largest number of null values and that we are not using for our analysis goals.
##### We leave the remaining missing values as they are, because they reflect the nature of our dataset.
##### Comparing the total number of rows (6193) with the Non-Null Count of each column shows that dropping those columns reduced the number of null values reasonably.
##### As the head and the tail of the dataset show, the more recent the data is, the fewer missing values it contains, because far less information was recorded for earthquakes that happened many years ago.

In [None]:
# Make both coordinate columns numeric; any value that cannot be parsed
# as a number is turned into NaN (errors='coerce')
for coord_col in ('LATITUDE', 'LONGITUDE'):
    worldwide_earthquake_df[coord_col] = pd.to_numeric(
        worldwide_earthquake_df[coord_col],
        errors='coerce',
    )

# Visualisation of the Dataset

In [None]:
# Generate a pie plot showing the percentage of earthquakes that also triggered a tsunami.
# value_counts() silently drops NaN, so earthquakes with an empty FLAG_TSUNAMI would be
# excluded from the pie and the flagged share would be overstated. Give the missing
# flags an explicit label so every earthquake is counted in the percentages.
# NOTE(review): assumes an empty FLAG_TSUNAMI means "no tsunami recorded" - confirm
# against the dataset documentation.
tsunami_df = (
    worldwide_earthquake_df["FLAG_TSUNAMI"]
    .fillna("No tsunami flag")
    .value_counts()
)
tsunami_df.plot(kind="pie", autopct="%1.1f%%")
plt.ylabel("FLAG_TSUNAMI")
plt.show()

## Generate a bar chart showing the top 20 countries with the highest earthquake counts

In [None]:
# Number of earthquakes recorded for each country (value_counts sorts descending)
earthquake_count_per_country = worldwide_earthquake_df['COUNTRY'].value_counts()

# The 20 countries with the most earthquakes
top_countries = earthquake_count_per_country.iloc[:20]

# Draw the counts as a bar chart, save it to disk, then display it
top_countries.plot.bar(figsize=(12, 6), color='blue')
plt.title('Top 20 Countries with the Most Earthquakes')
plt.xlabel('Country')
plt.ylabel('Number of Earthquakes')
plt.savefig('Resources/earthquake_bar_chart.png')
plt.show()




## Visualising the distribution of earthquakes for the top 20 countries with the highest earthquake counts, based on latitude and longitude (interactive map using hvplot and geoviews)

In [None]:
# Filter DataFrame for the top 20 countries
top_countries_df = worldwide_earthquake_df[worldwide_earthquake_df['COUNTRY'].isin(top_countries.index)]

# Define a GeoDataFrame with the earthquakes and their coordinates
gdf = gpd.GeoDataFrame(top_countries_df, geometry=gpd.points_from_xy(top_countries_df['LONGITUDE'], top_countries_df['LATITUDE']))

# Specify the hover information
hover_cols = ['COUNTRY', 'FLAG_TSUNAMI', 'YEAR', 'FOCAL_DEPTH', 'EQ_PRIMARY','INTENSITY']

# Plot the map using hvplot and geoviews
map_plot = gdf.hvplot.points(
    geo=True,
    tiles='CartoLight',
    width=700,
    height=500,
    title='Interactive earthquake map for top 20 countries with highest earthquake counts',
    hover_cols=hover_cols)

# Choose the interactive tools before saving so the saved file matches what is displayed
map_plot = map_plot.opts(
    tools=['hover', 'pan', 'wheel_zoom'],
)

# Save the interactive map as a standalone HTML file.
# plt.savefig() only writes the current matplotlib figure, which is empty here
# because the map is not drawn with matplotlib; hvplot.save() writes the map itself.
hvplot.save(map_plot, 'Resources/interacive_map_top20_count.html')

# Show the plot (must stay the last expression of the cell)
map_plot

## Generate a bar chart showing the top 20 countries with the most high-intensity earthquakes (intensity >= 7.5)

In [None]:
# Keep only the earthquakes whose INTENSITY is at least 7.5
is_high_intensity = worldwide_earthquake_df['INTENSITY'] >= 7.5
high_intensity_earthquakes = worldwide_earthquake_df.loc[is_high_intensity]

# NOTE(review): the next two assignments rebind earthquake_count_per_country and
# top_countries, which the top-20-by-count cells also assign; re-running those
# cells after this one will use the high-intensity values.

# Number of high-intensity earthquakes recorded for each country
earthquake_count_per_country = high_intensity_earthquakes['COUNTRY'].value_counts()

# The 20 countries with the most high-intensity earthquakes
top_countries = earthquake_count_per_country.iloc[:20]

# Draw the counts as a bar chart, save it to disk, then display it
top_countries.plot.bar(figsize=(12, 6), color='red')
plt.title('Top 20 Countries with Earthquakes of Intensity >= 7.5')
plt.xlabel('Country')
plt.ylabel('Number of Earthquakes')
plt.savefig('Resources/high_intensity_earthquakes.png')
plt.show()

## visualising the distribution of earthquakes based on latitude and longitude, 1. using a scatter plot with Matplotlib

In [None]:
# Plot every earthquake as a small, semi-transparent dot at its longitude/latitude
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(
    worldwide_earthquake_df['LONGITUDE'],
    worldwide_earthquake_df['LATITUDE'],
    alpha=0.5,
    marker='.',
)
ax.set_title('Distribution of Earthquakes Based on Latitude and Longitude')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.grid(True)

# Save the figure to disk, then display it
fig.savefig('Resources/scatter_plot_map')
plt.show()

## visualising the distribution of earthquakes based on latitude and longitude, 2. interactive map using hvplot and geoviews

In [None]:
# Define a GeoDataFrame with the earthquakes and their coordinates
gdf = gpd.GeoDataFrame(worldwide_earthquake_df, geometry=gpd.points_from_xy(worldwide_earthquake_df['LONGITUDE'], worldwide_earthquake_df['LATITUDE']))

# Specify the hover information
hover_cols = ['COUNTRY', 'FLAG_TSUNAMI', 'YEAR', 'FOCAL_DEPTH', 'EQ_PRIMARY','INTENSITY']

# Plot the map using hvplot and geoviews
map_plot = gdf.hvplot.points(
    geo=True,
    tiles='CartoLight',
    width=700,
    height=500,
    title='Earthquake Interactive Map',
    hover_cols=hover_cols)

# Choose the interactive tools before saving so the saved file matches what is displayed
map_plot = map_plot.opts(
    tools=['hover', 'pan', 'wheel_zoom'],
)

# Save the interactive map as a standalone HTML file.
# plt.savefig() only writes the current matplotlib figure, which is empty here
# because the map is not drawn with matplotlib; hvplot.save() writes the map itself.
hvplot.save(map_plot, 'Resources/interacive_map.html')

# Show the plot (must stay the last expression of the cell)
map_plot

# Analysing patterns in earthquake occurrences related to specific geographical features

### 1. Create a heatmap-style scatter plot coloured and sized by earthquake intensity

In [None]:
# Scatter every earthquake at its longitude/latitude; both the colour and
# the size of each marker encode the INTENSITY column
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=worldwide_earthquake_df,
    x='LONGITUDE',
    y='LATITUDE',
    hue='INTENSITY',
    size='INTENSITY',
)
plt.title('Earthquake Intensity Distribution')

# Save the figure to disk, then display it
plt.savefig('Resources/earthquake_intensity_heat_map')
plt.show()

### 2. Investigate the distribution of earthquakes based on focal depth to identify patterns related to the depth of seismic activity.

In [None]:
# Scatter every earthquake at its longitude/latitude, coloured by FOCAL_DEPTH
fig, ax = plt.subplots(figsize=(10, 8))
depth_points = ax.scatter(
    worldwide_earthquake_df['LONGITUDE'],
    worldwide_earthquake_df['LATITUDE'],
    c=worldwide_earthquake_df['FOCAL_DEPTH'],
    cmap='viridis',
    s=10,
)

# Colour scale that maps marker colours back to focal depth values
fig.colorbar(depth_points, ax=ax, label='Focal Depth')
ax.set_title('Earthquake Distribution by Focal Depth')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')

# Save the figure to disk, then display it
fig.savefig('Resources/focal_depth_distribution_map')
plt.show()

## 3. Time Series Analysis: Explore patterns over time by creating a time series plot of earthquake occurrences

In [None]:
# Count the earthquakes recorded in each unique year and draw the counts as a line plot
fig, ax = plt.subplots(figsize=(12, 6))
yearly_counts = worldwide_earthquake_df.groupby('YEAR').size()
yearly_counts.plot(ax=ax, marker='o')
ax.set_title('Earthquake Occurrences Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Earthquakes')
ax.grid(True)

# Save the figure to disk, then display it
fig.savefig('Resources/earthquake_occerance_overtime_lineplot_whole')
plt.show()

Let's zoom in and look at the years from 1900 onward.

In [None]:
# Line plot of the number of earthquakes per year, zoomed in to 1900 onward
plt.figure(figsize=(12, 6))
worldwide_earthquake_df.groupby('YEAR').size().plot(marker='o')

# Set the x-axis limits AFTER plotting: limits set before the plot call can be
# overridden when the line is drawn (depending on the pandas version), which
# would undo the zoom. Series.max() skips NaN, unlike the builtin max().
plt.xlim(1900, worldwide_earthquake_df['YEAR'].max())
plt.title('Earthquake Occurrences Over Time')
plt.xlabel('Year')
plt.ylabel('Number of Earthquakes')
plt.grid(True)

# Save the figure to disk, then display it
plt.savefig('Resources/earthquake_occerance_overtime_lineplot')
plt.show()

This line plot shows how earthquake counts have changed over time from 1900 to 2020. It provides a clear representation of trends and fluctuations. An increasing trend is obvious here, but to quantify the trend and check its reliability using statistical parameters, we will create a scatter plot and discuss the result based on the statistical values we obtain.

# Create a scatterplot and Compute Linear Regression 

In [None]:
# Series of earthquake counts, indexed by year
grouped_data = worldwide_earthquake_df.groupby('YEAR').size()

# Keep only the years from 1900 onward
from_1900 = grouped_data.index >= 1900
filtered_data = grouped_data[from_1900]

# Fit a straight line (count as a function of year) with scipy.stats.linregress
slope, intercept, r_value, p_value, std_err = linregress(
    filtered_data.index,
    filtered_data.values,
)

# Fitted values of the regression line at each year
reg_line = intercept + slope * filtered_data.index

# Scatter the actual yearly counts and overlay the fitted line;
# the legend entry of the line carries its equation
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(filtered_data.index, filtered_data.values, marker='o', label='Actual Data')
ax.plot(filtered_data.index, reg_line, 'r', label=f'Regression Line: y = {slope:.3f}x + {intercept:.2f}')

ax.set_title('Linear Regression of Earthquake Occurrences Over Time (1900 Onward)')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Earthquakes')
ax.legend()

# Write the fit statistics inside the axes (position given as a fraction of the axes)
ax.annotate(
    f'R-squared: {r_value**2:.3f}\nP-value: {p_value:.3e}\nStandard Error: {std_err:.4f}',
    xy=(0.05, 0.85),
    xycoords='axes fraction',
    fontsize=10,
    color='blue',
)

# Print the unrounded regression statistics
print(f"Slope: {slope}\nIntercept: {intercept}\nR-squared: {r_value**2}\nR-value: {r_value}\nP-value: {p_value}\nStandard Error: {std_err}")

# Save the figure to disk, then display it
fig.savefig('Resources/linear_regression_overtime')

plt.show()

 Analysis: The linear regression results provide valuable information about the relationship between the year and the number of earthquakes. The positive slope (0.2968) suggests that, on average, the number of recorded earthquakes has been increasing over the years. The p-value (6.10e-19) is far below 0.05, so there is a statistically significant relationship between the year and the number of earthquakes, i.e. evidence of an increasing trend over the years. R-squared (0.487) indicates the proportion of the variance in the number of earthquakes that is predictable from the year: approximately 48.7% of the variability in earthquake occurrences can be explained by the linear regression model. This means the linear model does not explain the majority of the variability in earthquake occurrences, and other factors may contribute to seismic activity. The r-value is 0.6976, which shows a relatively strong relationship between the number of earthquakes and the year. Overall, while the linear regression suggests a positive trend, it is essential to interpret the results in the context of the data: older earthquakes are less completely recorded in this dataset, so part of the increase may reflect better recording rather than more seismic activity. We need more geographical knowledge about earthquakes and should consider other related factors that can affect the number of earthquakes per year.

## Visualising the earthquake magnitudes over time to check the trends

In [None]:
# Keep only the rows where both the magnitude (EQ_PRIMARY) and the YEAR are present
magnitude_time_df = worldwide_earthquake_df.dropna(subset=['EQ_PRIMARY', 'YEAR'])

# Line plot of magnitude against year; ci=None turns off the confidence band
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=magnitude_time_df,
    x='YEAR',
    y='EQ_PRIMARY',
    ci=None,
    color='orange',
    ax=ax,
)
ax.set_xlabel('Year')
ax.set_ylabel('Earthquake Magnitude (EQ_PRIMARY)')
ax.set_title('Trends in Magnitudes of Significant Earthquakes Over Time')
plt.show()

## Visualising the earthquake magnitudes over time (1900 - latest) to check the trends

In [None]:
# Keep only the rows where both the magnitude (EQ_PRIMARY) and the YEAR are present
magnitude_time_df = worldwide_earthquake_df.dropna(subset=['EQ_PRIMARY', 'YEAR'])

# Time window to zoom in on: from 1900 up to the latest year in the data
start_year = 1900
end_year = magnitude_time_df['YEAR'].max()

# Rows whose YEAR lies inside the window (both ends inclusive)
in_window = magnitude_time_df['YEAR'].between(start_year, end_year)
zoomed_df = magnitude_time_df[in_window]

# Line plot of magnitude against year for the zoomed-in window;
# ci=None turns off the confidence band
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=zoomed_df,
    x='YEAR',
    y='EQ_PRIMARY',
    ci=None,
    color='orange',
    ax=ax,
)
ax.set_xlabel('Year')
ax.set_ylabel('Earthquake Magnitude (EQ_PRIMARY)')
ax.set_title('Trends in Magnitudes of Significant Earthquakes Over Time (1900 - Latest)')
plt.show()