## Project 1: Risk analysis of tornadoes and mobile home parks in the US

In [None]:
# import dependencies

import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import geoviews as gv
import geoviews.tile_sources as gvts
import hvplot.pandas
import matplotlib.colors as mcolors
import numpy as np
from scipy.stats import linregress


In [None]:
# Relative paths of the two input CSV files.
tornadoes_data_path, mobile_home_parks_path = (
    "1950-2021_all_tornadoes.csv",
    "Mobile_Home_Parks.csv",
)

# Load each CSV into its own DataFrame (tornadoes first, then parks).
tornadoes_data, mobile_home_data = (
    pd.read_csv(csv_path)
    for csv_path in (tornadoes_data_path, mobile_home_parks_path)
)

# Preview both raw tables in the notebook output.
for raw_frame in (tornadoes_data, mobile_home_data):
    display(raw_frame)


In [None]:
# CLEANING DATA FOR BOTH CSV FILES
# --- Tornado data ---
# Replace the abbreviated source column names with descriptive ones.
clean_tornadoes_data = tornadoes_data.rename(columns={
    "yr": "year",
    "mo": "month",
    "dy": "day",
    "st": "state",
    "mag": "magnitude",
    "inj": "injuries",
    "fat": "fatalities",
    "loss": "propertyloss",
    "closs": "lossinmillions",
    "slat": "latitude",
    "slon": "longitude",
    "elat": "latitudeend",
    "elon": "longitudeend",
    "len": "lengthinmiles",
    "wid": "widthinyards",
})

# Drop the columns this analysis does not use. A list (rather than a set
# literal) is the conventional list-like for DataFrame.drop.
clean_tornadoes_data = clean_tornadoes_data.drop(
    columns=["om", "tz", "stf", "stn", "ns", "sn", "sg", "f1", "f2", "f3", "f4", "fc"]
)

# Keep every row except those whose latitude AND longitude are both 0.
# .copy() makes the filtered frame independent, so the column assignments
# below do not raise SettingWithCopyWarning.
tornado_has_coordinates = (
    (clean_tornadoes_data['latitude'] != 0) | (clean_tornadoes_data['longitude'] != 0)
)
clean_tornadoes_data = clean_tornadoes_data[tornado_has_coordinates].copy()

# Round coordinates to 1 decimal place; the merge further down joins
# tornadoes to parks on exactly equal rounded coordinates.
clean_tornadoes_data['latitude'] = clean_tornadoes_data['latitude'].round(1)
clean_tornadoes_data['longitude'] = clean_tornadoes_data['longitude'].round(1)
display(clean_tornadoes_data)
# --- Mobile home park data ---
# Lower-case the source column names that are kept.
clean_mobile_home_data = mobile_home_data.rename(columns={
    "NAME": "name",
    "ADDRESS": "address",
    "CITY": "city",
    "STATE": "state",
    "ZIP": "zipcode",
    "TELEPHONE": "telephone",
    "TYPE": "type",
    "STATUS": "status",
    "COUNTY": "county",
    "LATITUDE": "latitude",
    "LONGITUDE": "longitude",
})

# Drop the columns this analysis does not use. A list (rather than a set
# literal) is the conventional list-like for DataFrame.drop.
clean_mobile_home_data = clean_mobile_home_data.drop(columns=[
    "X", "Y", "FID", "MHPID", "ADDRESS2", "ZIP4", "COUNTYFIPS",
    "COUNTRY", "NAICS_CODE", "NAICS_DESC", "SOURCE",
    "SOURCEDATE", "VAL_METHOD", "VAL_DATE", "WEBSITE", "index",
    "UNITS", "SIZE", "YEARBLT", "RevGeoFlag",
])

# Keep every row except those whose latitude AND longitude are both 0.
# .copy() makes the filtered frame independent, so the column assignments
# below do not raise SettingWithCopyWarning.
park_has_coordinates = (
    (clean_mobile_home_data['latitude'] != 0) | (clean_mobile_home_data['longitude'] != 0)
)
clean_mobile_home_data = clean_mobile_home_data[park_has_coordinates].copy()

# Round coordinates to 1 decimal place (not 4): the tornado coordinates are
# rounded the same way, and the merge joins on exactly equal values.
clean_mobile_home_data['latitude'] = clean_mobile_home_data['latitude'].round(1)
clean_mobile_home_data['longitude'] = clean_mobile_home_data['longitude'].round(1)
display(clean_mobile_home_data)

In [None]:
# MERGING THE DATA
# Show every column when a DataFrame is previewed.
pd.set_option('display.max_columns', None)

# Join tornadoes to mobile home parks wherever the rounded coordinates are
# identical (pandas' default join type is used).
coordinate_keys = ["longitude", "latitude"]
tornado_mobiles_merge = clean_tornadoes_data.merge(
    clean_mobile_home_data,
    left_on=coordinate_keys,
    right_on=coordinate_keys,
)
merge_data_df = pd.DataFrame(tornado_mobiles_merge)

# Write the merged table out as CSV.
merge_data_df.to_csv("../Project1Bootcamp/tornadoesandparksmerged.csv")

# Preview the merged table.
merge_data_df

In [None]:

# Parse the year column as datetime. NOTE: this overwrites the 'year' column of
# clean_tornadoes_data in place, so later cells see datetimes there.
clean_tornadoes_data['year'] = pd.to_datetime(clean_tornadoes_data['year'], format='%Y')

# Group the data by calendar year and count the tornadoes in each year
tornadoes_by_year = clean_tornadoes_data.groupby(clean_tornadoes_data['year'].dt.year).size()

# Take the year range for the title from the data itself; the previous
# hard-coded "(up to 2009)" did not match the unfiltered data being plotted.
first_year = tornadoes_by_year.index.min()
last_year = tornadoes_by_year.index.max()

# Plot the tornado count over time
plt.figure(figsize=(10, 6))
plt.plot(tornadoes_by_year.index, tornadoes_by_year.values, marker='o', linestyle='-')
plt.title(f'Tornado Count by Year ({first_year}-{last_year})')
plt.xlabel('Year')
plt.ylabel('Tornado Count')
plt.grid(True)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Map numeric magnitudes to Fujita scale labels. Magnitudes that are not keys
# of this dict become NaN and are therefore left out of the counts below.
fujita_scale_map = {0: 'F0', 1: 'F1', 2: 'F2', 3: 'F3', 4: 'F4', 5: 'F5'}
clean_tornadoes_data['fujita_scale_category'] = clean_tornadoes_data['magnitude'].map(fujita_scale_map)

# Number of tornadoes per category (most frequent first) and each category's
# share of the total, in percent.
fujita_scale_distribution = clean_tornadoes_data['fujita_scale_category'].value_counts()
percentage = (fujita_scale_distribution / fujita_scale_distribution.sum()) * 100

# Bar chart of the distribution
plt.figure(figsize=(10, 6))
fujita_scale_distribution.plot(kind='bar', color='skyblue')
plt.title('Distribution of Tornadoes by Fujita Scale Category')
plt.xlabel('Fujita Scale Category')
plt.ylabel('Tornado Count')
plt.xticks(rotation=0)  # Keep the category labels horizontal
plt.grid(axis='y')  # Add grid lines to the y-axis

# Annotate each bar with its percentage. The two Series share the same order,
# so they are walked together by position; integer lookups such as
# percentage[i] on a string-labelled Series are deprecated in pandas.
for position, (count, share) in enumerate(zip(fujita_scale_distribution, percentage)):
    plt.text(position, count + 5, f'{share:.2f}%', ha='center')
plt.show()

In [None]:
# Tally tornadoes per state, ordered from most to fewest.
state_tally = clean_tornadoes_data["state"].value_counts()
tornado_count_by_state = state_tally.sort_values(ascending=False)

# The ten states with the most recorded tornadoes.
top_affected_states = tornado_count_by_state.head(10)
print("Top 10 Estados más afectados por tornados:")
print(top_affected_states)

# Horizontal bar chart of the same ten states, re-sorted ascending before plotting.
plt.figure(figsize=(10, 6))
top_ten_ascending = tornado_count_by_state.head(10).sort_values()
top_ten_ascending.plot(kind='barh', color='skyblue')
plt.title('Top 10 Estados más afectados por tornados')
plt.xlabel('Número de tornados')
plt.ylabel('Estado')
plt.show()

In [None]:
# Total victims (injuries and fatalities) per state.
tornado_victims_by_state = clean_tornadoes_data.groupby('state')[['injuries', 'fatalities']].sum()

# Select the ten states with the largest combined injuries + fatalities.
combined_victims = tornado_victims_by_state.sum(axis=1)
top_10_states = combined_victims.nlargest(10).index
tornado_victims_top_10 = tornado_victims_by_state.loc[top_10_states]

# Grouped horizontal bar chart: for every state one bar for injuries and,
# shifted by one bar width, one bar for fatalities.
plt.figure(figsize=(10, 8))
bar_width = 0.4
index = np.arange(len(tornado_victims_top_10))
for bar_offset, victim_column, legend_label, bar_color in (
    (0, 'injuries', 'Lesionados', 'skyblue'),
    (bar_width, 'fatalities', 'Fallecidos', 'salmon'),
):
    plt.barh(
        index + bar_offset,
        tornado_victims_top_10[victim_column],
        bar_width,
        label=legend_label,
        color=bar_color,
    )
plt.xlabel('Número de víctimas')
plt.ylabel('Estado')
plt.title('Número total de víctimas por estado (Top 10)')
# Center each state label between its two bars.
plt.yticks(index + bar_width / 2, tornado_victims_top_10.index)
plt.legend()
plt.gca().invert_yaxis()  # Invert the y-axis so the states appear in descending order
plt.tight_layout()
plt.show()

## Code for printing map with all of the low-risk sites

In [None]:
# Step 1: Count the merged rows at each (latitude, longitude) location.
# NOTE(review): the merge pairs every tornado with every park that shares the
# same rounded coordinate, so where several parks fall on one coordinate each
# tornado is counted once per park. Confirm this is the intended "tornado count".
tornado_count_by_location = merge_data_df.groupby(['latitude', 'longitude']).size().reset_index(name='tornado_count')
print(tornado_count_by_location)

# Step 2: Convert the totals into a yearly average.
# The input file covers 1950 through 2021 inclusive, which is 72 years
# (the previous divisor of 71 left one year out).
first_record_year = 1950
last_record_year = 2021
years_of_record = last_record_year - first_record_year + 1
tornado_count_by_location['yearly_average'] = tornado_count_by_location['tornado_count'] / years_of_record
print(tornado_count_by_location)


In [None]:
# Inclusive upper bounds on the yearly average for each risk tier
low_threshold, moderate_threshold, high_threshold = 1, 3, 6


def classify_tornado_risk(yearly_average):
    """Return the risk label for a yearly average tornado count.

    The value is compared against the module-level thresholds in ascending
    order and the label of the first tier whose upper bound it does not
    exceed is returned. Anything that fits no tier is 'Very High Risk'.
    """
    # Built on every call so the current threshold values are always used.
    risk_tiers = (
        (low_threshold, 'Low Risk'),
        (moderate_threshold, 'Moderate Risk'),
        (high_threshold, 'High Risk'),
    )
    for upper_bound, risk_label in risk_tiers:
        if yearly_average <= upper_bound:
            return risk_label
    return 'Very High Risk'

# Label every location with the risk tier of its yearly average and store the
# labels in a new 'risk_category' column.
yearly_averages = tornado_count_by_location['yearly_average']
tornado_count_by_location['risk_category'] = yearly_averages.map(classify_tornado_risk)

# Show the table including the new column
print(tornado_count_by_location)


In [None]:
# Color for the single risk category shown on this map
risk_colors = {
    'Low Risk': 'green',
}

# Keep only the locations classified as "Low Risk". The frame is named for
# what it holds; it was previously called high_risk_locations.
low_risk_locations = tornado_count_by_location[tornado_count_by_location['risk_category'] == 'Low Risk']

# Plot the low-risk points over OSM tiles using hvplot with geo=True
risk_map1 = low_risk_locations.hvplot(
    kind='points',
    x='longitude',
    y='latitude',
    c='risk_category',
    tiles='OSM',
    colorbar=True,
    cmap=mcolors.ListedColormap([risk_colors['Low Risk']]),
    frame_width=800,
    frame_height=600,
    title='Tornado Risk Classification',
    geo=True,
)

# Display the map
risk_map1

## Code for printing map with moderate, high, and very high risk sites

In [None]:
# Colors for the three elevated risk categories shown on this map
risk_colors = {
    'Moderate Risk': 'blue',
    'High Risk': 'orange',
    'Very High Risk': 'red'
}

# Keep every location whose category is anything other than "Low Risk"
is_elevated_risk = tornado_count_by_location['risk_category'] != 'Low Risk'
high_risk_locations = tornado_count_by_location[is_elevated_risk]

# One color per entry of risk_colors, in dictionary order.
# NOTE(review): confirm hvplot assigns these colors to the categories in this
# same order, otherwise the colors will not match the labels.
elevated_palette = mcolors.ListedColormap(list(risk_colors.values()))

# Plot the points over OSM tiles using hvplot with geo=True
risk_map2 = high_risk_locations.hvplot(
    kind='points',
    x='longitude',
    y='latitude',
    c='risk_category',
    tiles='OSM',
    colorbar=True,
    cmap=elevated_palette,
    frame_width=800,
    frame_height=600,
    title='Tornado Risk Classification',
    geo=True,
)

# Display the map
risk_map2


In [None]:
# Linear trend of yearly occurrences in the merged tornado/park data.
# The earlier version referenced X_test, y_test and y_pred, which are not
# defined anywhere in this notebook, and plotted them against every row of
# merge_data_df. The yearly counts are now computed here so the cell runs
# on its own and x and y always have the same length.

# Use plain calendar years even if the column holds datetimes.
year_column = merge_data_df['year']
if pd.api.types.is_datetime64_any_dtype(year_column):
    year_column = year_column.dt.year

# Number of merged rows per year, in chronological order.
occurrences_by_year = year_column.value_counts().sort_index()
years = occurrences_by_year.index.to_numpy(dtype=float)
occurrences = occurrences_by_year.to_numpy(dtype=float)

# Scatter plot of the observed yearly counts
plt.scatter(years, occurrences, label='Actual')

# Fit and draw the linear regression line (it replaces the former
# 'Predicted' scatter, since the fitted values lie on this line).
slope, intercept, rvalue, pvalue, stderr = linregress(years, occurrences)
regress_values = years * slope + intercept
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"
plt.plot(years, regress_values, "r-", label=line_eq)

# Add labels and legend
plt.xlabel('Year')
plt.ylabel('Tornado Occurrences')
plt.title('Linear Regression: Tornado Occurrences vs Year')
plt.legend()

plt.show()