In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Attach Google Drive so files stored there are reachable from the notebook.
from google.colab import drive
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
# Read the raw CSV of all storms from the mounted Drive folder.
raw_csv_path = "/content/drive/My Drive/GDToT/ECO482_Project/Data/raw_storm_data.csv"
data = pd.read_csv(raw_csv_path)

In [15]:
# Show every column whenever a frame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Columns that are not useful for the rest of the notebook.
unused_columns = [
    'BEGIN_YEARMONTH', 'BEGIN_DAY', 'BEGIN_TIME',
    'END_YEARMONTH', 'END_DAY', 'END_TIME',
    'DATA_SOURCE', 'DURATION', 'DURATION_MINS', 'WFO', 'SOURCE',
    'MAGNITUDE', 'MAGNITUDE_TYPE', 'FLOOD_CAUSE', 'CATEGORY',
    'YEAR', 'MONTH_NAME',
    'TOR_OTHER_WFO', 'TOR_OTHER_CZ_STATE', 'TOR_OTHER_CZ_FIPS', 'TOR_OTHER_CZ_NAME',
    'BEGIN_RANGE', 'BEGIN_AZIMUTH', 'BEGIN_LOCATION',
    'END_RANGE', 'END_AZIMUTH', 'END_LOCATION',
    'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON',
]
data = data.drop(columns=unused_columns)

In [16]:
# Ratings that count as being on the EF scale for this analysis.
EF_SCALE = ['EF0', 'EF1', 'EF2', 'EF3', 'EF4', 'EF5', 'EFU']

# Discard rows with no TOR_F_SCALE value, then keep only the rows whose
# rating (compared as a string) is one of the EF_SCALE labels.
data = (
    data
    .dropna(subset=['TOR_F_SCALE'])
    .loc[lambda frame: frame['TOR_F_SCALE'].astype(str).isin(EF_SCALE)]
)

In [17]:
#creating a sentiment index based on event and episode narrative text columns

!pip install pandas textblob nltk afinn
import nltk
from afinn import Afinn
from textblob import TextBlob

# Fetch the NLTK data packages this notebook asks for.
for corpus_name in ('sentiwordnet', 'wordnet'):
    nltk.download(corpus_name)

# One shared AFINN scorer, reused for every narrative that gets scored.
afinn = Afinn()

#function to compute sentiment scores
def analyze_sentiment(text):
    """Score one piece of text with TextBlob and AFINN.

    Parameters
    ----------
    text : str
        The text to score. Any non-string value (for example the NaN that
        pandas uses for a missing narrative) is treated as "no text".

    Returns
    -------
    pd.Series
        Two entries labelled 'TextBlob_Score' and 'AFINN_Score'. Both are NaN
        when no text was supplied.
    """
    score_labels = ['TextBlob_Score', 'AFINN_Score']

    # A missing narrative arrives here as a NaN float. TextBlob rejects
    # non-string input, so report "no score" instead of aborting the whole
    # column-wide apply on a single empty row.
    if not isinstance(text, str):
        return pd.Series([float('nan'), float('nan')], index=score_labels)

    # TextBlob sentiment (polarity ranges from -1 to 1)
    blob_score = TextBlob(text).sentiment.polarity

    # AFINN sentiment (ranges from negative to positive integer scores)
    afinn_score = afinn.score(text)

    return pd.Series([blob_score, afinn_score], index=score_labels)


# Score the lower-cased event narratives row by row and store the two
# resulting scores as new columns on the frame.
narratives_lower = data['EVENT_NARRATIVE'].str.lower()
sentiment_scores = narratives_lower.apply(analyze_sentiment)
data[['TextBlob_Score', 'AFINN_Score']] = sentiment_scores



[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
# One-hot encode the three categorical columns; each original column is
# replaced by its indicator columns.
data = pd.get_dummies(data, columns=['STATE', 'CZ_TYPE', 'TOR_F_SCALE'])

# Recast every boolean column as integer 0/1.
bool_columns = [name for name in data.columns if data[name].dtype == 'bool']
data = data.astype({name: int for name in bool_columns})

In [19]:
# Inspect the column labels after one-hot encoding; left as a bare expression
# so the notebook displays the resulting Index.
data.columns

Index(['EPISODE_ID', 'EVENT_ID', 'STATE_FIPS', 'EVENT_TYPE', 'CZ_FIPS',
       'CZ_NAME', 'BEGIN_DATE_TIME', 'CZ_TIMEZONE', 'END_DATE_TIME',
       'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT',
       'DEATHS_INDIRECT', 'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'TOR_LENGTH',
       'TOR_WIDTH', 'EPISODE_NARRATIVE', 'EVENT_NARRATIVE', 'NET_DAMAGE',
       'BEGIN', 'END', 'DURATION_HOURS', 'TextBlob_Score', 'AFINN_Score',
       'STATE_ALABAMA', 'STATE_ALASKA', 'STATE_ARIZONA', 'STATE_ARKANSAS',
       'STATE_CALIFORNIA', 'STATE_COLORADO', 'STATE_CONNECTICUT',
       'STATE_DELAWARE', 'STATE_DISTRICT OF COLUMBIA', 'STATE_FLORIDA',
       'STATE_GEORGIA', 'STATE_HAWAII', 'STATE_IDAHO', 'STATE_ILLINOIS',
       'STATE_INDIANA', 'STATE_IOWA', 'STATE_KANSAS', 'STATE_KENTUCKY',
       'STATE_LOUISIANA', 'STATE_MAINE', 'STATE_MARYLAND',
       'STATE_MASSACHUSETTS', 'STATE_MICHIGAN', 'STATE_MINNESOTA',
       'STATE_MISSISSIPPI', 'STATE_MISSOURI', 'STATE_MONTANA',
       'STATE_NEBRASKA', 

In [20]:
# Columns left out of the cleaned table: the narrative text, the ID / FIPS /
# name / date-time columns listed below, the two raw damage columns, the
# AFINN score, and the CZ_TYPE_C indicator.
columns_to_exclude = [
    'EVENT_NARRATIVE', 'EPISODE_NARRATIVE',
    'EPISODE_ID', 'EVENT_ID',
    'STATE_FIPS', 'EVENT_TYPE', 'CZ_FIPS', 'CZ_NAME',
    'BEGIN_DATE_TIME', 'CZ_TIMEZONE', 'END_DATE_TIME',
    'DAMAGE_PROPERTY', 'DAMAGE_CROPS',
    'AFINN_Score', 'CZ_TYPE_C',
]
clean_data = data.drop(columns=columns_to_exclude)

In [21]:
# Display the cleaned frame for a visual check; left as a bare expression so
# the notebook renders it as a table.
clean_data

Unnamed: 0,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,TOR_LENGTH,TOR_WIDTH,NET_DAMAGE,BEGIN,END,DURATION_HOURS,TextBlob_Score,STATE_ALABAMA,STATE_ALASKA,STATE_ARIZONA,STATE_ARKANSAS,STATE_CALIFORNIA,STATE_COLORADO,STATE_CONNECTICUT,STATE_DELAWARE,STATE_DISTRICT OF COLUMBIA,STATE_FLORIDA,STATE_GEORGIA,STATE_HAWAII,STATE_IDAHO,STATE_ILLINOIS,STATE_INDIANA,STATE_IOWA,STATE_KANSAS,STATE_KENTUCKY,STATE_LOUISIANA,STATE_MAINE,STATE_MARYLAND,STATE_MASSACHUSETTS,STATE_MICHIGAN,STATE_MINNESOTA,STATE_MISSISSIPPI,STATE_MISSOURI,STATE_MONTANA,STATE_NEBRASKA,STATE_NEVADA,STATE_NEW HAMPSHIRE,STATE_NEW JERSEY,STATE_NEW MEXICO,STATE_NEW YORK,STATE_NORTH CAROLINA,STATE_NORTH DAKOTA,STATE_OHIO,STATE_OKLAHOMA,STATE_OREGON,STATE_PENNSYLVANIA,STATE_PUERTO RICO,STATE_RHODE ISLAND,STATE_SOUTH CAROLINA,STATE_SOUTH DAKOTA,STATE_TENNESSEE,STATE_TEXAS,STATE_UTAH,STATE_VERMONT,STATE_VIRGIN ISLANDS,STATE_VIRGINIA,STATE_WASHINGTON,STATE_WEST VIRGINIA,STATE_WISCONSIN,STATE_WYOMING,TOR_F_SCALE_EF0,TOR_F_SCALE_EF1,TOR_F_SCALE_EF2,TOR_F_SCALE_EF3,TOR_F_SCALE_EF4,TOR_F_SCALE_EF5,TOR_F_SCALE_EFU
2,0,0,0,0,6.70,400.0,150000.0,2024-05-19 18:39:00,2024-05-19 19:02:00,0.383333,-0.011054,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
18,0,0,0,0,0.32,50.0,120000.0,2024-11-05 11:01:00,2024-11-05 11:02:00,0.016667,-0.053750,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
105,0,0,0,0,2.58,110.0,0.0,2024-02-27 19:32:00,2024-02-27 19:35:00,0.050000,-0.011111,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
106,0,0,0,0,2.45,160.0,0.0,2024-02-27 19:34:00,2024-02-27 19:37:00,0.050000,0.015278,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
111,0,0,0,0,0.05,10.0,0.0,2024-02-27 18:15:00,2024-02-27 18:16:00,0.016667,-0.077778,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150112,0,0,0,0,12.24,500.0,250000.0,2007-03-01 12:31:00,2007-03-01 12:48:00,0.283333,0.026667,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1150122,0,0,0,0,0.90,400.0,100000.0,2007-03-01 16:06:00,2007-03-01 16:08:00,0.033333,0.188955,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1150126,4,0,1,0,7.69,448.0,500000.0,2007-03-01 17:29:00,2007-03-01 17:40:00,0.183333,-0.073016,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1150127,8,0,2,0,32.46,1790.0,110000000.0,2007-03-01 21:07:00,2007-03-01 21:36:00,0.483333,-0.018287,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [22]:
# Write the cleaned table to a CSV in the working directory, without the
# row index.
output_csv = 'clean_data.csv'
clean_data.to_csv(output_csv, index=False)

# Hand the written file to the browser as a download (Colab helper).
from google.colab import files
files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>