In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import load

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
df = pd.read_csv('data/combined_dfs.csv', delimiter=',')

In [5]:
df.dropna()

Unnamed: 0,Year,Annual_Cyclones,Sahel_Annual,AMO_Annual,ENSO_Annual,TEMP_Annual
89,1950,1,6.952756,-0.023750,-1.300,79.10
90,1951,2,2.996063,0.187333,-0.325,80.04
91,1952,1,6.259843,0.278583,0.375,79.24
92,1953,2,5.590551,0.247417,0.575,78.62
93,1954,1,6.688976,0.025750,0.225,79.44
...,...,...,...,...,...,...
157,2018,0,1.400000,0.033500,-0.750,81.02
158,2019,2,1.400000,0.139083,0.700,81.56
159,2020,1,1.600000,0.268167,0.400,81.00
160,2021,0,1.600000,0.222583,-0.850,80.06


In [6]:
hurricane_only_df = load("joblib_files/hurricane_only.joblib")

In [7]:
hurricane_only_df

Unnamed: 0,Cyclone,Year
1852,0,1852
1853,1,1853
1854,1,1854
1855,0,1855
1856,0,1856
...,...,...
2018,0,2018
2019,1,2019
2020,0,2020
2021,0,2021


In [8]:
conus_precip = pd.read_csv('data/conus_jul_sep_precip.csv', delimiter=',', skiprows=4)

In [9]:
conus_precip['Date'] = conus_precip['Date'].astype(str).str[:-2].astype(int)

In [10]:
conus_precip = conus_precip.drop(columns='Value')

In [11]:
conus_precip = conus_precip.rename(columns={'Date':'Year', 'Anomaly':'precip'})

In [12]:
conus_temp = pd.read_csv('data/conus_jul_sep_tmp.csv', delimiter=',', skiprows=4)

In [13]:
conus_temp = conus_temp.drop(columns='Value')
conus_temp['Date'] = conus_temp['Date'].astype(str).str[:-2].astype(int)
conus_temp = conus_temp.rename(columns={'Date':'Year', 'Anomaly':'Conus_temp'})
conus_temp

Unnamed: 0,Year,Conus_temp
0,1950,-1.94
1,1951,-0.30
2,1952,0.75
3,1953,0.62
4,1954,1.18
...,...,...
68,2018,1.97
69,2019,2.07
70,2020,1.90
71,2021,2.21


In [14]:
ne_us_temp = pd.read_csv('data/ne_us_jul_sep_tmp.csv', delimiter=',', skiprows=4)

In [15]:
ne_us_temp = ne_us_temp.drop(columns='Value')
ne_us_temp['Date'] = ne_us_temp['Date'].astype(str).str[:-2].astype(int)
ne_us_temp = ne_us_temp.rename(columns={'Date':'Year', 'Anomaly':'NE_temp'})
ne_us_temp

Unnamed: 0,Year,NE_temp
0,1950,-1.8
1,1951,-0.4
2,1952,1.6
3,1953,0.3
4,1954,-1.3
...,...,...
68,2018,3.9
69,2019,2.1
70,2020,2.8
71,2021,2.5


In [16]:
nao_sst = pd.read_csv('data/nao.txt', delim_whitespace=True, skiprows=1, names=['Year', 'nao_sst'])
nao_sst

Unnamed: 0,Year,nao_sst
0,1950,0.130
1,1951,0.198
2,1952,-0.262
3,1953,0.512
4,1954,-0.440
...,...,...
70,2020,-0.018
71,2021,-0.278
72,2022,0.040
73,2023,-1.204


In [17]:
nta_sst = pd.read_csv('data/nta_sst.txt', delim_whitespace=True)
nta_sst

Unnamed: 0,Year,nta_index
0,1950,-0.450
1,1951,-0.126
2,1952,-0.162
3,1953,-0.066
4,1954,-0.502
...,...,...
68,2018,-0.186
69,2019,0.130
70,2020,0.150
71,2021,0.150


In [18]:
tna_sst = pd.read_csv('data/TNA_sst.txt', delim_whitespace=True)
tna_sst

Unnamed: 0,Year,TNA_sst
0,1950,-0.142
1,1951,0.204
2,1952,0.248
3,1953,0.182
4,1954,-0.166
...,...,...
68,2018,-0.012
69,2019,0.420
70,2020,0.618
71,2021,0.330


In [19]:
amm_sst = pd.read_csv('data/AMM_sst.txt', delim_whitespace=True)
amm_sst

Unnamed: 0,Year,AMM_sst
0,1950,0.846
1,1951,1.338
2,1952,2.398
3,1953,1.258
4,1954,-0.442
...,...,...
68,2018,-2.470
69,2019,0.684
70,2020,1.178
71,2021,-1.480


In [20]:
nina_index = pd.read_csv('data/Nina_index.txt', delim_whitespace=True)
nina_index

Unnamed: 0,Year,Nina_index
0,1950,1.040
1,1951,0.824
2,1952,0.034
3,1953,0.774
4,1954,-0.730
...,...,...
68,2018,0.312
69,2019,0.282
70,2020,-0.668
71,2021,-0.550


In [21]:
tsa_index = pd.read_csv('data/TSA_index.txt', delim_whitespace=True)
tsa_index

Unnamed: 0,Year,TSA_index
0,1950,-0.356
1,1951,-0.048
2,1952,-0.026
3,1953,-0.032
4,1954,-0.402
...,...,...
68,2018,0.428
69,2019,0.488
70,2020,0.456
71,2021,0.830


In [22]:
nao_jones = pd.read_csv('data/nao_jones.txt', delim_whitespace=True)
nao_jones

Unnamed: 0,Year,NAO_Jones
0,1950,0.2850
1,1951,0.4000
2,1952,-1.1100
3,1953,1.2275
4,1954,1.2175
...,...,...
68,2018,1.0925
69,2019,-1.0300
70,2020,-0.5725
71,2021,-0.9250


In [23]:
rh_value = pd.read_csv('data/rh_mdr.txt', delim_whitespace=True)
rh_value

Unnamed: 0,Year,rh_value
0,1950,50.404
1,1951,41.816
2,1952,47.056
3,1953,44.510
4,1954,44.732
...,...,...
69,2019,42.053
70,2020,42.212
71,2021,39.227
72,2022,43.121


In [24]:
nj_precip = pd.read_csv('data/nj_precip.csv', delimiter=',', skiprows=4)
nj_precip = nj_precip.drop(columns='Anomaly')
nj_precip['Date'] = nj_precip['Date'].astype(str).str[:-2].astype(int)
nj_precip = nj_precip.rename(columns={'Date':'Year', 'Value':'NJ_precip'})
nj_precip

Unnamed: 0,Year,NJ_precip
0,1950,8.18
1,1951,4.89
2,1952,10.93
3,1953,4.86
4,1954,11.86
...,...,...
68,2018,13.13
69,2019,4.98
70,2020,10.50
71,2021,12.93


In [25]:
df = df.merge(conus_precip, on='Year', how='left')

In [26]:
df = df.merge(conus_temp, on='Year', how='left')
df = df.merge(ne_us_temp, on='Year', how='left')

In [27]:
df = df.merge(nao_sst, on='Year', how='left')

In [28]:
df = df.merge(nta_sst, on='Year', how='left')

In [29]:
df = df.merge(tna_sst, on='Year', how='left')

In [30]:
df = df.merge(amm_sst, on='Year', how='left')

In [31]:
df = df.merge(nina_index, on='Year', how='left')

In [32]:
df = df.merge(tsa_index, on='Year', how='left')

In [33]:
df = df.merge(nao_jones, on='Year', how='left')

In [34]:
df = df.merge(rh_value, on='Year', how='left')

In [35]:
df = df.merge(nj_precip, on='Year', how='left')

In [36]:
df = df.merge(hurricane_only_df, on='Year', how='left')

In [37]:
df = df.dropna()

In [38]:
df.to_csv('combined_df_30jul.csv')

In [39]:
df.tail(20)

Unnamed: 0,Year,Annual_Cyclones,Sahel_Annual,AMO_Annual,ENSO_Annual,TEMP_Annual,precip,Conus_temp,NE_temp,nao_sst,nta_index,TNA_sst,AMM_sst,Nina_index,TSA_index,NAO_Jones,rh_value,NJ_precip,Cyclone
142,2003,2,3.165354,0.197583,0.475,78.94,0.14,1.49,1.6,-0.104,0.31,0.526,1.518,0.136,0.402,-1.0575,35.292,11.18,0
143,2004,0,-0.988189,0.172417,0.275,79.78,1.19,-0.62,0.6,-0.068,0.318,0.524,2.76,0.552,0.138,-0.1825,36.387,10.88,0
144,2005,3,0.862205,0.25775,0.5,80.02,-0.25,1.77,3.3,0.068,0.484,0.754,4.12,-0.098,-0.132,0.2275,34.888,3.31,2
145,2006,1,0.192913,0.2325,-0.675,79.12,0.18,0.85,1.0,-0.724,0.364,0.568,1.302,0.336,0.308,-1.1575,37.163,9.61,0
146,2007,1,-0.622047,0.113333,0.125,80.34,-0.23,1.74,1.1,-0.004,-0.022,0.122,0.182,-0.85,0.238,-1.415,36.998,5.48,0
147,2008,1,2.322835,0.103833,-1.35,78.72,0.65,0.33,0.8,-0.468,0.178,0.458,0.796,-0.36,0.484,-0.85,35.414,8.5,0
148,2009,0,1.433071,0.004667,-0.625,81.0,0.42,0.11,-0.3,-0.476,0.194,0.378,-0.03,0.606,0.372,-0.5025,39.086,11.38,0
149,2010,0,3.515748,0.315083,0.975,80.64,0.73,1.56,2.8,-0.744,0.638,0.922,4.646,-1.252,0.518,-1.685,43.54,5.69,0
150,2011,1,-0.84252,0.06725,-1.05,79.64,-0.62,2.22,2.8,-0.54,0.226,0.462,1.836,-0.676,0.238,-0.38,41.68,22.79,0
151,2012,0,4.692913,0.178833,-0.675,79.2,-0.56,2.1,2.3,-1.418,0.358,0.526,2.346,0.25,-0.004,-1.6,41.928,9.07,0


In [40]:
# Drop rows with any NaN values
df_cleaned = df.copy()
df_cleaned = df_cleaned.dropna()

# Convert Annual_Cyclones to binary (0 or 1) indicating presence or absence of cyclones
df_cleaned['Cyclones_Binary'] = df_cleaned['Annual_Cyclones'].apply(lambda x: 1 if x > 0 else 0)

# Define the feature variables (X) and the target variable (y)
# Taking Conus_temp out
X = df_cleaned[['Sahel_Annual', 'AMO_Annual', 'ENSO_Annual', 'TEMP_Annual', 'precip',  
                'NE_temp', 'nao_sst', 'nta_index', 'TNA_sst', 'AMM_sst', 'Nina_index', 
                'NAO_Jones', 'rh_value', 'NJ_precip_x', 'NJ_precip_y']]
y = df_cleaned['Cyclones_Binary']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and fit the logistic regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_reg.predict(X_test)

# Evaluate the model
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the confusion matrix and classification report
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


KeyError: "['NJ_precip_x', 'NJ_precip_y'] not in index"

In [None]:
df_cleaned.corr()

Unnamed: 0,Year,Annual_Cyclones,Sahel_Annual,AMO_Annual,ENSO_Annual,TEMP_Annual,precip,Conus_temp,NE_temp,nao_sst,nta_index,TNA_sst,AMM_sst,Nina_index,TSA_index,NAO_Jones,rh_value,NJ_precip_x,NJ_precip_y,Cyclones_Binary
Year,1.0,-0.040246,-0.314554,0.326512,-0.045166,0.57102,0.201689,0.579306,0.549313,-0.242964,0.666542,0.570931,0.118835,-0.138668,0.600965,-0.286353,-0.251235,0.063397,0.063397,-0.074668
Annual_Cyclones,-0.040246,1.0,0.047447,-0.002468,-0.12716,-0.115234,-0.212236,-0.030828,-0.020191,0.044508,-0.046992,-0.018859,0.012057,-0.213207,0.042459,0.080502,-0.055641,-0.179382,-0.179382,0.785847
Sahel_Annual,-0.314554,0.047447,1.0,0.411304,0.001391,-0.02241,-0.048794,0.028598,-0.023741,-0.136303,0.008489,0.114508,0.376326,-0.078993,-0.310548,0.092733,0.7009,0.073126,0.073126,0.189406
AMO_Annual,0.326512,-0.002468,0.411304,1.0,0.231392,0.526158,-0.143148,0.688945,0.500265,-0.39166,0.768829,0.824008,0.727346,-0.102288,0.264368,-0.154812,0.53788,-0.03477,-0.03477,0.081061
ENSO_Annual,-0.045166,-0.12716,0.001391,0.231392,1.0,0.255217,-0.005654,0.211561,0.174067,-0.154341,0.281124,0.235877,0.259766,0.125943,-0.101662,0.0939,-0.022111,-0.393159,-0.393159,-0.159631
TEMP_Annual,0.57102,-0.115234,-0.02241,0.526158,0.255217,1.0,0.171351,0.608772,0.557801,-0.293099,0.55098,0.529827,0.257838,-0.116778,0.445269,-0.081788,0.154396,-0.016793,-0.016793,-0.150014
precip,0.201689,-0.212236,-0.048794,-0.143148,-0.005654,0.171351,1.0,-0.2689,0.121395,0.125056,0.004235,-0.036598,-0.122925,0.055338,0.160363,0.146193,-0.072863,0.073865,0.073865,-0.195048
Conus_temp,0.579306,-0.030828,0.028598,0.688945,0.211561,0.608772,-0.2689,1.0,0.692757,-0.286317,0.654776,0.640079,0.389095,-0.255981,0.390268,-0.213471,0.209727,0.023009,0.023009,-0.054702
NE_temp,0.549313,-0.020191,-0.023741,0.500265,0.174067,0.557801,0.121395,0.692757,1.0,-0.126536,0.531683,0.498723,0.268246,-0.264611,0.438849,-0.203938,0.156407,0.066374,0.066374,-0.133952
nao_sst,-0.242964,0.044508,-0.136303,-0.39166,-0.154341,-0.293099,0.125056,-0.286317,-0.126536,1.0,-0.414569,-0.455384,-0.331506,-0.021549,-0.221531,0.42196,-0.130791,0.078,0.078,0.059879


In [42]:
df_cleaned.describe()

Unnamed: 0,Year,Annual_Cyclones,Sahel_Annual,AMO_Annual,ENSO_Annual,TEMP_Annual,precip,Conus_temp,NE_temp,nao_sst,nta_index,TNA_sst,AMM_sst,Nina_index,TSA_index,NAO_Jones,rh_value,NJ_precip,Cyclone,Cyclones_Binary
count,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0
mean,1986.0,1.041096,1.531399,-0.005741,-0.020548,79.251233,0.131918,0.419315,0.591781,0.048411,-0.102055,0.131178,0.205616,0.034575,0.091699,-0.192534,37.951192,8.440548,0.232877,0.643836
std,21.217131,0.992169,2.571764,0.1949,0.777576,0.861736,0.732506,1.088853,1.508196,0.583351,0.315119,0.312885,1.84741,0.749521,0.329996,0.855123,5.053764,3.358072,0.54059,0.482179
min,1950.0,0.0,-4.011811,-0.4475,-1.4,77.2,-1.49,-1.94,-2.4,-1.418,-0.75,-0.492,-4.648,-1.284,-0.666,-1.845,28.112,3.31,0.0,0.0
25%,1968.0,0.0,-0.200787,-0.139333,-0.625,78.7,-0.23,-0.38,-0.5,-0.384,-0.354,-0.14,-0.74,-0.518,-0.132,-0.8675,33.867,5.92,0.0,0.0
50%,1986.0,1.0,1.468504,0.019333,0.025,79.18,0.07,0.33,0.3,0.046,-0.128,0.122,0.46,-0.056,0.138,-0.115,37.288,7.93,0.0,1.0
75%,2004.0,2.0,3.515748,0.139083,0.45,79.74,0.65,1.21,1.6,0.436,0.15,0.378,1.258,0.482,0.372,0.49,41.68,10.5,0.0,1.0
max,2022.0,4.0,6.952756,0.339667,1.775,81.56,1.53,2.83,3.9,1.708,0.638,0.922,4.646,1.838,0.83,1.755,50.404,22.79,2.0,1.0
