In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score


### Slight Cleaning 

- Set all string variables to have uniformly lower case letters
- Try out setting latitude longitude to numeric
- Drop missing lat/long

In [47]:
data = pd.read_csv('../data/ntsb/cleaned/ntsb_train_cleaned.csv')

for col in data.columns:
    if data[col].dtype == 'object': 
        data[col] = data[col].str.lower()


data = data[~(data['damage']=='UNK')]

for i in ['latitude','longitude']:
    data[i].replace('other/unknown',np.nan,inplace=True)
data = data.dropna().reset_index()

data['latitude'] = data['latitude'].str[:-1].astype('int')
data['longitude'] = data['longitude'].str[:-1].astype('int')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[i].replace('other/unknown',np.nan,inplace=True)


In [48]:
features = ['latitude', 'longitude',
       'apt_dist', 'gust_kts', 'altimeter',
       'aircraft_count', 'num_eng', 'days_since_insp', 'light_cond_DAYL', 'light_cond_DUSK',
       'light_cond_NDRK', 'light_cond_NITE', 'light_cond_other/unknown',
       'BroadPhaseofFlight_Air', 'BroadPhaseofFlight_Ground',
       'BroadPhaseofFlight_Landing', 'BroadPhaseofFlight_Takeoff',
       'BroadPhaseofFlight_other/unknown', 'eng_type_REC', 'eng_type_TF',
       'eng_type_TP', 'eng_type_TS', 'eng_type_other/unknown', 'far_part_091',
       'far_part_121', 'far_part_135', 'far_part_137', 'far_part_PUBU',
       'far_part_other/unknown', 'acft_make_beech', 'acft_make_bell',
       'acft_make_boeing', 'acft_make_cessna', 'acft_make_mooney',
       'acft_make_other/unknown', 'acft_make_piper',
       'acft_make_robinson helicopter', 'acft_category_AIR',
       'acft_category_HELI', 'acft_category_other/unknown', 'homebuilt_N',
       'homebuilt_Y', 'homebuilt_other/unknown', 'fixed_retractable_FIXD',
       'fixed_retractable_RETR', 'fixed_retractable_other/unknown',
       'second_pilot_N', 'second_pilot_Y', 'second_pilot_other/unknown']

target = ['damage']

X = data[features]
y = data[target]

### Train test split

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size=0.3,
                                                    random_state=2025)

In [56]:
y_train

Unnamed: 0,damage
4273,subs
9946,subs
2411,subs
4947,subs
6668,dest
...,...
4256,subs
10124,dest
5331,subs
323,dest


### Grid search: Histogram Gradient Boost & Extra Trees Classifiers

In [57]:
histgrad = HistGradientBoostingClassifier()

# GridSearch 
param_grid = {
    'learning_rate': [0.01,0.05,0.1,0.5,1],
    'max_iter': [100,200,500],
    'max_leaf_nodes': [3,6]
}

grid = GridSearchCV(
    histgrad,
    param_grid,
    scoring='f1_macro',
    cv=5,
)

grid.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [58]:
grid.best_estimator_

In [63]:
grid.best_params_

{'learning_rate': 0.05, 'max_iter': 100, 'max_leaf_nodes': 6}

In [64]:
grid.best_score_

np.float64(0.42810826129894314)

In [None]:
histgrad = ExtraTreesClassifier()

# GridSearch 
param_grid = {
    'learning_rate': [0.01,0.05,0.1,0.5,1],
    'max_iter': [100,200,500]
    'max_leaf_nodes': [3,6]
}

grid = GridSearchCV(
    histgrad,
    param_grid,
    scoring='f1_macro',
    cv=5,
)

grid.fit(X_train,y_train)