In [13]:
import warnings
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.linear_model import LogisticRegression as LR
from xgboost import XGBClassifier as XGB
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from imblearn.over_sampling import ADASYN

warnings.simplefilter(action='ignore')

### Tentativa de Predição se o asteroide é perigoso ou não

Definição da função para retornar as métricas

Importar os dados

In [14]:
df_asteroids = pd.read_csv('../data/df_asteroids_st2.csv', index_col= 0)

In [15]:
def scores(y_true, y_pred, features, sampling, model, sampling_method):

    if sampling_method == 'undersampling':
        sampling_method = f'US - {sampling}'  # Descreve a taxa de balanceamento
    elif sampling_method == 'oversampling':
        sampling_method = f'ADASYN - {sampling}'  # Método de amostragem ADASYN
    else:
        sampling_method = 'class_weight'
        
    return {
        "Features": features,
        "Algoritmo": str(model),
        "Acurácia": round(accuracy_score(y_true, y_pred),3),
        "Precisão": round(precision_score(y_true, y_pred, zero_division=0), 3),
        "Recall": round(recall_score(y_true, y_pred, zero_division=0), 3),
        "F1-Score": round(f1_score(y_true, y_pred),3),
        "AUC-ROC": round(roc_auc_score(y_true, y_pred),3),
        "Método": str(sampling_method)
    }

Variáveis para inputar nos modelos

In [16]:
sampling = [0.5, 0.6, 0.7]
sampling_method = ['undersampling', 'oversampling', 'class_weight']
model = [DT(random_state=0), RF(random_state=0), LR(random_state=0, max_iter=200), XGB(random_state=0)]

features = ['feat1', ['relative_velocity_kmh', 'estimated_diameter_mean', 'miss_distance_km', 'absolute_magnitude_h', 'is_sentry_object']]

products = []
for method in sampling_method:
    for samp in (sampling if method != 'class_weight' else [1]):
            for mod in model:
                    products.append({
                        'sampling_method': method,
                        'sampling': samp,
                        'model': mod,
                        'feature': features
                    })

Todas as combinações de variáveis  
Gerando um total de 27 modelos preditivos

In [17]:
pd.DataFrame(products)

Unnamed: 0,sampling_method,sampling,model,feature
0,undersampling,0.5,DecisionTreeClassifier(random_state=0),"[feat1, [relative_velocity_kmh, estimated_diam..."
1,undersampling,0.5,RandomForestClassifier(random_state=0),"[feat1, [relative_velocity_kmh, estimated_diam..."
2,undersampling,0.5,"LogisticRegression(max_iter=200, random_state=0)","[feat1, [relative_velocity_kmh, estimated_diam..."
3,undersampling,0.5,"XGBClassifier(base_score=None, booster=None, c...","[feat1, [relative_velocity_kmh, estimated_diam..."
4,undersampling,0.6,DecisionTreeClassifier(random_state=0),"[feat1, [relative_velocity_kmh, estimated_diam..."
5,undersampling,0.6,RandomForestClassifier(random_state=0),"[feat1, [relative_velocity_kmh, estimated_diam..."
6,undersampling,0.6,"LogisticRegression(max_iter=200, random_state=0)","[feat1, [relative_velocity_kmh, estimated_diam..."
7,undersampling,0.6,"XGBClassifier(base_score=None, booster=None, c...","[feat1, [relative_velocity_kmh, estimated_diam..."
8,undersampling,0.7,DecisionTreeClassifier(random_state=0),"[feat1, [relative_velocity_kmh, estimated_diam..."
9,undersampling,0.7,RandomForestClassifier(random_state=0),"[feat1, [relative_velocity_kmh, estimated_diam..."


Função principal para a criação dos modelos preditivos e testes   
Função baseada no RMMS[Giusti et al. 2022] (REGRESSIVE MULTI-DIMENSIONAL MODEL SELECTION)

In [27]:
def classification_predictions(df, features, model, sampling=None, sampling_method=None):

    X = df[features[1]]
    y = df['is_potentially_hazardous']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    if sampling_method == "undersampling":
        y_train = pd.concat([y_train[y_train==1], y_train[y_train==0].sample(int((len(y_train[y_train == 1]) * sampling)/(1 - sampling)), random_state=0)])
        X_train = X_train.loc[y_train.index]
    elif sampling_method == "oversampling":
        ada = ADASYN(sampling_strategy=sampling, random_state=0)
        X_train, y_train = ada.fit_resample(X_train, y_train)
    elif sampling_method == "class_weight":
        model.set_params(class_weight='balanced')

    # Treinamento do modelo
    model.fit(X_train, y_train)

    # Predições após o treinamento
    y_pred = model.predict(X_test)

    # Métricas avaliativas comparando o conjunto de teste e o conjunto previsto
    result = scores(y_test, y_pred, features[0], sampling, model, sampling_method=sampling_method)   
    return result

Chamada da função principal e armazenamento dos resultados

In [28]:
results = []
for product in products:
                results.append(classification_predictions(
                    df_asteroids, 
                    features=product['feature'], 
                    model=product['model'], 
                    sampling=product['sampling'], 
                    sampling_method=product['sampling_method'], 
                ))

Observando os resultados e ordenando pelo F1-Score e salvando no arquivo 'results_ml_st3.csv'

In [29]:
df_results = pd.DataFrame(results)
df_results.to_csv('../data/results_ml_st3.csv')
df_results.sort_values('F1-Score', ascending=False).reset_index(drop=True)

Unnamed: 0,Features,Algoritmo,Acurácia,Precisão,Recall,F1-Score,AUC-ROC,Método
0,feat1,RandomForestClassifier(class_weight='balanced'...,0.894,0.351,0.793,0.487,0.847,ADASYN - 0.7
1,feat1,RandomForestClassifier(class_weight='balanced'...,0.893,0.338,0.728,0.462,0.816,ADASYN - 0.6
2,feat1,RandomForestClassifier(class_weight='balanced'...,0.897,0.346,0.696,0.462,0.803,ADASYN - 0.5
3,feat1,"XGBClassifier(base_score=None, booster=None, c...",0.904,0.355,0.641,0.457,0.781,ADASYN - 0.5
4,feat1,DecisionTreeClassifier(class_weight='balanced'...,0.899,0.347,0.663,0.455,0.789,ADASYN - 0.7
5,feat1,DecisionTreeClassifier(class_weight='balanced'...,0.86,0.296,0.88,0.443,0.869,US - 0.6
6,feat1,RandomForestClassifier(class_weight='balanced'...,0.846,0.287,0.957,0.441,0.898,US - 0.6
7,feat1,RandomForestClassifier(class_weight='balanced'...,0.859,0.29,0.848,0.432,0.854,US - 0.7
8,feat1,"XGBClassifier(base_score=None, booster=None, c...",0.849,0.283,0.902,0.431,0.874,US - 0.6
9,feat1,"XGBClassifier(base_score=None, booster=None, c...",0.894,0.324,0.62,0.425,0.766,ADASYN - 0.7
