In [29]:

import pandas as pd
import numpy as np

from pycaret.classification import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [30]:
# Carregando os dados
ais_data_1 = pd.read_csv(r'..\Data\ais_data.csv')
ais_data_2 = pd.read_csv(r'..\Data\AIS_UNACORN_Seatracks_past12-hours.csv')

# Removendo colunas desnecessárias e tratando valores ausentes
ais_data_1_cleaned = ais_data_1.drop(columns=["Unnamed: 0"]).dropna(subset=["sog", "cog", "heading", "shiptype"])
ais_data_2_cleaned = ais_data_2.drop(columns=["Unnamed: 0"]).dropna(subset=["sog", "cog", "heading", "imonumber"])

In [31]:
# Função para remover outliers usando o método IQR
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

In [32]:


def valores_vazios(col):
    # Corrigindo o nome do método para value_counts()
    percentual = col.value_counts(normalize=True).to_dict()
    percent = [percentual[key] for key in percentual]
    labels = [key for key in percentual]
    
    # Usando pd.Series em vez de pd.series e corrigindo a geração da série s
    s = pd.Series(np.random.choice(labels, p=percent, size=col.isnull().sum()), index=col.index[col.isnull()])
    
    # Preenchendo os valores NaN com a série s
    col.fillna(s, inplace=True)
    
    # Verificando se ainda há valores NaN e preenchendo-os com o valor mais frequente
    if col.isnull().sum() > 0:
        col.fillna(value=max(percentual, key=percentual.get), inplace=True)
        
    return col


In [33]:
def check_and_rename_duplicate_columns(df):
    cols = pd.Series(df.columns)
    for dup in df.columns[df.columns.duplicated(keep=False)]:  # identifica duplicatas
        cols[df.columns.get_loc(dup)] = [dup + '_' + str(d_idx) if d_idx != 0 else dup for d_idx in range(df.columns.get_loc(dup).sum())]
    df.columns = cols

In [34]:
# Aplicando a função para remover outliers de 'sog' e 'cog'
ais_data_1_cleaned = remove_outliers(ais_data_1_cleaned, 'sog')
ais_data_1_cleaned = remove_outliers(ais_data_1_cleaned, 'cog')
ais_data_2_cleaned = remove_outliers(ais_data_2_cleaned, 'sog')
ais_data_2_cleaned = remove_outliers(ais_data_2_cleaned, 'cog')



for col in ais_data_1_cleaned.columns[1:]:  # Ajuste conforme necessário
    if ais_data_1_cleaned[col].dtype == 'O':  # Verificando o tipo de dados da coluna
        ais_data_1_cleaned[col] = valores_vazios(ais_data_1_cleaned[col])
    else:
        # Para colunas numéricas, preencher com a mediana
        ais_data_1_cleaned[col].fillna(value=ais_data_1_cleaned[col].median(), inplace=True)
        

for col in ais_data_2_cleaned.columns[1:]:  # Ajuste conforme necessário
    if ais_data_2_cleaned[col].dtype == 'O':  # Verificando o tipo de dados da coluna
        ais_data_2_cleaned[col] = valores_vazios(ais_data_2_cleaned[col])
    else:
        # Para colunas numéricas, preencher com a mediana
        ais_data_2_cleaned[col].fillna(value=ais_data_2_cleaned[col].median(), inplace=True)

In [35]:
# Exibindo os primeiros registros para verificar a limpeza
# display(ais_data_1_cleaned.head(5))
# display(ais_data_2_cleaned.head(5))

In [36]:
# Identificando colunas numéricas e categóricas
numeric_features = ['sog', 'cog', 'length', 'width', 'draught']
categorical_features = ['shiptype', 'navigationalstatus']

# Criando transformadores para dados numéricos e categóricos
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Continuando com a preparação dos dados para o PyCaret
X = ais_data_1_cleaned[numeric_features + categorical_features]
y = ais_data_1_cleaned['navigationalstatus']  # Supondo que 'navigationalstatus' é a variável alvo
ais_data_1_cleaned_prepared = pd.concat([X, y], axis=1)

In [37]:
# Combinando transformadores em um ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Criando um pipeline com o pré-processador
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [39]:
check_and_rename_duplicate_columns(ais_data_1_cleaned_prepared)


# Supondo que 'navigationalstatus_1' é a variável alvo devido ao renomeamento
y = ais_data_1_cleaned['navigationalstatus']

# Assegure-se de não incluir a coluna alvo nas características
X = ais_data_1_cleaned[numeric_features + categorical_features]
ais_data_1_cleaned_prepared = pd.concat([X, y], axis=1)

# Atualizando a lista de características categóricas, removendo a coluna alvo
categorical_features = [feature for feature in categorical_features if feature != 'navigationalstatus']
categorical_features.append('navigationalstatus')  # Adicionando o novo nome se necessário

def check_and_rename_duplicate_columns(df):
    cols = pd.Series(df.columns)
    for col in cols[cols.duplicated()].unique():  # Apenas para colunas duplicadas
        cs = [f"{col}_{i}" if i != 0 else col for i in range(cols[cols == col].count())]  # Renomear cada uma
        cols[cols[cols == col].index] = cs
    df.columns = cols


if ais_data_1_cleaned_prepared.columns.duplicated().sum() > 0:
    check_and_rename_duplicate_columns(ais_data_1_cleaned_prepared)




In [41]:
# Configurando o ambiente em PyCaret
setup(data=ais_data_1_cleaned_prepared, target='navigationalstatus_1', session_id=123,
      numeric_features=numeric_features, categorical_features=categorical_features,
      preprocess=False)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,navigationalstatus_1
2,Target type,Multiclass
3,Original data shape,"(318334, 8)"
4,Transformed data shape,"(318334, 8)"
5,Transformed train set shape,"(222833, 8)"
6,Transformed test set shape,"(95501, 8)"
7,Numeric features,5
8,Categorical features,2


<pycaret.classification.oop.ClassificationExperiment at 0x2993bf0bbd0>

In [42]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.917,0.4,0.917,0.8409,0.8773,0.0,0.0,0.283
lightgbm,Light Gradient Boosting Machine,0.8835,0.5613,0.8835,0.8909,0.8853,0.2998,0.3072,6.653


In [43]:
# Ajustando o melhor modelo
tuned_model = tune_model(best_model)

# Avaliando o modelo ajustado
evaluate_model(tuned_model)

# Fazendo previsões no conjunto de teste
predictions = predict_model(tuned_model)

# Finalizando o modelo para salvar ou fazer mais previsões
final_model = finalize_model(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.917,0.5,0.917,0.8409,0.8773,0.0,0.0
1,0.917,0.5,0.917,0.8409,0.8773,0.0,0.0
2,0.917,0.5,0.917,0.8409,0.8773,0.0,0.0
3,0.917,0.0,0.917,0.8409,0.8773,0.0,0.0
4,0.917,0.0,0.917,0.8409,0.8773,0.0,0.0
5,0.917,0.5,0.917,0.8409,0.8773,0.0,0.0
6,0.917,0.5,0.917,0.8409,0.8773,0.0,0.0
7,0.917,0.5,0.917,0.8409,0.8773,0.0,0.0
8,0.917,0.5,0.917,0.8409,0.8773,0.0,0.0
9,0.917,0.5,0.917,0.8409,0.8773,0.0,0.0


Fitting 10 folds for each of 4 candidates, totalling 40 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Dummy Classifier,0.917,0,0.917,0.8409,0.8773,0.0,0.0


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
# Salvando o modelo treinado
save_model(final_model, 'final_optimization_model')

# Se necessário, você pode integrar o modelo finalizado do PyCaret de volta ao seu pipeline do scikit-learn
final_sklearn_model = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('classifier', final_model)])

In [44]:
# Supondo que você esteja usando ais_data_2_cleaned como dados de entrada para previsão
# Primeiro, confirme que ais_data_2_cleaned está preparado corretamente:

if 'navigationalstatus' in ais_data_2_cleaned.columns:
    ais_data_2_cleaned.drop(columns=['navigationalstatus'], inplace=True)  # Remova a coluna alvo se existir

# Agora, certifique-se de que as colunas são as mesmas:
print("Colunas esperadas:", ais_data_1_cleaned.columns)
print("Colunas de teste:", ais_data_2_cleaned.columns)


Colunas esperadas: Index(['mmsi', 'navigationalstatus', 'sog', 'cog', 'heading', 'shiptype',
       'width', 'length', 'draught'],
      dtype='object')
Colunas de teste: Index(['cog', 'sog', 'beam', 'callsign', 'cargo', 'heading', 'imonumber',
       'latitude', 'length', 'longitude', 'mmsi', 'name', 'navstatus',
       'timeoffix', 'vesseltype', 'date'],
      dtype='object')


In [45]:
# Ajuste na chamada da função predict_model
predictions = predict_model(tuned_model, data=ais_data_2_cleaned)


KeyError: "['width', 'draught', 'shiptype', 'navigationalstatus'] not in index"

In [46]:
# Antes de usar predict_model, vamos assegurar que os índices são válidos
ais_data_2_cleaned.reset_index(drop=True, inplace=True)


In [47]:
try:
    predictions = predict_model(tuned_model, data=ais_data_2_cleaned)
    print(predictions.head())
except Exception as e:
    print("Ocorreu um erro durante a predição:", e)


Ocorreu um erro durante a predição: "['width', 'draught', 'shiptype', 'navigationalstatus'] not in index"
