In [16]:

import pandas as pd
import numpy as np

from pycaret.classification import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Carregando os dados
ais_data_1 = pd.read_csv(r'..\Data\ais_data.csv')
ais_data_2 = pd.read_csv(r'..\Data\AIS_UNACORN_Seatracks_past12-hours.csv')

# Removendo colunas desnecessárias e tratando valores ausentes
ais_data_1_cleaned = ais_data_1.drop(columns=["Unnamed: 0"]).dropna(subset=["sog", "cog", "heading", "shiptype"])
ais_data_2_cleaned = ais_data_2.drop(columns=["Unnamed: 0"]).dropna(subset=["sog", "cog", "heading", "imonumber"])

In [3]:
# Função para remover outliers usando o método IQR
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

In [11]:


def valores_vazios(col):
    # Corrigindo o nome do método para value_counts()
    percentual = col.value_counts(normalize=True).to_dict()
    percent = [percentual[key] for key in percentual]
    labels = [key for key in percentual]
    
    # Usando pd.Series em vez de pd.series e corrigindo a geração da série s
    s = pd.Series(np.random.choice(labels, p=percent, size=col.isnull().sum()), index=col.index[col.isnull()])
    
    # Preenchendo os valores NaN com a série s
    col.fillna(s, inplace=True)
    
    # Verificando se ainda há valores NaN e preenchendo-os com o valor mais frequente
    if col.isnull().sum() > 0:
        col.fillna(value=max(percentual, key=percentual.get), inplace=True)
        
    return col


In [4]:
# Aplicando a função para remover outliers de 'sog' e 'cog'
ais_data_1_cleaned = remove_outliers(ais_data_1_cleaned, 'sog')
ais_data_1_cleaned = remove_outliers(ais_data_1_cleaned, 'cog')
ais_data_2_cleaned = remove_outliers(ais_data_2_cleaned, 'sog')
ais_data_2_cleaned = remove_outliers(ais_data_2_cleaned, 'cog')




In [12]:
# Supondo que data_clean seja o seu DataFrame

for col in ais_data_1_cleaned.columns[1:]:  # Ajuste conforme necessário
    if ais_data_1_cleaned[col].dtype == 'O':  # Verificando o tipo de dados da coluna
        ais_data_1_cleaned[col] = valores_vazios(ais_data_1_cleaned[col])
    else:
        # Para colunas numéricas, preencher com a mediana
        ais_data_1_cleaned[col].fillna(value=ais_data_1_cleaned[col].median(), inplace=True)

In [13]:
# Supondo que data_clean seja o seu DataFrame

for col in ais_data_2_cleaned.columns[1:]:  # Ajuste conforme necessário
    if ais_data_2_cleaned[col].dtype == 'O':  # Verificando o tipo de dados da coluna
        ais_data_2_cleaned[col] = valores_vazios(ais_data_2_cleaned[col])
    else:
        # Para colunas numéricas, preencher com a mediana
        ais_data_2_cleaned[col].fillna(value=ais_data_2_cleaned[col].median(), inplace=True)

In [15]:
# Exibindo os primeiros registros para verificar a limpeza
# display(ais_data_1_cleaned.head(5))
# display(ais_data_2_cleaned.head(5))

# ----------------------------------------------------------------

In [22]:
# import pandas as pd

# # Suponha que 'ais_data_1_cleaned_prepared' é o DataFrame que você está tentando usar
# def check_and_rename_duplicate_columns(df):
#     cols = pd.Series(df.columns)
#     for dup in df.columns[df.columns.duplicated(keep=False)]:  # identifica duplicatas
#         cols[df.columns.get_loc(dup)] = [dup + '_' + str(d_idx) if d_idx != 0 else dup for d_idx in range(df.columns.get_loc(dup).sum())]
#     df.columns = cols

# Aplicando a função para garantir que não há colunas duplicadas
check_and_rename_duplicate_columns(ais_data_1_cleaned_prepared)

# Agora, continue com a configuração do PyCaret
from pycaret.classification import setup, compare_models

setup(data=ais_data_1_cleaned_prepared, target='navigationalstatus', session_id=123,
      numeric_features=['sog', 'cog', 'length', 'width', 'draught'],
      categorical_features=['shiptype'],
      preprocess=False)  # Desativar o pré-processamento automático já que usamos scikit-learn para isso

# Comparando modelos para encontrar o melhor
best_model = compare_models()


Unnamed: 0,Description,Value
0,Session id,123
1,Target,navigationalstatus
2,Target type,Multiclass
3,Original data shape,"(318334, 8)"
4,Transformed data shape,"(318334, 8)"
5,Transformed train set shape,"(222833, 8)"
6,Transformed test set shape,"(95501, 8)"
7,Numeric features,5
8,Categorical features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.917,0.4,0.917,0.8409,0.8773,0.0,0.0,0.172
lightgbm,Light Gradient Boosting Machine,0.8698,0.5386,0.8698,0.886,0.8757,0.2513,0.2581,4.883


In [21]:



# Configurando o ambiente em PyCaret
setup(data=ais_data_1_cleaned_prepared, target='navigationalstatus', session_id=123,
      numeric_features=numeric_features, categorical_features=categorical_features,
      preprocess=False)  # Desativar o pré-processamento automático já que usamos scikit-learn para isso

# Comparando modelos para encontrar o melhor
best_model = compare_models()


ValueError: Duplicate column names found in X.

In [17]:
# Identificando colunas numéricas e categóricas
numeric_features = ['sog', 'cog', 'length', 'width', 'draught']
categorical_features = ['shiptype', 'navigationalstatus']

# Criando transformadores para dados numéricos e categóricos
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [18]:
# Combinando transformadores em um ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Criando um pipeline com o pré-processador
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [19]:
# Preparando dados para PyCaret
X = ais_data_1_cleaned[numeric_features + categorical_features]
y = ais_data_1_cleaned['navigationalstatus']  # Supondo que 'navigationalstatus' é a variável alvo
ais_data_1_cleaned_prepared = pd.concat([X, y], axis=1)


In [20]:
# Configurando o ambiente em PyCaret
setup(data=ais_data_1_cleaned_prepared, target='navigationalstatus', session_id=123,
      numeric_features=numeric_features, categorical_features=categorical_features,
      preprocess=False)  # Desativar o pré-processamento automático já que usamos scikit-learn para isso

# Comparando modelos para encontrar o melhor
best_model = compare_models()




ValueError: Duplicate column names found in X.

In [23]:
# Ajustando o melhor modelo
tuned_model = tune_model(best_model)

# Avaliando o modelo ajustado
evaluate_model(tuned_model)

# Fazendo previsões no conjunto de teste
predictions = predict_model(tuned_model)

# Finalizando o modelo para salvar ou fazer mais previsões
final_model = finalize_model(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.917,0.5,0.917,0.8409,0.8773,0.0,0.0
1,0.917,0.5,0.917,0.8409,0.8773,0.0,0.0
2,0.917,0.5,0.917,0.8409,0.8773,0.0,0.0
3,0.917,0.0,0.917,0.8409,0.8773,0.0,0.0
4,0.917,0.0,0.917,0.8409,0.8773,0.0,0.0
5,0.917,0.5,0.917,0.8409,0.8773,0.0,0.0
6,0.917,0.5,0.917,0.8409,0.8773,0.0,0.0
7,0.917,0.5,0.917,0.8409,0.8773,0.0,0.0
8,0.917,0.5,0.917,0.8409,0.8773,0.0,0.0
9,0.917,0.5,0.917,0.8409,0.8773,0.0,0.0


Fitting 10 folds for each of 4 candidates, totalling 40 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Dummy Classifier,0.917,0,0.917,0.8409,0.8773,0.0,0.0


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:


# Salvando o modelo treinado
save_model(final_model, 'final_optimization_model')

# Se necessário, você pode integrar o modelo finalizado do PyCaret de volta ao seu pipeline do scikit-learn
final_sklearn_model = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('classifier', final_model)])