# Importações


In [62]:
import urllib
from itertools import product
from os import getenv
from sqlalchemy import create_engine
from dotenv import load_dotenv

import numpy as np
import pandas as pd
from datetime import datetime, timedelta, time
import plotly.graph_objects as go
import plotly.express as px
from fuzzywuzzy import process
from enum import Enum

# Holiday table loaded once at import time; CleanData.dayofweek_adjust reads
# its "feriados" column to flag holiday dates.
FERIADOS = pd.read_csv("../assets/feriados.csv")

### Colors


In [50]:
class BSColorsEnum(Enum):
    """
    Named hex colour codes used by the charts.

    NOTE(review): the names and values look like the Bootstrap 5 palette --
    confirm before relying on that.

    Members that repeat an earlier value are Enum aliases of the first member
    defined with that value: they compare equal to it, report its name and are
    skipped when iterating over the Enum.
    """

    DANGER_COLOR = "#dc3545"
    WARNING_COLOR = "#ffc107"
    SUCCESS_COLOR = "#198754"
    GREY_500_COLOR = "#adb5bd"
    GREY_600_COLOR = "#6c757d"
    GREY_700_COLOR = "#495057"
    GREY_800_COLOR = "#343a40"
    GREY_900_COLOR = "#212529"
    PRIMARY_COLOR = "#0d6efd"
    SECONDARY_COLOR = "#6c757d"  # same value as GREY_600_COLOR -> alias
    INFO_COLOR = "#0dcaf0"
    GRAY_COLOR = "#adb5bd"  # same value as GREY_500_COLOR -> alias
    TEAL_COLOR = "#20c997"
    ORANGE_COLOR = "#fd7e14"
    INDIGO_COLOR = "#6610f2"
    PINK_COLOR = "#d63384"
    PURPLE_COLOR = "#6f42c1"
    GREY_400_COLOR = "#ced4da"
    SPACE_CADET_COLOR = "#282f44"
    BLUE_DELFT_COLOR = "#0d6efd"  # same value as PRIMARY_COLOR -> alias

# Database


## Conexão com o banco de dados


In [51]:
# database/connection.py

# cSpell: disable=invalid-name
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()


class Connection:
    """
    Build SQLAlchemy engines for the AUTOMACAO SQL Server database.

    The connection settings are read from environment variables so that no
    credential is stored in the source code:

    * ``DB_USER``     -- login name (required)
    * ``DB_PASSWORD`` -- login password (required)
    * ``DB_DATABASE`` -- database name, defaults to ``AUTOMACAO``
    * ``DB_DRIVER``   -- ODBC driver, defaults to
      ``{ODBC Driver 17 for SQL Server}``
    * ``DB_SERVER``   -- server host, defaults to ``srv-sqlserver``

    Usage:
        >>> connection = Connection()
        >>> engine = connection.get_connection_automacao()
    """

    def __init__(self):
        """
        Read the connection settings from the environment.

        The constructor takes no arguments. Missing credentials do not raise
        here; they are reported by ``get_connection_automacao``.
        """
        self.__user = getenv("DB_USER")
        self.__password = getenv("DB_PASSWORD")
        self.__database = getenv("DB_DATABASE", "AUTOMACAO")
        self.__driver = getenv("DB_DRIVER", "{ODBC Driver 17 for SQL Server}")
        self.__server = getenv("DB_SERVER", "srv-sqlserver")

    def get_connection_automacao(self):
        """
        Create a SQLAlchemy engine for the configured database.

        Returns:
            object: the engine returned by ``create_engine``, or ``None`` when
            the credentials are not configured or the engine cannot be built
            (the error is printed).
        """
        # Imported here because a bare ``import urllib`` does not guarantee
        # that the ``urllib.parse`` submodule is loaded.
        from urllib.parse import quote_plus

        if not self.__user or not self.__password:
            print("Error: database credentials not configured (set DB_USER and DB_PASSWORD)")
            return None

        try:
            # The ODBC connection string must be URL-encoded before it is
            # embedded in the SQLAlchemy URL.
            params = quote_plus(
                f"DRIVER={self.__driver};"
                f"SERVER={self.__server};"
                f"DATABASE={self.__database};"
                f"UID={self.__user};"
                f"PWD={self.__password};"
            )
            return create_engine(f"mssql+pyodbc:///?odbc_connect={params}")
        # pylint: disable=broad-except
        except Exception as error:
            print(f"Error: {error}")
            return None

## Leitura do banco de dados


In [52]:
# database/db_read.py


# cSpell: disable=invalid-name
class Read(Connection):
    """
    Read helper for the AUTOMACAO database.

    Builds simple ``SELECT *`` statements and runs queries, returning the
    result as a pandas DataFrame.
    """

    # pylint: disable=useless-super-delegation
    def __init__(self):
        """
        Initialise the underlying connection settings.
        """
        super().__init__()

    def get_automacao_data(self, query: str) -> pd.DataFrame:
        """
        Run a query against the AUTOMACAO database.

        Parameters
        ----------
        query : str
            SQL statement to execute.

        Returns
        -------
        pandas dataframe
            The query result, or ``None`` when anything fails (the error is
            printed instead of raised).
        """
        try:
            return pd.read_sql(query, self.get_connection_automacao())
        # pylint: disable=broad-except
        except Exception as error:
            print(f"Error: {error}")
            return None

    def create_automacao_query(self, table: str, where: str = None, orderby: str = None) -> str:
        """
        Build a ``SELECT *`` statement for a table of the AUTOMACAO database.

        Parameters
        ----------
        table : str
            Table name (placed after ``AUTOMACAO.dbo.``).
        where : str
            Text of the WHERE clause, without the keyword (optional).
        orderby : str
            Text of the ORDER BY clause, without the keyword (optional).

        Returns
        -------
        str
            The assembled SQL statement.
        """
        parts = [f"SELECT * FROM AUTOMACAO.dbo.{table}"]

        # Optional clauses are appended only when a non-empty value is given.
        if where:
            parts.append(f"WHERE {where}")
        if orderby:
            parts.append(f"ORDER BY {orderby}")

        return " ".join(parts)

## Query para o banco de dados


In [53]:
# database/get_data.py
# cSpell: disable=invalid-name
class GetData:
    """
    Load the machine tables from the AUTOMACAO database.

    The original notes say this loader is meant to run in the background so
    the user does not notice the reads.
    """

    def __init__(self):
        self.db_read = Read()

    def get_data(self) -> tuple:
        """
        Read occurrences, status samples and per-shift production since the
        first day of the current month.

        Returns:
            tuple: ``(df_occ, df_info, df_info_production)``. When any of the
            three reads fails (``None``) or comes back empty, an error line is
            printed and ``(None, None, None)`` is returned instead.
        """

        # Today
        now = pd.to_datetime("today")

        # First day of the current month, date part only
        first_day = now.replace(day=1).strftime("%Y-%m-%d")

        # Stop occurrences registered this month
        query_occ = self.db_read.create_automacao_query(
            table="maquina_ocorrencia",
            where=f"data_registro >= '{first_day}'",
        )

        # Status samples; linha/fabrica come from the most recent
        # maquina_cadastro row dated on or before each sample's date.
        query_info = (
            "SELECT"
            " t1.maquina_id,"
            " (SELECT TOP 1 t2.linha FROM AUTOMACAO.dbo.maquina_cadastro t2"
            " WHERE t2.maquina_id = t1.maquina_id AND t2.data_registro <= t1.data_registro"
            " ORDER BY t2.data_registro DESC, t2.hora_registro DESC) as linha,"
            " (SELECT TOP 1 t2.fabrica FROM AUTOMACAO.dbo.maquina_cadastro t2"
            " WHERE t2.maquina_id = t1.maquina_id AND t2.data_registro <= t1.data_registro"
            " ORDER BY t2.data_registro DESC, t2.hora_registro DESC) as fabrica,"
            " t1.status,"
            " t1.turno,"
            " t1.contagem_total_ciclos,"
            " t1.contagem_total_produzido,"
            " t1.data_registro,"
            " t1.hora_registro"
            " FROM "
            " AUTOMACAO.dbo.maquina_info t1"
            f" WHERE data_registro >= '{first_day}'"
            " ORDER BY t1.data_registro DESC, t1.hora_registro DESC"
        )

        # Production counters: for each machine, shift and day keep the sample
        # whose time is closest to the shift's closing time (rn = 1). Samples
        # taken up to 00:01 are attributed to the previous day.
        query_production = (
            "WITH aux AS ("
            " SELECT"
            " t1.maquina_id,"
            " t1.turno,"
            " t1.contagem_total_ciclos,"
            " t1.contagem_total_produzido,"
            " (SELECT TOP 1 t2.linha FROM AUTOMACAO.dbo.maquina_cadastro t2"
            " WHERE t2.maquina_id = t1.maquina_id AND"
            " DATEADD(minute, -1, CAST(t2.data_registro AS DATETIME) +"
            " CAST(t2.hora_registro AS DATETIME)) <="
            " CAST(t1.data_registro AS DATETIME) + CAST(t1.hora_registro AS DATETIME)"
            " ORDER BY t2.data_registro DESC, t2.hora_registro desc) as linha,"
            " CASE"
            " WHEN CAST(t1.hora_registro AS TIME) <= '00:01'"
            " THEN DATEADD(day, -1, CAST(t1.data_registro AS DATETIME))"
            " ELSE CAST(t1.data_registro AS DATETIME)"
            " END as data_registro_aux,"
            " CAST(t1.hora_registro AS TIME) as hora_registro"
            " FROM"
            " AUTOMACAO.dbo.maquina_info t1"
            " ), aux2 AS ("
            " SELECT *,"
            " ROW_NUMBER() OVER (PARTITION BY maquina_id, turno, CAST(data_registro_aux AS DATE)"
            " ORDER BY ABS(DATEDIFF(minute, hora_registro,"
            " CASE turno WHEN 'NOT' THEN '07:59:59'"
            " WHEN 'MAT' THEN '15:59:59'"
            " WHEN 'VES' THEN '23:59:59' END))) as rn"
            " FROM aux"
            " )"
            " SELECT"
            " maquina_id,"
            " linha,"
            " turno,"
            " contagem_total_ciclos as total_ciclos,"
            " contagem_total_produzido as total_produzido,"
            " CAST(data_registro_aux AS DATE) as data_registro,"
            " hora_registro"
            " FROM"
            " aux2"
            f" WHERE rn = 1 AND data_registro_aux >= '{first_day}'  "
            " ORDER BY data_registro DESC, linha, hora_registro DESC"
        )

        print("========== Baixando dados do DB ==========")

        # Run the three reads
        df_occ = self.db_read.get_automacao_data(query_occ)
        df_info = self.db_read.get_automacao_data(query_info)
        df_info_production = self.db_read.get_automacao_data(query_production)

        # The reader returns None when a query fails, so test for None before
        # touching ``.empty``; otherwise a failed read raises AttributeError
        # instead of being reported.
        frames = (df_occ, df_info, df_info_production)
        if any(frame is None or frame.empty for frame in frames):
            print("====== Erro na leitura dos dados ======")
            return None, None, None

        print("Ok...")

        return df_occ, df_info, df_info_production


# Run the loader once for this session; each name is None when a read failed
# or returned no rows.
get_data = GetData()
df_occ, df_info, df_info_production = get_data.get_data()



Ok...


## Testes de saída do banco de dados


In [54]:
# Preview the first 20 occurrence rows.
df_occ.head(20)

Unnamed: 0,recno,maquina_id,motivo_id,problema,solucao,data_registro,hora_registro,usuario_id
0,3373,TMF005,3,,,2024-02-01,02:58:31,441
1,3374,TMF002,3,,,2024-02-01,02:58:41,441
2,3375,TMF015,3,,,2024-02-01,02:58:51,441
3,3376,TMF011,3,,,2024-02-01,02:59:00,441
4,3377,TMF014,3,,,2024-02-01,02:59:11,441
5,3378,TMF009,3,,,2024-02-01,02:59:22,441
6,3379,TMF003,3,,,2024-02-01,02:59:31,441
7,3380,TMF001,3,,,2024-02-01,02:59:42,441
8,3381,TMF013,3,,,2024-02-01,04:26:25,453
9,3382,TMF007,3,,,2024-02-01,04:26:33,453


In [55]:
# Preview the first 20 machine status rows.
df_info.head(20)

Unnamed: 0,maquina_id,linha,fabrica,status,turno,contagem_total_ciclos,contagem_total_produzido,data_registro,hora_registro
0,TMF009,14,2,False,VES,0.0,0.0,2024-02-24,16:24:39.173333
1,TMF007,13,2,False,VES,0.0,0.0,2024-02-24,16:24:38.170000
2,TMF012,12,2,False,VES,0.0,0.0,2024-02-24,16:24:37.166666
3,TMF013,11,2,False,VES,0.0,0.0,2024-02-24,16:24:36.166666
4,TMF008,10,2,False,VES,0.0,0.0,2024-02-24,16:24:35.166666
5,TMF009,14,2,False,VES,0.0,0.0,2024-02-24,16:22:39.163333
6,TMF007,13,2,False,VES,0.0,0.0,2024-02-24,16:22:38.163333
7,TMF012,12,2,False,VES,0.0,0.0,2024-02-24,16:22:37.160000
8,TMF013,11,2,False,VES,0.0,0.0,2024-02-24,16:22:36.163333
9,TMF008,10,2,False,VES,0.0,0.0,2024-02-24,16:22:35.160000


In [56]:
# Preview the first 20 per-shift production rows.
df_info_production.head(20)

Unnamed: 0,maquina_id,linha,turno,total_ciclos,total_produzido,data_registro,hora_registro
0,TMF005,1,NOT,0.0,0.0,2024-02-24,06:50:55.376666
1,TMF002,2,NOT,0.0,0.0,2024-02-24,06:50:56.376666
2,TMF015,3,NOT,0.0,0.0,2024-02-24,06:50:57.380000
3,TMF011,4,NOT,0.0,0.0,2024-02-24,06:50:58.383333
4,TMF003,5,NOT,0.0,0.0,2024-02-24,06:50:59.383333
5,TMF001,6,NOT,0.0,0.0,2024-02-24,06:51:00.383333
6,TMF006,7,NOT,0.0,0.0,2024-02-24,06:51:01.386666
7,TMF014,8,NOT,0.0,0.0,2024-02-24,06:51:02.386666
8,TMF004,9,NOT,0.0,0.0,2024-02-24,06:51:03.386666
9,TMF008,10,VES,0.0,0.0,2024-02-24,16:24:35.166666


# Limpeza de dados e análise exploratória


## Análise de dados - Clean Data


In [65]:
# service/clean_data.py


# cSpell: disable=invalid-name
class CleanData:
    """
    Clean and reshape the raw machine tables before analysis.

    Every method returns a new DataFrame; the DataFrame passed in (and the
    module-level holiday table) is left untouched.
    """

    def maq_info(self, info: pd.DataFrame) -> pd.DataFrame:
        """
        Collapse the raw machine status samples into one row per status period.

        Args:
            info (pd.DataFrame): Raw samples with the columns maquina_id,
                linha, fabrica, status, turno, contagem_total_ciclos,
                contagem_total_produzido, data_registro and hora_registro.

        Returns:
            pd.DataFrame: One row per machine status period with its start
            (data_hora_registro), end (data_hora_final) and duration in
            minutes (tempo_registro_min).
        """

        # Sort so consecutive rows belong to the same machine in time order
        df_info = info.sort_values(by=["maquina_id", "data_registro", "hora_registro", "turno"])

        # Join date and time into one text column (fractional seconds dropped)
        df_info["data_hora_registro"] = (
            df_info["data_registro"].astype(str)
            + " "
            + df_info["hora_registro"].astype(str).str.split(".").str[0]
        )

        # Relabel the first row of each machine as NOT when it is tagged VES
        mask = (df_info["turno"] == "VES") & (
            df_info["maquina_id"] != df_info["maquina_id"].shift()
        )
        df_info["turno"] = np.where(mask, "NOT", df_info["turno"])

        # Parse the joined text as datetime
        df_info["data_hora_registro"] = pd.to_datetime(df_info["data_hora_registro"])

        # VES samples stamped between 00:00:00 and 00:05:00 (exclusive) are moved
        # to 23:59:59 of the previous day
        mask = (
            (df_info["turno"] == "VES")
            & (df_info["data_hora_registro"] != df_info["data_hora_registro"].shift())
            & (df_info["data_hora_registro"].dt.time > time(0, 0, 0))
            & (df_info["data_hora_registro"].dt.time < time(0, 5, 0))
        )
        df_info["data_hora_registro"] = np.where(
            mask,
            (df_info["data_hora_registro"] - pd.Timedelta(days=1)).dt.normalize()
            + pd.Timedelta(hours=23, minutes=59, seconds=59),
            df_info["data_hora_registro"],
        )

        # Flag rows where status, machine or shift differs from the previous row
        df_info["status_change"] = df_info["status"].ne(df_info["status"].shift())
        df_info["maquina_change"] = df_info["maquina_id"].ne(df_info["maquina_id"].shift())
        df_info["turno_change"] = df_info["turno"].ne(df_info["turno"].shift())

        # A "change" is any of the three flags above
        df_info["change"] = (
            df_info["status_change"] | df_info["maquina_change"] | df_info["turno_change"]
        )

        # Timestamp of the row where a change happens; NaT on every other row
        df_info["change_time"] = (
            df_info.groupby("maquina_id")["data_hora_registro"].shift(0).where(df_info["change"])
        )

        # One row per (machine, change_time).
        # NOTE(review): rows whose change_time is NaT are excluded by groupby's
        # default dropna=True, so each group presumably holds only the change
        # row itself and the "last" counters equal the "first" ones -- confirm
        # whether change_time was meant to be forward-filled first.
        df_info = (
            df_info.groupby(["maquina_id", "change_time"])
            .agg(
                status=("status", "first"),
                turno=("turno", "first"),
                linha=("linha", "first"),
                fabrica=("fabrica", "first"),
                data_hora_registro=("data_hora_registro", "first"),
                contagem_total_ciclos=("contagem_total_ciclos", "last"),
                contagem_total_produzido=(
                    "contagem_total_produzido",
                    "last",
                ),
                change=("change", "first"),
                maquina_change=("maquina_change", "first"),
            )
            .reset_index()
        )

        # End of a period = start of the next period of the same machine
        df_info["data_hora_final"] = (
            df_info.groupby("maquina_id")["data_hora_registro"]
            .shift(-1)
            .where(~df_info["maquina_change"])
        )

        # Rows flagged maquina_change take the next row's change_time instead
        mask = df_info["maquina_change"]
        df_info["data_hora_final"] = np.where(
            mask, df_info["change_time"].shift(-1), df_info["data_hora_final"]
        )

        # Drop the helper columns
        df_info.drop(
            columns=[
                "maquina_change",
                "change",
                "change_time",
            ],
            inplace=True,
        )

        # Periods without an end timestamp cannot be measured
        df_info.dropna(subset=["data_hora_final"], inplace=True)

        # Duration of each period in minutes
        df_info["tempo_registro_min"] = (
            pd.to_datetime(df_info["data_hora_final"])
            - pd.to_datetime(df_info["data_hora_registro"])
        ).dt.total_seconds() / 60

        # Round to whole minutes
        df_info["tempo_registro_min"] = df_info["tempo_registro_min"].round(0).astype(int)

        # Fix column types
        df_info = df_info.astype(
            {
                "maquina_id": "category",
                "status": "category",
                "turno": "category",
                "linha": "category",
                "fabrica": "category",
                "tempo_registro_min": int,
                "contagem_total_ciclos": int,
                "contagem_total_produzido": int,
            }
        )

        # Rename the status values: "true" periods shorter than 10 minutes
        # become "in_test", other "true" become "rodando", "false" becomes "parada"
        df_info["status"] = np.where(
            (df_info["status"] == "true") & (df_info["tempo_registro_min"] < 10),
            "in_test",
            df_info["status"],
        )
        df_info["status"] = np.where(df_info["status"] == "true", "rodando", df_info["status"])
        df_info["status"] = np.where(df_info["status"] == "false", "parada", df_info["status"])

        # np.where returned plain arrays, so restore the categorical type
        df_info["status"] = df_info["status"].astype("category")

        # Renumber the index
        df_info.reset_index(drop=True, inplace=True)

        return df_info

    def get_time_working(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Return the running ("rodando") minutes per line, shift and day.

        Args:
            data (pd.DataFrame): Raw machine status samples, as accepted by
                ``maq_info``.

        Returns:
            pd.DataFrame: Columns linha, turno, motivo_nome, data_registro and
            tempo_registro_min.
        """

        info = self.maq_info(data)

        df_info_rodando = info[info["status"] == "rodando"]

        # Sum the running minutes per line, shift, status and calendar day
        df_info_rodando = (
            df_info_rodando.groupby(
                ["linha", "turno", "status", df_info_rodando["data_hora_registro"].dt.date],
                observed=False,
            )
            .agg(tempo_registro_min=("tempo_registro_min", "sum"))
            .reset_index()
        )

        # Keep only groups with a positive running time (zero is dropped too)
        df_info_rodando = df_info_rodando[(df_info_rodando["tempo_registro_min"] > 0)]

        # Drop line 0
        df_info_rodando = df_info_rodando[df_info_rodando["linha"] != 0]

        # Rename columns (non-inplace so the filtered frame is not written to)
        df_info_rodando = df_info_rodando.rename(
            columns={
                "status": "motivo_nome",
                "data_hora_registro": "data_registro",
            }
        )

        # Capitalise the reason name
        df_info_rodando["motivo_nome"] = df_info_rodando["motivo_nome"].str.capitalize()

        return df_info_rodando

    def get_adjusted_stops_data(self, info: pd.DataFrame) -> pd.DataFrame:
        """
        Merge consecutive non-running periods of a machine into single stops.

        Consecutive rows are merged while the running/not-running state,
        machine, shift and calendar day all stay the same. "in_test" periods
        count as not running and are reported as "parada".

        Args:
            info (pd.DataFrame): Status periods as returned by ``maq_info``.
                The frame is not modified.

        Returns:
            pd.DataFrame: One row per merged period.
        """
        # Parse both timestamps on a new frame so the caller's data is untouched
        df_info = info.assign(
            data_hora_registro=pd.to_datetime(info["data_hora_registro"]),
            data_hora_final=pd.to_datetime(info["data_hora_final"]),
        )

        # Sort by machine and start time
        df_info = df_info.sort_values(by=["maquina_id", "data_hora_registro"])

        # Helper column: 1 while the machine is running, else 0
        df_info["rodando"] = np.where(df_info["status"] == "rodando", 1, 0)

        # A new group starts whenever running state, machine, shift or day changes
        df_info["group"] = (
            (df_info["rodando"] != df_info["rodando"].shift())
            | (df_info["maquina_id"] != df_info["maquina_id"].shift())
            | (df_info["turno"] != df_info["turno"].shift())
            | (
                df_info["data_hora_registro"].dt.date
                != df_info["data_hora_registro"].shift().dt.date
            )
        ).cumsum()

        # Aggregate each group into one row
        df_info = (
            df_info.groupby(["group"])
            .agg(
                maquina_id=("maquina_id", "first"),
                status=("status", "first"),
                turno=("turno", "first"),
                linha=("linha", "first"),
                fabrica=("fabrica", "first"),
                data_hora_registro=("data_hora_registro", "first"),
                data_hora_final=("data_hora_final", "last"),
                tempo_registro_min=("tempo_registro_min", "sum"),
                contagem_total_ciclos=("contagem_total_ciclos", "last"),
                contagem_total_produzido=("contagem_total_produzido", "last"),
            )
            .reset_index(drop=True)
        )

        # Report in_test as parada
        df_info["status"] = np.where(df_info["status"] == "in_test", "parada", df_info["status"])

        # Normalise missing values to np.nan
        df_info.fillna(value=np.nan, inplace=True)

        return df_info

    def dayofweek_adjust(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add 0/1 columns flagging Saturdays, Sundays and holidays.

        Holidays come from the "feriados" column of the module-level FERIADOS
        table, which is only read here.

        Args:
            df (pd.DataFrame): Stop data with a data_hora_registro column.
                The frame is not modified.

        Returns:
            pd.DataFrame: The data with sabado, domingo and feriado added,
            line 0 removed and columns in a fixed order.
        """

        # Parse the timestamp on a new frame so the caller's data is untouched
        df = df.assign(data_hora_registro=pd.to_datetime(df["data_hora_registro"]))

        # Saturday (dayofweek 5)
        df["sabado"] = np.where(df["data_hora_registro"].dt.dayofweek == 5, 1, 0)

        # Sunday (dayofweek 6)
        df["domingo"] = np.where(df["data_hora_registro"].dt.dayofweek == 6, 1, 0)

        # Holiday dates, parsed without writing back into the shared table
        holiday_dates = pd.to_datetime(FERIADOS["feriados"]).dt.date

        # Holiday flag
        df["feriado"] = df["data_hora_registro"].dt.date.isin(holiday_dates).astype(int)

        # Sort by line and start time
        df = df.sort_values(by=["linha", "data_hora_registro"])

        # Drop line 0
        df = df[df["linha"] != 0]

        # Fixed column order
        df = df.reindex(
            columns=[
                "fabrica",
                "linha",
                "maquina_id",
                "status",
                "turno",
                "tempo_registro_min",
                "contagem_total_produzido",
                "contagem_total_ciclos",
                "data_hora_registro",
                "data_hora_final",
                "sabado",
                "domingo",
                "feriado",
            ]
        )

        # Renumber the index
        return df.reset_index(drop=True)

    def get_maq_info_cleaned(self, df_info: pd.DataFrame) -> pd.DataFrame:
        """
        Run the full status pipeline on the raw machine samples.

        Applies ``maq_info``, ``get_adjusted_stops_data`` and
        ``dayofweek_adjust`` in that order.

        Args:
            df_info (pd.DataFrame): Raw machine status samples.

        Returns:
            pd.DataFrame: Merged status periods with weekend/holiday flags.
        """

        # Collapse samples into status periods
        df_info = self.maq_info(df_info)

        # Merge consecutive stop periods
        df_info = self.get_adjusted_stops_data(df_info)

        # Add Saturday, Sunday and holiday flags
        df_info = self.dayofweek_adjust(df_info)

        return df_info

    def get_maq_occ_cleaned(self, df_occ: pd.DataFrame) -> pd.DataFrame:
        """
        Clean the stop occurrence records.

        Args:
            df_occ (pd.DataFrame): Raw occurrences with the columns
                maquina_id, motivo_id, problema, solucao, data_registro,
                hora_registro and usuario_id.

        Returns:
            pd.DataFrame: Occurrences with a motivo_nome column, a combined
            data_hora_registro timestamp and adjusted column types.
        """

        # Stop reasons by id
        motivos = {
            1: "Ajustes",
            2: "Troca de Bobina",
            3: "Refeição",
            4: "Reunião",
            5: "Café e Ginástica Laboral",
            6: "Limpeza",
            7: "Manutenção Elétrica",
            8: "Manutenção Mecânica",
            9: "Material em Falta",
            10: "Setup de Sabor",
            11: "Setup de Tamanho",
            12: "Parada Programada",
            13: "Intervenção de Qualidade",
            14: "Linha Cheia",
            15: "Treinamento",
            16: "Limpeza Industrial",
            17: "Troca de Filme",
        }

        # motivo_id as int; astype returns a new frame, so the input is not modified
        df_occ = df_occ.astype({"motivo_id": int})

        # Join date and time (fractional seconds dropped)
        df_occ["data_hora_registro"] = (
            df_occ["data_registro"].astype(str)
            + " "
            + df_occ["hora_registro"].astype(str).str.split(".").str[0]
        )

        # Parse as datetime
        df_occ["data_hora_registro"] = pd.to_datetime(df_occ["data_hora_registro"])

        # Reason name looked up from the dictionary above
        df_occ["motivo_nome"] = df_occ["motivo_id"].map(motivos)

        # Treat empty "problema"/"solucao" strings as missing
        df_occ["problema"] = np.where(df_occ["problema"] == "", np.nan, df_occ["problema"])
        df_occ["solucao"] = np.where(df_occ["solucao"] == "", np.nan, df_occ["solucao"])

        # Use the reason name as the problem text when the problem is missing,
        # except for reasons 1, 7, 8, 9 and 14
        df_occ["problema"] = np.where(
            (df_occ["problema"].isnull()) & (~df_occ["motivo_id"].isin([1, 7, 8, 9, 14])),
            df_occ["motivo_nome"],
            df_occ["problema"],
        )

        # Column order and types
        df_occ = df_occ.reindex(
            columns=[
                "maquina_id",
                "motivo_id",
                "motivo_nome",
                "problema",
                "solucao",
                "data_hora_registro",
                "usuario_id",
            ]
        )
        df_occ = df_occ.astype(
            {
                "maquina_id": "category",
                "motivo_id": int,
                "motivo_nome": "category",
                "problema": str,
                "solucao": "category",
                "data_hora_registro": "datetime64[ns]",
                "usuario_id": "category",
            }
        )

        return df_occ

    def get_maq_production_cleaned(self, df_production: pd.DataFrame) -> pd.DataFrame:
        """
        Sort and type the per-shift production counters.

        Args:
            df_production (pd.DataFrame): Production rows with the columns
                maquina_id, linha, turno, total_ciclos, total_produzido and
                data_registro. The frame is not modified.

        Returns:
            pd.DataFrame: Rows ordered by line, date and shift (NOT, MAT,
            VES), with line 0 removed.
        """

        # Numeric shift order used only for sorting
        turno_order = {"MAT": 2, "VES": 3, "NOT": 1}

        # Sort on a new frame so the caller's data keeps its order and columns
        df_sorted = (
            df_production.assign(turno_number=df_production["turno"].map(turno_order))
            .sort_values(by=["linha", "data_registro", "turno_number"])
            .drop(columns=["turno_number"])
        )

        # Fix column types
        df_sorted = df_sorted.astype(
            {
                "maquina_id": "category",
                "linha": "category",
                "turno": "category",
                "total_ciclos": int,
                "total_produzido": int,
                "data_registro": "datetime64[ns]",
            }
        )

        # Drop line 0
        df_sorted = df_sorted[df_sorted["linha"] != 0]

        # Renumber the index
        return df_sorted.reset_index(drop=True)


# Shared cleaner instance used by the cells below.
clean_data = CleanData()

## Análise de dados - Retorno de Dados


In [66]:
# Run maq_info on a copy of the raw samples and preview the first 28 rows.
df_info_clean = clean_data.maq_info(df_info.copy())
df_info_clean.head(28)

Unnamed: 0,maquina_id,status,turno,linha,fabrica,data_hora_registro,contagem_total_ciclos,contagem_total_produzido,data_hora_final,tempo_registro_min
0,TMF001,rodando,NOT,9,1,2024-02-01 00:01:38,12,12,2024-02-01 01:57:38,116
1,TMF001,parada,NOT,9,1,2024-02-01 01:57:38,2276,2266,2024-02-01 01:59:38,2
2,TMF001,rodando,NOT,9,1,2024-02-01 01:59:38,2304,2292,2024-02-01 02:57:38,58
3,TMF001,parada,NOT,9,1,2024-02-01 02:57:38,3462,3436,2024-02-01 04:03:38,66
4,TMF001,rodando,NOT,9,1,2024-02-01 04:03:38,3476,3448,2024-02-01 05:59:39,116
5,TMF001,parada,NOT,9,1,2024-02-01 05:59:39,5738,5702,2024-02-01 06:09:39,10
6,TMF001,rodando,NOT,9,1,2024-02-01 06:09:39,5766,5730,2024-02-01 07:51:39,102
7,TMF001,parada,NOT,9,1,2024-02-01 07:51:39,7766,7680,2024-02-01 07:53:39,2
8,TMF001,in_test,NOT,9,1,2024-02-01 07:53:39,7780,7692,2024-02-01 08:01:39,8
9,TMF001,rodando,MAT,9,1,2024-02-01 08:01:39,14,14,2024-02-01 09:29:39,88


In [67]:
# Same call as the previous cell: maq_info on a copy of the raw samples.
df_info_clean = clean_data.maq_info(df_info.copy())
df_info_clean.head(28)

Unnamed: 0,maquina_id,status,turno,linha,fabrica,data_hora_registro,contagem_total_ciclos,contagem_total_produzido,data_hora_final,tempo_registro_min
0,TMF001,rodando,NOT,9,1,2024-02-01 00:01:38,12,12,2024-02-01 01:57:38,116
1,TMF001,parada,NOT,9,1,2024-02-01 01:57:38,2276,2266,2024-02-01 01:59:38,2
2,TMF001,rodando,NOT,9,1,2024-02-01 01:59:38,2304,2292,2024-02-01 02:57:38,58
3,TMF001,parada,NOT,9,1,2024-02-01 02:57:38,3462,3436,2024-02-01 04:03:38,66
4,TMF001,rodando,NOT,9,1,2024-02-01 04:03:38,3476,3448,2024-02-01 05:59:39,116
5,TMF001,parada,NOT,9,1,2024-02-01 05:59:39,5738,5702,2024-02-01 06:09:39,10
6,TMF001,rodando,NOT,9,1,2024-02-01 06:09:39,5766,5730,2024-02-01 07:51:39,102
7,TMF001,parada,NOT,9,1,2024-02-01 07:51:39,7766,7680,2024-02-01 07:53:39,2
8,TMF001,in_test,NOT,9,1,2024-02-01 07:53:39,7780,7692,2024-02-01 08:01:39,8
9,TMF001,rodando,MAT,9,1,2024-02-01 08:01:39,14,14,2024-02-01 09:29:39,88


In [68]:
# Run the full status pipeline (maq_info, get_adjusted_stops_data,
# dayofweek_adjust) on a copy of the raw samples and preview the first 28 rows.
df_info_cleaned = clean_data.get_maq_info_cleaned(df_info.copy())
df_info_cleaned.head(28)

Unnamed: 0,fabrica,linha,maquina_id,status,turno,tempo_registro_min,contagem_total_produzido,contagem_total_ciclos,data_hora_registro,data_hora_final,sabado,domingo,feriado
0,1,1,TMF005,parada,NOT,2,0,0,2024-02-01 00:01:30,2024-02-01 00:03:30,0,0,0
1,1,1,TMF005,rodando,NOT,94,28,28,2024-02-01 00:03:30,2024-02-01 01:37:30,0,0,0
2,1,1,TMF005,parada,NOT,2,1988,1996,2024-02-01 01:37:30,2024-02-01 01:39:30,0,0,0
3,1,1,TMF005,rodando,NOT,80,2024,2032,2024-02-01 01:39:30,2024-02-01 02:59:30,0,0,0
4,1,1,TMF005,parada,NOT,64,3690,3712,2024-02-01 02:59:30,2024-02-01 04:03:30,0,0,0
5,1,1,TMF005,rodando,NOT,64,3722,3744,2024-02-01 04:03:30,2024-02-01 05:07:31,0,0,0
6,1,1,TMF005,parada,NOT,2,5078,5100,2024-02-01 05:07:31,2024-02-01 05:09:31,0,0,0
7,1,1,TMF005,rodando,NOT,46,5104,5128,2024-02-01 05:09:31,2024-02-01 05:55:31,0,0,0
8,1,1,TMF005,parada,NOT,8,6052,6080,2024-02-01 05:55:31,2024-02-01 06:03:31,0,0,0
9,1,1,TMF005,rodando,NOT,118,6056,6084,2024-02-01 06:03:31,2024-02-01 08:01:31,0,0,0
