In [51]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype
import plotly.express as px
import seaborn as sn
import time
import os
# !pip install -U kaleido

In [3]:
data = pd.read_csv('data/arribos_bcs.csv', encoding="latin1")

In [32]:
def time_elapsed(function, *args):
    start = time.perf_counter()
    function(*args)
    end = time.perf_counter()
    print(f"Time elapsed: {end - start:0.4f} seconds")

In [5]:
def cleansing(df):
    aux = df.copy()
    # Drop duplicates
    aux.drop_duplicates(inplace=True)
    
    # Column name standarization
    aux.columns = aux.columns.str.strip() # Leading and trailing
    aux.columns = aux.columns.str.lower()
    aux.columns = aux.columns.str.replace(" ", "_")
    aux.columns
    for c in aux.columns:
        aux[c] = aux[c].replace(",", "")


    # Change to numeric and to datetime when needed
    l_num = ["peso_vivo", "peso_desembarcado", "precio", "valor"]
    l_date = ["fecha_aviso", "fecha_expedicion", "fecha_vigencia", "periodo_inicio", "periodo_fin"] # ¡revisar fechas en otro formato!
    for col in l_num:
        aux[col] = pd.to_numeric(aux[col].str.replace(",",""))
    for col in l_date:
        aux[col] = pd.to_datetime(aux[col], infer_datetime_format=True, errors='coerce')

    # Add month, year and year_month as columns
    aux["month"] = aux.periodo_fin.dt.month
    aux["year"] = aux.periodo_fin.dt.year

    aux["year_month"] = aux.apply(lambda x: str(x["year"]).replace(".0","") + "-" + str(x["month"]).replace(".0","")
                                  if (x["month"] != np.nan) and (x["year"] != np.nan)
                                  else np.nan, axis=1)

    return aux

In [7]:
df = cleansing(data)

In [39]:
def viz_distribution(df):
    if not os.path.exists('viz/distribution'):
        os.mkdir('viz/distribution')
    # Histograms and bar plots for numeric variables
    interq_range = [.25,.75]
    for col in list(df.columns):
        if is_numeric_dtype(df[col]):
            qs = [np.quantile(df[col], interq_range[0]), np.quantile(df[col], interq_range[1])]
            dist = np.quantile(df[col], interq_range[0]) - np.quantile(df[col], interq_range[1])
            fences = [qs[0] - (1.5*dist), qs[1] + (1.5*dist)]
            aux = df[(df[col] > qs[0]) & (df[col] < qs[1])]
            if is_numeric_dtype(aux[col]):
                fig = px.histogram(aux, x=col, title="Distribución de "+col)
                fig.write_image("viz/distribution/"+col+"_histogram.png") 
                fig = px.box(aux, y=col, title="Distribución de "+col+" sin outliers")
                fig.write_image("viz/distribution/"+col+"_box_plot_wo_outliers.png") 
                fig = px.box(df, y=col, title="Distribución de "+col+" con outliers")
                fig.write_image("viz/distribution/"+col+"_box_plot_w_outliers.png") 

In [58]:
def viz_time_series(df, cycle="year_month", date_column="periodo_fin", from_="2010", to_="2023", y="peso_desembarcado"):
    if not os.path.exists('viz/time_series'):
        os.mkdir('viz/time_series')
    cycle = cycle
    date_column = date_column
    aux = df.copy()
    mediana = aux[y].median()

    aux = aux[(aux[fecha] > from_) & (aux[fecha] <= to_)]
    aux = aux[aux[y] < mediana]

    aux = aux.groupby(cycle).mean()[y]
    aux = pd.DataFrame(aux).reset_index()
    
    # Time series: y as a function of time
    fig = px.bar(aux, x=cycle, y=y, 
                 title="Promedio de "+ y +" por "+cycle,
                 labels={cycle:"Fecha", y:"Promedio de peso (kg)"})
    fig.update_yaxes(range = [60,110])

    fig.write_image("viz/time_series/"+y+"_time_series_"+cycle+".png") 


In [80]:
# PENDIENTE
# Exploration of categories
def viz_categories(df, category="especie"):
    if not os.path.exists('viz/categories'):
        os.mkdir('viz/categories')

    # Análisis por nombre (debemos verificar que cada especie diferente registrada es única, mientras usaremos el nombre para agrupar):
    aux = df
    aux.drop(aux[aux["year_month"] == "nan-nan"].index, inplace = True)
    aux.anio_mes_fin = pd.to_datetime(aux.year_month)
    aux = aux[(aux[fecha] > '2010') & (aux[fecha] <= '2023')]

    aux = aux.groupby(["year_month", "nombre_principal"]).mean()["peso_desembarcado"]
    aux = pd.DataFrame(aux).reset_index().sort_values("nombre_principal").sort_values("year_month")

    l_nombre_principal = sorted(set(aux.nombre_principal))

    fig = px.line(aux, x="year_month", y="peso_desembarcado", color='nombre_principal', 
                 category_orders={"nombre_principal": l_nombre_principal},
                 title="Promedio de pesos desembarcados por especie",
                 labels={"peso_desembarcado":"Promedio peso desembarcado (kg)", "year_month":"Fecha"})
    fig.write_image("viz/categories/por_"+category+".png") 
    
    # Comentaios: Incluso con sólo 50 nombres el análisis temporal por especie se satura, debemos encontrar otra manera de 
    # analizar cada especie


In [75]:
aux = df[df["peso_desembarcado"] >= 30000]
print(aux[["nombre_lugarcaptura", "nombre_sitio_desembarque", "numero_embarcaciones"]])
print(sorted(set(aux.nombre_lugarcaptura)))
x_especie(aux)

               nombre_lugarcaptura nombre_sitio_desembarque  \
27221                     ENSENADA                 ENSENADA   
31991  SAN ISIDRO (EJIDO ERENDIRA)        PUERTO SAN ISIDRO   
50045                   LA CHORERA              BAHIA FALSA   
50547                   LA CHORERA              BAHIA FALSA   
69183                  PUNTA CABRA              SAN QUINTIN   
...                            ...                      ...   
95319                    EL MUERTO               SAN FELIPE   
95332                    EL MUERTO               SAN FELIPE   
95333               PUNTA ESTRELLA     MUELLE DE SAN FELIPE   
95739               PUNTA ESTRELLA     MUELLE DE SAN FELIPE   
95950               PUNTA ESTRELLA     MUELLE DE SAN FELIPE   

       numero_embarcaciones  
27221                     0  
31991                    12  
50045                     0  
50547                     0  
69183                     0  
...                     ...  
95319                    18  
953



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access



In [None]:
aux = df[df["peso_desembarcado"] < 1000]
print(aux[["nombre_lugarcaptura", "nombre_sitio_desembarque", "numero_embarcaciones"]])
print(sorted(set(aux.nombre_lugarcaptura)))
x_especie(aux)

In [84]:
# Time
print("Runs every time we get new data on arrivals:")
print(time_elapsed(cleansing, data), "data shape:", data.shape,"\n")
print("Runs every time we get new data on arrivals:")
print(time_elapsed(viz_distribution, df), "data size:", data.shape,"\n")
print("Runs every time we get new data on arrivals:")
print(time_elapsed(viz_time_series, df,("year_month")),"data size:", data.shape,"\n")
print("Runs every time we get new data on arrivals:")
print(time_elapsed(viz_categories, df),"data size:", data.shape,"\n")

Runs every time we get new data on arrivals:
Time elapsed: 3.6947 seconds
None data shape: (97455, 35) 

Runs every time we get new data on arrivals:
Time elapsed: 10.5636 seconds
None data size: (97455, 35) 

Runs every time we get new data on arrivals:
Time elapsed: 0.1224 seconds
None data size: (97455, 35) 

Runs every time we get new data on arrivals:
Time elapsed: 1.1180 seconds
None data size: (97455, 35) 

