In [1]:
import pandas as pd
import os
import csv

import warnings
warnings.simplefilter("ignore")

`path_23`: Caminho para a pasta com conteúdos da primeira exportação (período de 1 ano)

`path_24`: Caminho para a pasta com conteúdos da última exportação (período de 3 meses)

`path_export`: Caminho de saída de arquivos

Demais variáveis são combinações de strings para referenciar caminhos de arquivos

In [2]:
# Folders holding the raw files of each export: the first one covers a
# one-year period, the second one the following three months.
path_23 = os.path.join(os.getcwd(), "data", "raw", "01_23-01_24")
path_24 = os.path.join(os.getcwd(), "data", "raw", "01_24-03_24")

# Files of the first export. Every path is built with os.path.join so the
# separator is always the one the operating system expects.
path_concorrentes_23 = os.path.join(
    path_23, "soujunior_extração_concorrentes_01_23-01_24.csv"
)
path_conteudo_23 = os.path.join(
    path_23, "soujunior_extração_conteúdo_01_23-01_24.xlsx"
)
path_seguidores_23 = os.path.join(
    path_23, "soujunior_extração_seguidores_01_23-01_24.xls"
)
path_visitantes_23 = os.path.join(path_23, "soujunior_extração_visitantes.xlsx")

# Files of the last export.
path_concorrentes_24 = os.path.join(path_24, "concorrentes_01_24-03_24.xlsx")
path_conteudo_24 = os.path.join(path_24, "conteudo_01_24-03_24.xls")
path_seguidores_24 = os.path.join(path_24, "seguidores_01_24-03_24.xls")
path_visitantes_24 = os.path.join(path_24, "visitantes_01_24-03_24.xls")

# Output folder for the processed CSV files.
path_export = os.path.join(os.getcwd(), "data", "processed")


`ultima_data_23`: Última data da primeira exportação (período de 1 ano)

`ultima_data_24`: Última data da última exportação (período de 3 meses)

`concat_dfs`: Função para concatenar dataframes.

    Args:
        df1, df2: Dataframes a serem concatenados
        drop_duplicates: Nome da coluna usada para remover linhas duplicadas (False para não remover)
        sort_values: Nome da coluna usada para ordenar os dados ("Date" ordena de forma decrescente)
        date1, date2: Datas inseridas em uma coluna "Date" quando os dataframes ainda não a possuem
    
`english_cols`: Dict com nome da categoria e colunas em inglês

`transform_data`: Realiza o tratamento de dados de um dataframe
    
    Args:
        df: Dataframe
        category: string de referência para o tratamento de dados


In [3]:
# functions
# Reference date of each export: the last value of the "Data" column found in
# the first sheet of the corresponding followers workbook, stored as a string.
seguidores_novos_23 = pd.read_excel(path_seguidores_23, sheet_name=0)
ultima_data_23 = str(seguidores_novos_23["Data"].iloc[-1])

seguidores_novos_24 = pd.read_excel(path_seguidores_24, sheet_name=0)
ultima_data_24 = str(seguidores_novos_24["Data"].iloc[-1])

print("ultima_data_23: " + ultima_data_23)
print("ultima_data_24: " + ultima_data_24)


def concat_dfs(
    df1,
    df2,
    drop_duplicates=False,
    sort_values="Date",
    date1=ultima_data_23,
    date2=ultima_data_24,
):
    """Concatenate two dataframes into one frame with a datetime "Date" column.

    Args:
        df1, df2: Dataframes to stack (df1 first). They are not modified.
        drop_duplicates: Column name (or list/tuple of column names) used to
            drop duplicated rows; a falsy value keeps every row. The first
            occurrence is kept, so on a tie the row coming from df1 wins.
        sort_values: Column name(s) used to sort the result. Sorting is
            descending when this is exactly "Date" and ascending otherwise.
        date1, date2: Values written to a new leading "Date" column of df1 and
            df2 respectively, for each frame that has no "Date" column yet.

    Returns:
        The concatenated, de-duplicated and sorted dataframe.
    """
    frames = []
    for frame, export_date in ((df1, date1), (df2, date2)):
        # Work on a copy so the caller's dataframe is left untouched.
        frame = frame.copy()
        # Each frame is checked on its own: inserting into a frame that
        # already has "Date" raises, and skipping one that lacks it would
        # leave missing dates after the concatenation.
        if "Date" not in frame.columns:
            frame.insert(0, "Date", export_date)
        frames.append(frame)

    df = pd.concat(frames)
    df["Date"] = pd.to_datetime(df["Date"])

    if drop_duplicates:
        if isinstance(drop_duplicates, (list, tuple)):
            subset = list(drop_duplicates)
        else:
            subset = [drop_duplicates]
        df = df.drop_duplicates(subset=subset)

    # Most recent rows first when sorting by date, ascending for anything else.
    ascending = False if sort_values == "Date" else True
    df = df.sort_values(by=sort_values, ascending=ascending)

    return df


# def read_and_merge_dfs(df1, df2, date1=ultima_data_23, date2=ultima_data_24):
#     df1.insert(0, "Date", date1)
#     df2.insert(0, "Date", date2)

#     df_merged = pd.merge(left=df1, right=df2, how="outer")
#     df_merged = df_merged.sort_values(by=df_merged.columns[-1], ascending=False)
#     return df_merged


# First-column label of every demographic breakdown, keyed by the suffix used
# in the category names ("followers_<suffix>" / "visitors_<suffix>").
_DEMOGRAPHIC_LABELS = {
    "location": "Location",
    "function": "Function",
    "experience": "Experience Level",
    "industry": "Industry",
    "company_size": "Company Size",
}

# The visitor metrics columns are every section below combined with every
# device, in this order.
_VISITOR_SECTIONS = (
    "Page Views Overview",
    "Unique Visitors Overview",
    "Page Views Day by Day",
    "Unique Visitors Day by Day",
    "Page Views Jobs",
    "Unique Visitors Jobs",
    "Total Page Views",
    "Total Unique Visitors",
)
_VISITOR_DEVICES = ("Desktop", "Mobile Devices", "Total")

# Category name -> English column names, in the positional order in which they
# are assigned to a dataframe's columns.
english_cols = {
    "content_metrics": [
        "Date",
        "Impressions (organic)",
        "Impressions (sponsored)",
        "Impressions",
        "Unique impressions (organic)",
        "Clicks (organic)",
        "Clicks (sponsored)",
        "Clicks",
        "Reactions (organic)",
        "Reactions (sponsored)",
        "Reactions",
        "Comments (organic)",
        "Comments (sponsored)",
        "Comments",
        "Shares (organic)",
        "Shares (sponsored)",
        "Shares",
        "Engagement rate (organic)",
        "Engagement rate (sponsored)",
        "Engagement rate",
    ],
    "content_posts": [
        "Post Title",
        "Post Link",
        "Post Type",
        "Campaign Name",
        "Published by",
        "Date",
        "Campaign Start Date",
        "Campaign End Date",
        "Audience",
        "Impressions",
        "Views (excluding off-site video views)",
        "Off-site Views",
        "Clicks",
        "Click-Through Rate (CTR)",
        "Likes",
        "Comments",
        "Shares",
        "Followers",
        "Engagement Rate",
        "Content Type",
    ],
    "followers_new": [
        "Date",
        "Followers Sponsored",
        "Followers Organic",
        "Total Followers",
    ],
    **{
        f"followers_{suffix}": [label, "Total Followers"]
        for suffix, label in _DEMOGRAPHIC_LABELS.items()
    },
    "visitors_metrics": ["Date"]
    + [
        f"{section} ({device})"
        for section in _VISITOR_SECTIONS
        for device in _VISITOR_DEVICES
    ],
    **{
        f"visitors_{suffix}": [label, "Total Views"]
        for suffix, label in _DEMOGRAPHIC_LABELS.items()
    },
    "competitors": [
        "Page",
        "Total Followers",
        "New Followers",
        "Total Post Engagements",
        "Total Posts",
    ],
}

def _replace_negatives_with_moving_average(series, window=3):
    """Return `series` with each negative value replaced by a moving average.

    The average is the rolling mean (over `window` rows, ending at the current
    row) of the series after negative and missing values are set to 0. Within
    the first `window - 1` rows the rolling mean is undefined, so a negative
    value there becomes NaN.
    """
    non_negative = series.where(series >= 0).fillna(0)
    moving_average = non_negative.rolling(window=window).mean()
    return series.mask(series < 0, moving_average)


def transform_data(df, category):
    """Rename the columns of `df` to English and clean it for `category`.

    Args:
        df: Dataframe whose columns are, position by position, the ones listed
            in `english_cols[category]`. It is not modified.
        category: Key of `english_cols` selecting the column names and the
            treatment to apply.

    Returns:
        A new dataframe with English column names, or 0 when `category` has no
        entry in `english_cols`.

        For "content_metrics" only "Date", "Impressions", "Clicks",
        "Reactions", "Comments", "Shares" and "Engagement Rate" are kept:
        negative interaction counts are replaced by a 3-row moving average and
        the engagement rate is recomputed as
        (Reactions + Comments + Clicks + Shares) / Impressions. The rate is
        NaN on rows where Impressions is 0.
    """
    columns = english_cols.get(category)
    if not columns:
        return 0

    # Rename on a copy so the caller's dataframe keeps its original headers.
    df = df.copy()
    df.columns = columns

    if category != "content_metrics":
        return df

    # Explicit copy: the columns below are overwritten, which must not be
    # done on a view of the renamed frame.
    metrics = df[
        ["Date", "Impressions", "Clicks", "Reactions", "Comments", "Shares"]
    ].copy()

    for column in ("Reactions", "Comments", "Shares", "Clicks"):
        metrics[column] = _replace_negatives_with_moving_average(
            metrics[column], window=3
        )

    interactions = (
        metrics["Reactions"]
        + metrics["Comments"]
        + metrics["Clicks"]
        + metrics["Shares"]
    )
    # Rows with zero impressions get a missing rate instead of a division by
    # zero.
    impressions = metrics["Impressions"].where(metrics["Impressions"] != 0)
    metrics["Engagement Rate"] = interactions / impressions

    return metrics


# def clean_and_transform(df, category):
#     if category == "content":

ultima_data_23: 01/16/2024
ultima_data_24: 03/24/2024


### Concatenação e Exportação para CSV

In [4]:
# content
# Sheet 0 of each content workbook holds the metrics, sheet 1 the posts; the
# first row of both sheets is skipped when reading.
metricas_23 = transform_data(
    pd.read_excel(path_conteudo_23, sheet_name=0, skiprows=1), "content_metrics"
)
metricas_24 = transform_data(
    pd.read_excel(path_conteudo_24, sheet_name=0, skiprows=1), "content_metrics"
)
df_conteudo_metricas = concat_dfs(metricas_23, metricas_24)

publicacoes_23 = transform_data(
    pd.read_excel(path_conteudo_23, sheet_name=1, skiprows=1), "content_posts"
)
publicacoes_24 = transform_data(
    pd.read_excel(path_conteudo_24, sheet_name=1, skiprows=1), "content_posts"
)
# A post present in both exports is kept once, identified by its link.
df_conteudo_publicacoes = concat_dfs(
    publicacoes_23, publicacoes_24, drop_duplicates="Post Link"
)

for conteudo_df, destino in (
    (df_conteudo_metricas, path_export + "/conteudo_metricas.csv"),
    (df_conteudo_publicacoes, path_export + "/conteudo_publicacoes.csv"),
):
    conteudo_df.to_csv(destino, index=False, quoting=csv.QUOTE_ALL)

In [5]:
# followers
def _followers_sheet(sheet_index, category, *concat_args):
    """Load one sheet from both followers workbooks and concatenate them.

    Each sheet is read, passed through `transform_data` with `category`, and
    the two results are handed to `concat_dfs` together with `concat_args`.
    """
    return concat_dfs(
        transform_data(
            pd.read_excel(path_seguidores_23, sheet_name=sheet_index), category
        ),
        transform_data(
            pd.read_excel(path_seguidores_24, sheet_name=sheet_index), category
        ),
        *concat_args,
    )


# The extra "Date" argument is concat_dfs' `drop_duplicates`: a date present
# in both exports is kept only once.
df_seguidores_novos = _followers_sheet(0, "followers_new", "Date")

df_seguidores_localidade = _followers_sheet(1, "followers_location")
df_seguidores_funcao = _followers_sheet(2, "followers_function")
df_seguidores_experiencia = _followers_sheet(3, "followers_experience")
df_seguidores_setor = _followers_sheet(4, "followers_industry")
df_seguidores_tamanho_empresa = _followers_sheet(5, "followers_company_size")

for seguidores_df, destino in (
    (df_seguidores_novos, path_export + "/seguidores_novos.csv"),
    (df_seguidores_localidade, path_export + "/seguidores_localidade.csv"),
    (df_seguidores_funcao, path_export + "/seguidores_funcao.csv"),
    (df_seguidores_experiencia, path_export + "/seguidores_experiencia.csv"),
    (df_seguidores_setor, path_export + "/seguidores_setor.csv"),
    (df_seguidores_tamanho_empresa, path_export + "/seguidores_tamanho_empresa.csv"),
):
    seguidores_df.to_csv(destino, index=False, quoting=csv.QUOTE_ALL)

In [6]:
# visitors
def _visitors_sheet(sheet_index, category, **concat_kwargs):
    """Load one sheet from both visitors workbooks and concatenate them.

    Each sheet is read, passed through `transform_data` with `category`, and
    the two results are handed to `concat_dfs` together with `concat_kwargs`.
    """
    return concat_dfs(
        transform_data(
            pd.read_excel(path_visitantes_23, sheet_name=sheet_index), category
        ),
        transform_data(
            pd.read_excel(path_visitantes_24, sheet_name=sheet_index), category
        ),
        **concat_kwargs,
    )


# A date present in both exports is kept only once.
df_visitantes_metricas = _visitors_sheet(
    0, "visitors_metrics", drop_duplicates="Date"
)

# The breakdowns are read from the visitors workbooks; reading them from the
# followers workbooks would export follower counts under a "Total Views"
# header.
# NOTE(review): assumes sheets 1-5 of the visitors workbooks are location,
# function, experience, industry and company size, each with two columns --
# confirm against the raw files.
df_visitantes_localidade = _visitors_sheet(1, "visitors_location")
df_visitantes_funcao = _visitors_sheet(2, "visitors_function")
df_visitantes_experiencia = _visitors_sheet(3, "visitors_experience")
df_visitantes_setor = _visitors_sheet(4, "visitors_industry")
df_visitantes_tamanho_empresa = _visitors_sheet(5, "visitors_company_size")

for visitantes_df, destino in (
    (df_visitantes_metricas, path_export + "/visitantes_metricas.csv"),
    (df_visitantes_localidade, path_export + "/visitantes_localidade.csv"),
    (df_visitantes_funcao, path_export + "/visitantes_funcao.csv"),
    (df_visitantes_experiencia, path_export + "/visitantes_experiencia.csv"),
    (df_visitantes_setor, path_export + "/visitantes_setor.csv"),
    (df_visitantes_tamanho_empresa, path_export + "/visitantes_tamanho_empresa.csv"),
):
    visitantes_df.to_csv(destino, index=False, quoting=csv.QUOTE_ALL)

In [7]:
# competitors
# The first export is a CSV and the last one an Excel workbook; the first row
# of both is skipped when reading.
concorrentes_old = pd.read_csv(path_concorrentes_23, skiprows=1)
concorrentes_last = pd.read_excel(path_concorrentes_24, skiprows=1)

# Keep from the old export only the columns named in the last one, in the same
# order. The names must be captured before the last export is renamed.
colunas_ultima_exportacao = list(concorrentes_last.columns)
concorrentes_old = transform_data(
    concorrentes_old[colunas_ultima_exportacao], "competitors"
)
concorrentes_last = transform_data(concorrentes_last, "competitors")

concorrentes = concat_dfs(df1=concorrentes_old, df2=concorrentes_last)
concorrentes.to_csv(path_export + "/concorrentes.csv", index=False)