## Bibliotecas

In [0]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyspark.pandas as ps
import pyspark.sql.functions as F
import seaborn as sns
from IPython.display import Image, display
warnings.filterwarnings("ignore")

In [0]:
# Brand colours used by every chart in this notebook.
# Kept as an ordered list rather than a set: a set of strings has no stable
# iteration order, so the colour given to each category could differ from one
# kernel session to the next.
palette = ["#F72585", "#4CC9F0", "#7209B7", "#3A0CA3"]
sns.set_palette(palette)

In [0]:
# Directory where rendered figures are written.
base_path = 'figures'
# Create it up front so a later savefig() does not fail on a fresh workspace.
os.makedirs(base_path, exist_ok=True)

In [0]:
def grafico_chrun_vs_x(data: pd.DataFrame, x: str):
    """Draw a grouped bar chart of churn totals for one attribute.

    Parameters
    ----------
    data : pd.DataFrame
        Long-format frame containing the column named by ``x`` plus the
        ``'Churn'`` (bar grouping) and ``'total'`` (bar height) columns.
    x : str
        Name of the column plotted on the x axis; it is also used for the
        chart title and the x-axis label.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the chart. It is left open, so the caller decides
        whether to show, save or close it.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(data=data, x=x, y='total', hue='Churn', palette=palette, ax=ax)

    # Label the real bars only. Walking ax.patches would also pick up any
    # non-bar rectangles attached to the axes (e.g. legend proxies) and
    # would fail on a NaN height.
    for container in ax.containers:
        ax.bar_label(container, fmt='%d', padding=1, fontsize=10, color='black')

    ax.set_title(f'{x} x Churn')
    ax.set_xlabel(x)
    ax.set_ylabel("Quantidade de Churns")
    # The legend is anchored just outside the right edge of the axes.
    ax.legend(title="Churn", bbox_to_anchor=(1.05, 1), loc=2)

    return fig

## Importando dataframe

In [0]:
%sql

-- Preview up to 10 rows of telecom.silver.teleco_cleaned.
SELECT * FROM telecom.silver.teleco_cleaned
LIMIT 10;

# Análise 1 - Grupos de clientes

Vamos verificar a relação dos tipos de cliente com a variável alvo.

(gender | SeniorCitizen | Partner | Dependents)  VS  churn

## Gender x Churn

In [0]:
%sql

    -- Per gender: number of rows with churn = 'Yes' (churned) and
    -- with churn = 'No' (not_churned), ordered by gender.
    SELECT 
    gender,
    SUM(CASE WHEN churn = 'Yes' THEN 1 ELSE 0 END) AS churned,
    SUM(CASE WHEN churn = 'No' THEN 1 ELSE 0 END) AS not_churned
    FROM telecom.silver.teleco_cleaned
    GROUP BY gender
    ORDER BY gender;

In [0]:
# Per-gender row counts, one column per churn label ('Yes' / 'No'),
# collected from Spark into a pandas DataFrame.
churn_by_gender_sql = """
                               
    SELECT 
    gender,
    SUM(CASE WHEN churn = 'Yes' THEN 1 ELSE 0 END) AS Yes,
    SUM(CASE WHEN churn = 'No' THEN 1 ELSE 0 END) AS No
    FROM telecom.silver.teleco_cleaned
    GROUP BY gender
    ORDER BY gender;


"""
df_churn_by_gender = spark.sql(churn_by_gender_sql).toPandas()

Unpivot no dataframe, para conseguirmos extrair a quantidade de churn e não churn por gênero.

In [0]:
# Reshape from wide (one 'Yes' and one 'No' column) to long form
# ('Churn' label, 'total' count).
# The guard keeps the cell idempotent: the result overwrites its own input,
# so an unguarded second run would raise KeyError on the missing columns.
if {'Yes', 'No'}.issubset(df_churn_by_gender.columns):
    df_churn_by_gender = df_churn_by_gender.melt(
        id_vars='gender',
        value_vars=['Yes', 'No'],
        var_name='Churn',
        value_name='total',
    )

# Rich display renders the frame as a table instead of plain text.
display(df_churn_by_gender)

In [0]:
# Render the gender chart, save it as a PNG and display the saved file.
churn_by_gender_png = base_path + '/churn_by_gender.png'

fig_churn_gen = grafico_chrun_vs_x(df_churn_by_gender, 'gender')
# bbox_inches='tight' expands the saved canvas to fit every artist, so a
# legend positioned outside the axes is not cut off in the PNG.
fig_churn_gen.savefig(churn_by_gender_png, format='png', bbox_inches='tight')
# Close this specific figure (not merely "the current one") so it is not
# rendered again by the inline backend.
plt.close(fig_churn_gen)
display(Image(filename=churn_by_gender_png))

## SeniorCitizen x Churn

In [0]:
# Per-SeniorCitizen row counts, one column per churn label ('Yes' / 'No').
# ORDER BY pins the row order (as the gender, Partner and Dependents queries
# already do); without it the order of the returned rows is not guaranteed.
df_churn_by_senior = spark.sql("""
    SELECT SeniorCitizen,
           SUM(CASE WHEN churn = 'Yes' THEN 1 ELSE 0 END) AS Yes,
           SUM(CASE WHEN churn = 'No' THEN 1 ELSE 0 END) AS No
    FROM telecom.silver.teleco_cleaned
    GROUP BY SeniorCitizen
    ORDER BY SeniorCitizen
""").toPandas()

In [0]:
# Reshape from wide (one 'Yes' and one 'No' column) to long form
# ('Churn' label, 'total' count).
# The guard keeps the cell idempotent: the result overwrites its own input,
# so an unguarded second run would raise KeyError on the missing columns.
if {'Yes', 'No'}.issubset(df_churn_by_senior.columns):
    df_churn_by_senior = df_churn_by_senior.melt(
        id_vars='SeniorCitizen',
        value_vars=['Yes', 'No'],
        var_name='Churn',
        value_name='total',
    )


In [0]:
# Bind the returned figure to a name: left as the cell's last expression it
# would be echoed as the cell output in addition to the inline rendering.
fig_churn_senior = grafico_chrun_vs_x(df_churn_by_senior, 'SeniorCitizen')
plt.show()

## Partner x Churn

In [0]:
%sql

    -- Per Partner value: number of rows with churn = 'Yes' and with
    -- churn = 'No', ordered by Partner.
    SELECT Partner,
    SUM (CASE WHEN churn = 'Yes' THEN 1 ELSE 0 END) AS Yes,
    SUM (CASE WHEN churn = 'No' THEN 1 ELSE 0 END) AS No
    FROM telecom.silver.teleco_cleaned
    GROUP BY Partner
    ORDER BY Partner

In [0]:
# Per-Partner row counts, one column per churn label ('Yes' / 'No'),
# collected from Spark into a pandas DataFrame.
churn_by_partner_sql = """
    SELECT Partner,
    SUM (CASE WHEN churn = 'Yes' THEN 1 ELSE 0 END) AS Yes,
    SUM (CASE WHEN churn = 'No' THEN 1 ELSE 0 END) AS No
    FROM telecom.silver.teleco_cleaned
    GROUP BY Partner
    ORDER BY Partner
"""
df_churn_by_partner = spark.sql(churn_by_partner_sql).toPandas()


In [0]:
# Reshape from wide (one 'Yes' and one 'No' column) to long form
# ('Churn' label, 'total' count).
# The guard keeps the cell idempotent: the result overwrites its own input,
# so an unguarded second run would raise KeyError on the missing columns.
if {'Yes', 'No'}.issubset(df_churn_by_partner.columns):
    df_churn_by_partner = df_churn_by_partner.melt(
        id_vars='Partner',
        value_vars=['Yes', 'No'],
        var_name='Churn',
        value_name='total',
    )

In [0]:
# Bind the returned figure to a name: left as the cell's last expression it
# would be echoed as the cell output in addition to the inline rendering.
fig_churn_partner = grafico_chrun_vs_x(df_churn_by_partner, 'Partner')
plt.show()

## Dependents x Churn

In [0]:
%sql

    -- Per Dependents value: number of rows with churn = 'Yes' and with
    -- churn = 'No', ordered by Dependents.
    SELECT Dependents,
    SUM (CASE WHEN churn = 'Yes' THEN 1 ELSE 0 END) AS Yes,
    SUM (CASE WHEN churn = 'No' THEN 1 ELSE 0 END) AS No
    FROM telecom.silver.teleco_cleaned
    GROUP BY Dependents
    ORDER BY Dependents

In [0]:
# Per-Dependents row counts, one column per churn label ('Yes' / 'No'),
# collected from Spark into a pandas DataFrame.
churn_by_dep_sql = """

    SELECT Dependents,
    SUM (CASE WHEN churn = 'Yes' THEN 1 ELSE 0 END) AS Yes,
    SUM (CASE WHEN churn = 'No' THEN 1 ELSE 0 END) AS No
    FROM telecom.silver.teleco_cleaned
    GROUP BY Dependents
    ORDER BY Dependents


"""
df_churn_by_dep = spark.sql(churn_by_dep_sql).toPandas()

In [0]:
# Reshape from wide (one 'Yes' and one 'No' column) to long form
# ('Churn' label, 'total' count).
# The guard keeps the cell idempotent: the result overwrites its own input,
# so an unguarded second run would raise KeyError on the missing columns.
if {'Yes', 'No'}.issubset(df_churn_by_dep.columns):
    df_churn_by_dep = df_churn_by_dep.melt(
        id_vars='Dependents',
        value_vars=['Yes', 'No'],
        var_name='Churn',
        value_name='total',
    )

In [0]:
# Bind the returned figure to a name: left as the cell's last expression it
# would be echoed as the cell output in addition to the inline rendering.
fig_churn_dep = grafico_chrun_vs_x(df_churn_by_dep, 'Dependents')
plt.show()

In [0]:
## Churn distribution by contract type

# Number of rows with Churn = 'Yes' for each Contract value.
df_contrato = spark.sql("""
    SELECT Contract,
           SUM(CASE WHEN Churn = 'Yes' THEN 1 ELSE 0 END) AS churned
    FROM telecom.silver.teleco_cleaned
    GROUP BY Contract
""").toPandas()

# Sort the contracts from the highest to the lowest churn count.
df_churn_by_contract = df_contrato.sort_values(by="churned", ascending=False)

fig_churn_contract, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=df_churn_by_contract, x="Contract", y="churned",
            palette=palette, ax=ax)

# Write each bar's value just above it.
for container in ax.containers:
    ax.bar_label(container, fmt='%d', padding=1, fontsize=10, color='black')

ax.set_title("Número de Clientes que Deram Churn por Tipo de Contrato")
ax.set_xlabel("Tipo de Contrato")
ax.set_ylabel("Quantidade de Churns")

fig_churn_contract.tight_layout()
plt.show()