In [2]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) the Spark session used by every cell below.
# The "local[*]" master runs Spark in-process instead of against a cluster.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("test_session")
    .getOrCreate()
)
# Restrict Spark logging to ERROR so the cell outputs stay readable.
spark.sparkContext.setLogLevel("ERROR")

In [3]:
import sys, os, time, getpass
import re

import pandas as pd

sys.path.append("/home/tatiane/lib/")

import pessoal
from pessoal import *
#print('AppID: ', sc.applicationId)

Tempo inicial da execucao: 2025-10-30 10:54:20.014363
User: tatiane
Node: tatiane-Inspiron-3583


## Leitura

In [4]:
# Input files: the raw extract and the refined (cleaned/transformed) version.
csv_path = "/home/tatiane/data-projects/01_data_cleaning/et_rh/raw/data/database_rh.csv"
csv_path2 = "/home/tatiane/data-projects/01_data_cleaning/et_rh/refined/data/clean_transformed_rh.csv"

# Both files are read identically: first row is the header, types are inferred.
csv_read_options = {"header": True, "inferSchema": True}
df_original = spark.read.csv(csv_path, **csv_read_options)
df_modificado = spark.read.csv(csv_path2, **csv_read_options)

                                                                                

- Completude

In [5]:
# Run the project-local `pessoal.completude` helper on each dataset, with a
# label printed before each call. Judging by the recorded output, the helper
# prints the number of rows and columns of the DataFrame it receives.
# NOTE(review): the helper is defined outside this notebook — confirm what
# else it checks.
print('Base original:')
pessoal.completude(df_original)
print('Base Modificada:')
pessoal.completude(df_modificado)

Base original:


                                                                                

Qtd. registros: 14900 | Quantidade de colunas:  24
Base Modificada:
Qtd. registros: 14900 | Quantidade de colunas:  24


In [6]:
# Per-column comparison of both datasets (duplicates, nulls and distinct
# values before/after, per the recorded output columns).
# NOTE(review): `completudeComparativa` is presumably provided by
# `from pessoal import *` and presumably returns a pandas DataFrame (it is
# rendered as a table) — confirm in the `pessoal` module.
comparacao = completudeComparativa(df_original, df_modificado)
# Bare last expression so the notebook renders the result.
comparacao

                                                                                

Unnamed: 0,coluna,duplicados_antes,duplicados_depois,nulos_antes,nulos_depois,valores_unicos_antes,valores_unicos_depois
0,Employee ID,0,0,0,0,14900,14900
1,Age,14858,14858,0,0,42,42
2,Gender,14898,14898,0,0,2,2
3,Years at Company,14849,14849,0,0,51,51
4,Job Role,14895,14895,0,0,5,5
5,Monthly Income,7972,7972,0,0,6928,6928
6,Work-Life Balance,14896,14896,0,0,4,4
7,Job Satisfaction,14896,14896,0,0,4,4
8,Performance Rating,14896,14896,0,0,4,4
9,Number of Promotions,14895,14895,0,0,5,5


### Validação das variáveis
- Nomes
- Tipos
- Conteúdos

In [7]:
# Column names of the original and of the modified dataset.
cols_before, cols_after = df_original.columns, df_modificado.columns

# Column-name normalisation helper
def normalize_name(name):
    """Return ``name`` as a lowercase, underscore-separated identifier.

    The name is lowercased, every run of characters outside ``a-z`` / ``0-9``
    is collapsed into a single underscore, and leading/trailing underscores
    are stripped, e.g. ``"Work-Life Balance"`` -> ``"work_life_balance"``.
    """
    return re.sub(r'[^a-z0-9]+', '_', name.lower()).strip('_')

# Pair every original column name with its normalised form (computed once).
normalized_pairs = [(old, normalize_name(old)) for old in cols_before]

# One record per original column: its name, the expected normalised name and
# whether that normalised name is present among the modified dataset's columns.
mapping = [
    {
        "coluna_antes": old,
        "coluna_depois": new,
        "existe_nome_modificado": new in cols_after
    }
    for old, new in normalized_pairs
]

comparison_names = pd.DataFrame(mapping)
# Bare last expression so the notebook renders the table.
comparison_names


Unnamed: 0,coluna_antes,coluna_depois,existe_nome_modificado
0,Employee ID,employee_id,True
1,Age,age,True
2,Gender,gender,True
3,Years at Company,years_at_company,True
4,Job Role,job_role,True
5,Monthly Income,monthly_income,True
6,Work-Life Balance,work_life_balance,True
7,Job Satisfaction,job_satisfaction,True
8,Performance Rating,performance_rating,True
9,Number of Promotions,number_of_promotions,True


In [8]:
# Build one small Spark DataFrame per dataset listing, for every column, its
# name, its Spark type (simpleString form) and its normalised name. The
# normalised name is the join key between the two schemas.
df_before = spark.createDataFrame(
    [(f.name, f.dataType.simpleString(), normalize_name(f.name)) for f in df_original.schema.fields],
    ["coluna_original", "tipo_antes", "coluna_norm"]
)

df_after = spark.createDataFrame(
    [(f.name, f.dataType.simpleString(), normalize_name(f.name)) for f in df_modificado.schema.fields],
    ["coluna_modificada", "tipo_depois", "coluna_norm"]
)

# The outer join keeps columns that exist in only one dataset; their type on
# the other side is NULL. A plain `!=` evaluates to NULL for such rows, which
# would leave `mudou_tipo` empty instead of flagging the difference. The
# null-safe comparison makes a column missing on either side count as changed.
comparison_spark = (
    df_before.join(df_after, on="coluna_norm", how="outer")
              .withColumn("mudou_tipo", ~df_before.tipo_antes.eqNullSafe(df_after.tipo_depois))
)

# Convert to pandas for display, ordered by the normalised column name
comparison_pd = comparison_spark.toPandas().sort_values("coluna_norm")

# Comparison table
comparison_pd[["coluna_original", "coluna_modificada", "tipo_antes", "tipo_depois", "mudou_tipo"]]


                                                                                

Unnamed: 0,coluna_original,coluna_modificada,tipo_antes,tipo_depois,mudou_tipo
0,Age,age,int,int,False
1,Attrition,attrition,string,int,True
2,Company Reputation,company_reputation,string,int,True
3,Company Size,company_size,string,int,True
4,Company Tenure,company_tenure,int,int,False
5,Distance from Home,distance_from_home,int,int,False
6,Education Level,education_level,string,int,True
7,Employee ID,employee_id,int,int,False
8,Employee Recognition,employee_recognition,string,int,True
9,Gender,gender,string,int,True


In [9]:
# Compare, variable by variable, the value counts of the original dataset with
# those of the modified one (one table per variable in the recorded output).
# NOTE(review): `valid_distrib` is presumably provided by `from pessoal import *`.
# NOTE(review): in the recorded tables the old and new values appear to be
# paired by row position, not by their actual mapping (e.g. "Healthcare" with
# count 3432 sits next to code 3 with count 2063) — match categories to codes
# by their counts, and confirm how the helper aligns rows.
valid_distrib(df_original, df_modificado)

### Variável: Employee ID → employee_id

Unnamed: 0,Employee ID antigo,count_antes,employee_id novo,count_depois
0,73470,1,73470,1
1,61793,1,61793,1
2,67753,1,67753,1
3,57020,1,57020,1
4,43688,1,43688,1
...,...,...,...,...
14895,22782,1,22782,1
14896,69438,1,69438,1
14897,23279,1,23279,1
14898,2376,1,2376,1


### Variável: Age → age

Unnamed: 0,Age antigo,count_antes,age novo,count_depois
0,31,362,31,362
1,53,386,53,386
2,34,374,34,374
3,28,340,28,340
4,26,340,26,340
5,27,369,27,369
6,44,326,44,326
7,22,358,22,358
8,47,313,47,313
9,52,342,52,342


### Variável: Gender → gender

Unnamed: 0,Gender antigo,count_antes,gender novo,count_depois
0,Female,6813,1,6813
1,Male,8087,2,8087


### Variável: Years at Company → years_at_company

Unnamed: 0,Years at Company antigo,count_antes,years_at_company novo,count_depois
0,31,182,31,182
1,34,111,34,111
2,28,209,28,209
3,26,257,26,257
4,27,223,27,223
5,44,50,44,50
6,12,552,12,552
7,22,322,22,322
8,47,35,47,35
9,1,635,1,635


### Variável: Job Role → job_role

Unnamed: 0,Job Role antigo,count_antes,job_role novo,count_depois
0,Education,3168,1,3168
1,Healthcare,3432,3,2063
2,Finance,2063,5,3815
3,Media,2422,4,2422
4,Technology,3815,2,3432


### Variável: Monthly Income → monthly_income

Unnamed: 0,Monthly Income antigo,count_antes,monthly_income novo,count_depois
0,6357,3,6357,3
1,8592,3,8592,3
2,7993,4,7993,4
3,9900,2,9900,2
4,7982,3,7982,3
...,...,...,...,...
6923,6477,1,6477,1
6924,6819,1,6819,1
6925,11660,1,11660,1
6926,8054,1,8054,1


### Variável: Work-Life Balance → work_life_balance

Unnamed: 0,Work-Life Balance antigo,count_antes,work_life_balance novo,count_depois
0,Excellent,2713,1,2713
1,Good,5630,3,4483
2,Fair,4483,4,2074
3,Poor,2074,2,5630


### Variável: Job Satisfaction → job_satisfaction

Unnamed: 0,Job Satisfaction antigo,count_antes,job_satisfaction novo,count_depois
0,High,7466,1,7466
1,Very High,2968,3,2900
2,Low,1566,4,1566
3,Medium,2900,2,2968


### Variável: Performance Rating → performance_rating

Unnamed: 0,Performance Rating antigo,count_antes,performance_rating novo,count_depois
0,High,3022,1,3022
1,Low,780,3,8909
2,Average,8909,4,2189
3,Below Average,2189,2,780


### Variável: Number of Promotions → number_of_promotions

Unnamed: 0,Number of Promotions antigo,count_antes,number_of_promotions novo,count_depois
0,1,3716,1,3716
1,3,804,3,804
2,4,201,4,201
3,2,2750,2,2750
4,0,7429,0,7429


### Variável: Overtime → overtime

Unnamed: 0,Overtime antigo,count_antes,overtime novo,count_depois
0,No,10009,1,4891
1,Yes,4891,2,10009


### Variável: Distance from Home → distance_from_home

Unnamed: 0,Distance from Home antigo,count_antes,distance_from_home novo,count_depois
0,31,149,31,149
1,85,146,85,146
2,65,126,65,126
3,53,177,53,177
4,78,153,78,153
...,...,...,...,...
94,67,173,67,173
95,18,140,18,140
96,74,167,74,167
97,36,147,36,147


### Variável: Education Level → education_level

Unnamed: 0,Education Level antigo,count_antes,education_level novo,count_depois
0,High School,2932,1,2932
1,Master’s Degree,3001,3,728
2,PhD,728,5,3734
3,Bachelor’s Degree,4505,4,4505
4,Associate Degree,3734,2,3001


### Variável: Marital Status → marital_status

Unnamed: 0,Marital Status antigo,count_antes,marital_status novo,count_depois
0,Married,7511,1,7511
1,Divorced,2223,3,5166
2,Single,5166,2,2223


### Variável: Number of Dependents → number_of_dependents

Unnamed: 0,Number of Dependents antigo,count_antes,number_of_dependents novo,count_depois
0,1,3821,1,3821
1,6,129,6,129
2,3,2126,3,2126
3,5,605,5,605
4,4,1554,4,1554
5,2,2255,2,2255
6,0,4410,0,4410


### Variável: Job Level → job_level

Unnamed: 0,Job Level antigo,count_antes,job_level novo,count_depois
0,Senior,3062,1,3062
1,Mid,5925,3,5913
2,Entry,5913,2,5925


### Variável: Company Size → company_size

Unnamed: 0,Company Size antigo,count_antes,company_size novo,count_depois
0,Medium,7486,1,7486
1,Small,4420,3,2994
2,Large,2994,2,4420


### Variável: Company Tenure → company_tenure

Unnamed: 0,Company Tenure antigo,count_antes,company_tenure novo,count_depois
0,31,165,31,165
1,85,152,85,152
2,65,192,65,192
3,53,200,53,200
4,78,183,78,183
...,...,...,...,...
121,18,117,18,117
122,74,202,74,202
123,104,43,104,43
124,36,172,36,172


### Variável: Remote Work → remote_work

Unnamed: 0,Remote Work antigo,count_antes,remote_work novo,count_depois
0,No,12061,1,2839
1,Yes,2839,2,12061


### Variável: Leadership Opportunities → leadership_opportunities

Unnamed: 0,Leadership Opportunities antigo,count_antes,leadership_opportunities novo,count_depois
0,No,14165,1,735
1,Yes,735,2,14165


### Variável: Innovation Opportunities → innovation_opportunities

Unnamed: 0,Innovation Opportunities antigo,count_antes,innovation_opportunities novo,count_depois
0,No,12499,1,2401
1,Yes,2401,2,12499


### Variável: Company Reputation → company_reputation

Unnamed: 0,Company Reputation antigo,count_antes,company_reputation novo,count_depois
0,Excellent,1433,1,1433
1,Good,7416,3,2969
2,Fair,2969,4,3082
3,Poor,3082,2,7416


### Variável: Employee Recognition → employee_recognition

Unnamed: 0,Employee Recognition antigo,count_antes,employee_recognition novo,count_depois
0,High,3706,1,3706
1,Very High,708,3,4624
2,Low,5862,4,5862
3,Medium,4624,2,708


### Variável: Attrition → attrition

Unnamed: 0,Attrition antigo,count_antes,attrition novo,count_depois
0,Left,7032,1,7032
1,Stayed,7868,2,7868


### Finalização do notebook

In [15]:
# Print the elapsed execution time up to this point (per the recorded output).
# NOTE(review): `executionTime` is presumably provided by `from pessoal import *`;
# the start timestamp appears to be printed when `pessoal` is imported — confirm.
executionTime()

Tempo de execucao ate este ponto: 0:03:49.803178
