In [23]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import yfinance as yf
from pandasql import sqldf
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [37]:
# Read the municipality coverage export; fields in this file are separated by ';'.
coverage_csv = 'Meu_Municipio_Cobertura.csv'
df = pd.read_csv(coverage_csv, sep=';')

# Show the first 30 rows as a quick sanity check of the parsed columns.
df.head(30)

Unnamed: 0,Ano,Mês,Operadora,Tecnologia,Código IBGE,Município,UF,Nome UF,Região,Código Nacional,% moradores cobertos,% domicílios cobertos,% área coberta
0,2021.0,11.0,Todas,4G,4115200.0,Maringá - PR,PR,Paraná,Sul,44.0,99911338304165,99910746714728,91859948680187
1,2021.0,11.0,LIGUE,2G,4115200.0,Maringá - PR,PR,Paraná,Sul,44.0,0,0,0
2,2021.0,11.0,VIVO,2G,4115200.0,Maringá - PR,PR,Paraná,Sul,44.0,95647555982772,95959903169683,66267932616656
3,2021.0,11.0,LIGUE,3G,4115200.0,Maringá - PR,PR,Paraná,Sul,44.0,0,0,0
4,2021.0,11.0,CLARO,Todas,4115200.0,Maringá - PR,PR,Paraná,Sul,44.0,99486127766182,99532104273336,82804391686534
5,2021.0,11.0,SERCOMTEL,4G,4115200.0,Maringá - PR,PR,Paraná,Sul,44.0,0,0,0
6,2021.0,11.0,SERCOMTEL,Todas,4115200.0,Maringá - PR,PR,Paraná,Sul,44.0,0,0,0
7,2021.0,11.0,LIGUE,3G4G,4115200.0,Maringá - PR,PR,Paraná,Sul,44.0,0,0,0
8,2021.0,11.0,VIVO,Todas,4115200.0,Maringá - PR,PR,Paraná,Sul,44.0,99659911044447,99668570130315,83879952754244
9,2021.0,11.0,VIVO,3G,4115200.0,Maringá - PR,PR,Paraná,Sul,44.0,99564145483379,9957622047023,80562800846473


In [61]:
# Summarize the frame: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1515041 entries, 0 to 1515040
Data columns (total 13 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Ano                    1515040 non-null  float64
 1   Mês                    1515040 non-null  float64
 2   Operadora              1515040 non-null  object 
 3   Tecnologia             1515040 non-null  object 
 4   Código IBGE            1515040 non-null  float64
 5   Município              1515040 non-null  object 
 6   UF                     1515040 non-null  object 
 7   Nome UF                1515040 non-null  object 
 8   Região                 1515040 non-null  object 
 9   Código Nacional        1515040 non-null  float64
 10  % moradores cobertos   1513680 non-null  object 
 11  % domicílios cobertos  1513680 non-null  object 
 12  % área coberta         1513680 non-null  object 
dtypes: float64(4), object(9)
memory usage: 150.3+ MB


In [55]:
# List the column labels; the exact spelling (accents, leading "% ") is what
# must be used when selecting columns by name.
df.columns

Index(['Ano', 'Mês', 'Operadora', 'Tecnologia', 'Código IBGE', 'Município',
       'UF', 'Nome UF', 'Região', 'Código Nacional', '% moradores cobertos',
       '% domicílios cobertos', '% área coberta'],
      dtype='object')

In [95]:
# Descriptive statistics; by default describe() covers only the numeric columns.
# NOTE(review): the three "% ..." columns are loaded as text and only become
# numeric after the conversion cell further down has run, so on a fresh
# top-to-bottom run this summary would presumably omit them - consider moving
# this cell below the conversion.
df.describe()

Unnamed: 0,Ano,Mês,Código IBGE,Código Nacional,% moradores cobertos,% domicílios cobertos,% área coberta
count,1515040.0,1515040.0,1515040.0,1515040.0,1513680.0,1513680.0,1513680.0
mean,2022.441,8.911765,3253591.0,57.09964,20.50808,20.67536,9.908949
std,1.116486,3.450216,984822.2,25.42054,34.80325,34.98771,22.45501
min,2021.0,3.0,1100015.0,11.0,0.0,0.0,0.0
25%,2021.0,9.0,2512101.0,35.0,0.0,0.0,0.0
50%,2022.0,11.0,3146280.0,55.0,0.0,0.0,0.0
75%,2023.0,12.0,4119202.0,82.0,41.62521,42.97516,3.221171
max,2024.0,12.0,5300108.0,99.0,100.0,100.0,100.0


In [69]:
# Convert the coverage-percentage columns from decimal-comma text to float.
pct_cols = ['% moradores cobertos', '% domicílios cobertos', '% área coberta']
for col in pct_cols:
    # Guard on dtype so the cell can be re-run safely: once a column is already
    # float, the `.str` accessor would raise AttributeError.
    if not pd.api.types.is_numeric_dtype(df[col]):
        # regex=False: the comma is a literal character, not a pattern.
        df[col] = df[col].str.replace(',', '.', regex=False).astype(float)

# Confirm that the three columns are now float64.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1515041 entries, 0 to 1515040
Data columns (total 13 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Ano                    1515040 non-null  float64
 1   Mês                    1515040 non-null  float64
 2   Operadora              1515040 non-null  object 
 3   Tecnologia             1515040 non-null  object 
 4   Código IBGE            1515040 non-null  float64
 5   Município              1515040 non-null  object 
 6   UF                     1515040 non-null  object 
 7   Nome UF                1515040 non-null  object 
 8   Região                 1515040 non-null  object 
 9   Código Nacional        1515040 non-null  float64
 10  % moradores cobertos   1513680 non-null  float64
 11  % domicílios cobertos  1513680 non-null  float64
 12  % área coberta         1513680 non-null  float64
dtypes: float64(7), object(6)
memory usage: 150.3+ MB


In [107]:
# Mean coverage (residents, households, area) for each operator/technology pair.
coverage_cols = ['% moradores cobertos', '% domicílios cobertos', '% área coberta']
df.groupby(['Operadora', 'Tecnologia'])[coverage_cols].mean().head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,% moradores cobertos,% domicílios cobertos,% área coberta
Operadora,Tecnologia,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ALGAR,2G,1.190191,1.190218,0.344968
ALGAR,3G,1.739066,1.740096,0.329953
ALGAR,3G4G,0.512496,0.512752,0.154146
ALGAR,4G,1.079438,1.077552,0.416921
ALGAR,4G5G,1.114331,1.113439,0.443386
ALGAR,5G,0.168676,0.168495,0.035442
ALGAR,5G_SA_NSA,0.064326,0.064239,0.006073
ALGAR,Todas,1.891669,1.891964,0.652517
CLARO,2G,47.257169,47.829535,13.202492
CLARO,3G,53.139028,53.733603,16.663517


In [103]:
# Resident coverage for ALGAR's 2G rows, rescaled from percent to a 0-1 fraction.
is_algar_2g = df['Tecnologia'].eq('2G') & df['Operadora'].eq('ALGAR')
wow = df.loc[is_algar_2g, '% moradores cobertos'] / 100

# Average fraction of residents covered across those rows.
wow.mean()

0.011901912670293344

In [None]:


# NOTE(review): `x`, `y` and `param_grid` are not defined in any cell visible
# here; they must be created before this cell can run.
# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Seed the tree as well: scikit-learn permutes features at every split, so
# equally good splits can otherwise be chosen differently between runs.
tree = DecisionTreeRegressor(max_depth=30, ccp_alpha=0, random_state=42)

# Grid search over `param_grid`; it is only configured here, not yet fitted.
grid_search = GridSearchCV(
    estimator=tree,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='neg_mean_squared_error',  # evaluation metric: negated MSE (higher is better)
)