
# Análise de taxas de suicídio por país (1985-2016)

## Bibliotecas

In [31]:
import pandas as pd
import wbdata
import datetime
import pycountry
import geopandas


## Importação dos Dados

In [6]:
# Path to the local CSV dataset, relative to the notebook's working directory
path = "./dataset/master.csv"

# Load the CSV file into the DataFrame used by the rest of the notebook
df = pd.read_csv(path)


## Análise Exploratória de Dados

In [7]:
# Preview the first five rows of the loaded DataFrame
df.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [None]:
# Drop the "country-year" column: it is redundant because it only repeats
# the "country" and "year" columns joined together (e.g. "Albania1987")
df = df.drop(columns=["country-year"])

In [12]:
# Row count before de-duplication, kept so the number of removed rows can be reported
antes = len(df)

# Drop fully duplicated rows and renumber the index from 0.
# Reassigning (instead of inplace=True) keeps the cell's data flow explicit.
df = df.drop_duplicates(ignore_index=True)

# Row count after de-duplication
depois = len(df)

# Removed rows = before - after; the reverse subtraction would report a
# negative number whenever duplicates are actually removed.
print(f"Linhas duplicadas existentes e removidas = {antes - depois}")

Linhas duplicadas existentes e removidas = 0


- Informações sobre Dataframe

In [15]:
# Print column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country             27820 non-null  object 
 1   year                27820 non-null  int64  
 2   sex                 27820 non-null  object 
 3   age                 27820 non-null  object 
 4   suicides_no         27820 non-null  int64  
 5   population          27820 non-null  int64  
 6   suicides/100k pop   27820 non-null  float64
 7   HDI for year        8364 non-null   float64
 8    gdp_for_year ($)   27820 non-null  object 
 9   gdp_per_capita ($)  27820 non-null  int64  
 10  generation          27820 non-null  object 
dtypes: float64(2), int64(4), object(5)
memory usage: 2.3+ MB


- Estatística Básica

In [14]:
# Summary statistics of the numeric columns, transposed so each column is a row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,27820.0,2001.258,8.469055,1985.0,1995.0,2002.0,2008.0,2016.0
suicides_no,27820.0,242.5744,902.0479,0.0,3.0,25.0,131.0,22338.0
population,27820.0,1844794.0,3911779.0,278.0,97498.5,430150.0,1486143.25,43805210.0
suicides/100k pop,27820.0,12.8161,18.96151,0.0,0.92,5.99,16.62,224.97
HDI for year,8364.0,0.7766011,0.09336671,0.483,0.713,0.779,0.855,0.944
gdp_per_capita ($),27820.0,16866.46,18887.58,251.0,3447.0,9372.0,24874.0,126352.0


- Contando Nulos ou NaN

In [25]:
# Total number of rows, as a reference for the null counts
len(df)

27820

In [24]:
# Number of null/NaN values in each column
df.isnull().sum()

country                   0
year                      0
sex                       0
age                       0
suicides_no               0
population                0
suicides/100k pop         0
HDI for year          19456
 gdp_for_year ($)         0
gdp_per_capita ($)        0
generation                0
dtype: int64

In [18]:
# Show the first 20 rows after the cleaning steps
df.head(20)

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,,2156624900,796,Boomers
5,Albania,1987,female,75+ years,1,35600,2.81,,2156624900,796,G.I. Generation
6,Albania,1987,female,35-54 years,6,278800,2.15,,2156624900,796,Silent
7,Albania,1987,female,25-34 years,4,257200,1.56,,2156624900,796,Boomers
8,Albania,1987,male,55-74 years,1,137500,0.73,,2156624900,796,G.I. Generation
9,Albania,1987,female,5-14 years,0,311000,0.0,,2156624900,796,Generation X


- Preencher os dados nulos com informações do World Bank

In [None]:
def get_country_code(country_name):
    """Map a country name to its ISO 3166-1 alpha-2 (two-letter) code.

    The exact ``name`` match is tried first. If that finds nothing, the name
    is retried with ``pycountry.countries.lookup``, which is case-insensitive
    and also checks the other fields of each record.

    Args:
        country_name: Country name as it appears in the dataset.

    Returns:
        The two-letter ISO code, or ``None`` when ``country_name`` is not a
        non-empty string or no matching country is found.
    """
    # Non-string values (None, NaN) and blank strings can never match.
    if not isinstance(country_name, str) or not country_name.strip():
        return None

    name = country_name.strip()
    try:
        country = pycountry.countries.get(name=name)
        if country is None:
            # Exact match failed; fall back to the more tolerant lookup.
            country = pycountry.countries.lookup(name)
    except LookupError:
        # lookup() raises LookupError when nothing matches. KeyError is a
        # LookupError subclass, so a get() that raises on a miss is covered too.
        return None

    return country.alpha_2  # two-letter ISO code

def get_gdp_data(country_code, year):
    """Fetch a country's GDP for one year from the World Bank API.

    Uses indicator ``NY.GDP.MKTP.CD`` (GDP, current US$) through ``wbdata``.

    Args:
        country_code: ISO country code accepted by ``wbdata``. A falsy value
            (``None`` or ``""``) returns ``None`` without calling the API.
        year: Calendar year to query.

    Returns:
        The GDP value, or ``None`` when the country code is missing, the
        request fails, no row is returned, or the returned value is NaN.
    """
    # Without a country code there is nothing to query; skip the network call.
    if not country_code:
        return None

    indicator = {'NY.GDP.MKTP.CD': 'GDP'}
    start_date = datetime.datetime(year, 1, 1)
    end_date = datetime.datetime(year, 12, 31)

    try:
        # NOTE(review): `data_date` appears to be the keyword of wbdata releases
        # before 1.0 (later ones seem to call it `date`) -- confirm against the
        # installed version. A mismatch raises TypeError, which is swallowed
        # below, so every lookup would silently return None.
        gdp_data = wbdata.get_dataframe(indicator, country=country_code, data_date=(start_date, end_date))
        # .iloc[0] selects by position; the result is indexed by date, not by integers.
        gdp_value = gdp_data['GDP'].iloc[0]
    except Exception:
        # Best-effort lookup: any API or parsing failure means "no data".
        # `Exception` (not a bare except) lets KeyboardInterrupt stop a long run.
        return None

    # The API reports missing observations as NaN; treat them as "no data".
    return None if pd.isna(gdp_value) else gdp_value

# Fill rows whose 'HDI for year' is null with the World Bank GDP value for
# the same country and year.
# NOTE(review): this writes GDP (current US$) into a column whose existing
# values are HDI figures between 0.483 and 0.944 -- confirm this is intended.
# The dataset already has a fully populated ' gdp_for_year ($)' column.

# Each (country, year) pair repeats across sex/age rows, so cache the API
# result per pair instead of issuing one request per row.
gdp_by_country_year = {}

for index, row in df[df['HDI for year'].isnull()].iterrows():
    country = row['country']
    year = row['year']
    key = (country, year)

    if key not in gdp_by_country_year:
        country_code = get_country_code(country)  # ISO alpha-2 code, or None
        gdp_by_country_year[key] = get_gdp_data(country_code, year)
    gdp_value = gdp_by_country_year[key]

    # Skip failed lookups: None and NaN both mean "no data".
    if pd.notna(gdp_value):
        df.at[index, 'HDI for year'] = gdp_value
