# Brazilian Cities Analysis for HDI Prediction

This notebook aims to analyse machine learning regression models to predict the Human Development Index (HDI) of Brazilian cities. The dataset used in this notebook is the Brazilian Cities Dataset, which contains information about the HDI and other features of Brazilian cities.

Data Source: https://www.kaggle.com/crisparada/brazilian-cities
- All the data is in the data directory of the repository.

## Dataset Aquistion and Data Preparation

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns

# Remove the limiter on the number of columns displayed
pd.set_option('display.max_columns', None)

# Load the data
df = pd.read_csv('data/BRAZIL_CITIES_REV2022.CSV')
df.head()

Unnamed: 0,CITY,STATE,CAPITAL,IBGE_RES_POP,IBGE_RES_POP_BRAS,IBGE_RES_POP_ESTR,IBGE_DU,IBGE_DU_URBAN,IBGE_DU_RURAL,IBGE_POP,IBGE_1,IBGE_1-4,IBGE_5-9,IBGE_10-14,IBGE_15-59,IBGE_60+,IBGE_PLANTED_AREA,IBGE_CROP_PRODUCTION_$,IDHM Ranking 2010,IDHM,IDHM_Renda,IDHM_Longevidade,IDHM_Educacao,LONG,LAT,ALT,PAY_TV,FIXED_PHONES,AREA,REGIAO_TUR,CATEGORIA_TUR,ESTIMATED_POP,RURAL_URBAN,GVA_AGROPEC,GVA_INDUSTRY,GVA_SERVICES,GVA_PUBLIC,GVA_TOTAL,TAXES,GDP,POP_GDP,GDP_CAPITA,GVA_MAIN,MUN_EXPENDIT,COMP_TOT,COMP_A,COMP_B,COMP_C,COMP_D,COMP_E,COMP_F,COMP_G,COMP_H,COMP_I,COMP_J,COMP_K,COMP_L,COMP_M,COMP_N,COMP_O,COMP_P,COMP_Q,COMP_R,COMP_S,COMP_T,COMP_U,HOTELS,BEDS,Pr_Agencies,Pu_Agencies,Pr_Bank,Pu_Bank,Pr_Assets,Pu_Assets,Cars,Motorcycles,Wheeled_tractor,UBER,MAC,WAL-MART,POST_OFFICES
0,Abadia De Goiás,GO,0,6876,6876,0,2137,1546,591,5300,69,318,438,517,3542,416,319,1843,1689,0.708,0.687,0.83,0.622,-49.4405,-16.7588,893.6015,360,842,147.256,0,0,8583,Urbano,6201,27991,74750,36915,145858,20554,166412,8053,20665.0,Demais serviços,28227690,284,5,1,56,0,2,29,110,26,4,5,0,2,10,12,4,6,6,1,5,0,0,0,0,0,0,0,0,0,0,2158,1246,0,0,0,0,1
1,Abadia Dos Dourados,MG,0,6704,6704,0,2328,1481,847,4154,38,207,260,351,2709,589,4479,18017,2207,0.689,0.693,0.839,0.563,-47.3968,-18.4876,753.1249,77,296,881.064,Caminhos Do Cerrado,D,6972,Rural Adjacente,50525,25918,62689,28084,167215,12873,180089,7037,25592.0,Demais serviços,17909274,476,6,6,30,1,2,34,190,70,28,11,0,4,15,29,2,9,14,6,19,0,0,0,0,0,0,0,0,0,0,2227,1142,0,0,0,0,1
2,Abadiânia,GO,0,15757,15609,148,4655,3233,1422,10656,139,650,894,1087,6896,990,10307,33085,2202,0.689,0.671,0.841,0.579,-48.7188,-16.1827,1017.5506,227,720,1045.127,Região Turística Do Ouro E Cristais,C,19614,Rural Adjacente,42839,16728,138199,63396,261162,26823,287984,18427,15628.0,Demais serviços,37513019,288,5,9,26,0,2,7,117,12,57,2,1,0,7,15,3,11,5,1,8,0,0,1,34,1,1,1,1,33724584,67091904,2838,1426,0,0,0,0,3
3,Abaetetuba,PA,0,141100,141040,60,31061,19057,12004,82956,1354,5567,7618,8905,53516,5996,25200,700872,3530,0.628,0.579,0.798,0.537,-48.8844,-1.7235,10.1203,3389,1218,1610.651,Araguaia-Tocantins,D,156292,Urbano,140464,58610,468129,486872,1154075,95180,1249255,151934,8222.0,"Administração, defesa, educação e saúde públic...",0,931,4,2,43,0,1,27,500,16,31,6,1,1,22,16,2,155,33,15,56,0,0,0,0,2,4,2,4,76181384,800078483,5277,25661,0,0,0,0,2
4,Abaeté,MG,0,22690,22690,0,7694,6667,1027,18464,176,856,1233,1539,11979,2681,1862,7502,1994,0.698,0.72,0.848,0.556,-45.4462,-19.1558,644.7399,1230,1716,1817.067,Lago De Três Marias,D,23223,Urbano,113825,31003,172333,86081,403241,26994,430235,23574,18250.0,Demais serviços,0,621,18,1,40,0,1,20,303,62,30,9,6,4,28,27,2,15,19,9,27,0,0,0,0,2,2,2,2,44974716,371922572,6928,2953,0,0,0,0,4


In [4]:
# Removing the columns that are too conected to the target variable (IDHM_Renda, IDHM Ranking 2010 etc)
df = df.drop(columns=['IDHM Ranking 2010', 'IDHM_Renda', 'IDHM_Longevidade', 'IDHM_Educacao' ])
df.head()

Unnamed: 0,CITY,STATE,CAPITAL,IBGE_RES_POP,IBGE_RES_POP_BRAS,IBGE_RES_POP_ESTR,IBGE_DU,IBGE_DU_URBAN,IBGE_DU_RURAL,IBGE_POP,IBGE_1,IBGE_1-4,IBGE_5-9,IBGE_10-14,IBGE_15-59,IBGE_60+,IBGE_PLANTED_AREA,IBGE_CROP_PRODUCTION_$,IDHM,LONG,LAT,ALT,PAY_TV,FIXED_PHONES,AREA,REGIAO_TUR,CATEGORIA_TUR,ESTIMATED_POP,RURAL_URBAN,GVA_AGROPEC,GVA_INDUSTRY,GVA_SERVICES,GVA_PUBLIC,GVA_TOTAL,TAXES,GDP,POP_GDP,GDP_CAPITA,GVA_MAIN,MUN_EXPENDIT,COMP_TOT,COMP_A,COMP_B,COMP_C,COMP_D,COMP_E,COMP_F,COMP_G,COMP_H,COMP_I,COMP_J,COMP_K,COMP_L,COMP_M,COMP_N,COMP_O,COMP_P,COMP_Q,COMP_R,COMP_S,COMP_T,COMP_U,HOTELS,BEDS,Pr_Agencies,Pu_Agencies,Pr_Bank,Pu_Bank,Pr_Assets,Pu_Assets,Cars,Motorcycles,Wheeled_tractor,UBER,MAC,WAL-MART,POST_OFFICES
0,Abadia De Goiás,GO,0,6876,6876,0,2137,1546,591,5300,69,318,438,517,3542,416,319,1843,0.708,-49.4405,-16.7588,893.6015,360,842,147.256,0,0,8583,Urbano,6201,27991,74750,36915,145858,20554,166412,8053,20665.0,Demais serviços,28227690,284,5,1,56,0,2,29,110,26,4,5,0,2,10,12,4,6,6,1,5,0,0,0,0,0,0,0,0,0,0,2158,1246,0,0,0,0,1
1,Abadia Dos Dourados,MG,0,6704,6704,0,2328,1481,847,4154,38,207,260,351,2709,589,4479,18017,0.689,-47.3968,-18.4876,753.1249,77,296,881.064,Caminhos Do Cerrado,D,6972,Rural Adjacente,50525,25918,62689,28084,167215,12873,180089,7037,25592.0,Demais serviços,17909274,476,6,6,30,1,2,34,190,70,28,11,0,4,15,29,2,9,14,6,19,0,0,0,0,0,0,0,0,0,0,2227,1142,0,0,0,0,1
2,Abadiânia,GO,0,15757,15609,148,4655,3233,1422,10656,139,650,894,1087,6896,990,10307,33085,0.689,-48.7188,-16.1827,1017.5506,227,720,1045.127,Região Turística Do Ouro E Cristais,C,19614,Rural Adjacente,42839,16728,138199,63396,261162,26823,287984,18427,15628.0,Demais serviços,37513019,288,5,9,26,0,2,7,117,12,57,2,1,0,7,15,3,11,5,1,8,0,0,1,34,1,1,1,1,33724584,67091904,2838,1426,0,0,0,0,3
3,Abaetetuba,PA,0,141100,141040,60,31061,19057,12004,82956,1354,5567,7618,8905,53516,5996,25200,700872,0.628,-48.8844,-1.7235,10.1203,3389,1218,1610.651,Araguaia-Tocantins,D,156292,Urbano,140464,58610,468129,486872,1154075,95180,1249255,151934,8222.0,"Administração, defesa, educação e saúde públic...",0,931,4,2,43,0,1,27,500,16,31,6,1,1,22,16,2,155,33,15,56,0,0,0,0,2,4,2,4,76181384,800078483,5277,25661,0,0,0,0,2
4,Abaeté,MG,0,22690,22690,0,7694,6667,1027,18464,176,856,1233,1539,11979,2681,1862,7502,0.698,-45.4462,-19.1558,644.7399,1230,1716,1817.067,Lago De Três Marias,D,23223,Urbano,113825,31003,172333,86081,403241,26994,430235,23574,18250.0,Demais serviços,0,621,18,1,40,0,1,20,303,62,30,9,6,4,28,27,2,15,19,9,27,0,0,0,0,2,2,2,2,44974716,371922572,6928,2953,0,0,0,0,4


In [5]:
# Checking for missing values
df.isnull().sum()

# Since there are no missing values, we can proceed to the next step

CITY                 0
STATE                0
CAPITAL              0
IBGE_RES_POP         0
IBGE_RES_POP_BRAS    0
                    ..
Wheeled_tractor      0
UBER                 0
MAC                  0
WAL-MART             0
POST_OFFICES         0
Length: 77, dtype: int64

In [6]:
# Droping the city name column
df = df.drop(columns=['CITY'])
# Droping the REGIAO_TUR column
df = df.drop(columns=['REGIAO_TUR'])

# Making the state column categorical (0, 1...)
df['STATE'] = pd.Categorical(df['STATE'])
df['STATE'] = df['STATE'].cat.codes

# Making the CATEGORIA_TUR column categorical (0, 1...)
df['CATEGORIA_TUR'] = pd.Categorical(df['CATEGORIA_TUR'])
df['CATEGORIA_TUR'] = df['CATEGORIA_TUR'].cat.codes

# Making the RURAL_URBAN column categorical (0, 1...)
df['RURAL_URBAN'] = pd.Categorical(df['RURAL_URBAN'])
df['RURAL_URBAN'] = df['RURAL_URBAN'].cat.codes

# Making the GVA_MAIN column categorical (0, 1...)
df['GVA_MAIN'] = pd.Categorical(df['GVA_MAIN'])
df['GVA_MAIN'] = df['GVA_MAIN'].cat.codes

# check for nan values
print(df.isnull().sum().sum())

df.head()

0


Unnamed: 0,STATE,CAPITAL,IBGE_RES_POP,IBGE_RES_POP_BRAS,IBGE_RES_POP_ESTR,IBGE_DU,IBGE_DU_URBAN,IBGE_DU_RURAL,IBGE_POP,IBGE_1,IBGE_1-4,IBGE_5-9,IBGE_10-14,IBGE_15-59,IBGE_60+,IBGE_PLANTED_AREA,IBGE_CROP_PRODUCTION_$,IDHM,LONG,LAT,ALT,PAY_TV,FIXED_PHONES,AREA,CATEGORIA_TUR,ESTIMATED_POP,RURAL_URBAN,GVA_AGROPEC,GVA_INDUSTRY,GVA_SERVICES,GVA_PUBLIC,GVA_TOTAL,TAXES,GDP,POP_GDP,GDP_CAPITA,GVA_MAIN,MUN_EXPENDIT,COMP_TOT,COMP_A,COMP_B,COMP_C,COMP_D,COMP_E,COMP_F,COMP_G,COMP_H,COMP_I,COMP_J,COMP_K,COMP_L,COMP_M,COMP_N,COMP_O,COMP_P,COMP_Q,COMP_R,COMP_S,COMP_T,COMP_U,HOTELS,BEDS,Pr_Agencies,Pu_Agencies,Pr_Bank,Pu_Bank,Pr_Assets,Pu_Assets,Cars,Motorcycles,Wheeled_tractor,UBER,MAC,WAL-MART,POST_OFFICES
0,8,0,6876,6876,0,2137,1546,591,5300,69,318,438,517,3542,416,319,1843,0.708,-49.4405,-16.7588,893.6015,360,842,147.256,0,8583,6,6201,27991,74750,36915,145858,20554,166412,8053,20665.0,5,28227690,284,5,1,56,0,2,29,110,26,4,5,0,2,10,12,4,6,6,1,5,0,0,0,0,0,0,0,0,0,0,2158,1246,0,0,0,0,1
1,10,0,6704,6704,0,2328,1481,847,4154,38,207,260,351,2709,589,4479,18017,0.689,-47.3968,-18.4876,753.1249,77,296,881.064,4,6972,3,50525,25918,62689,28084,167215,12873,180089,7037,25592.0,5,17909274,476,6,6,30,1,2,34,190,70,28,11,0,4,15,29,2,9,14,6,19,0,0,0,0,0,0,0,0,0,0,2227,1142,0,0,0,0,1
2,8,0,15757,15609,148,4655,3233,1422,10656,139,650,894,1087,6896,990,10307,33085,0.689,-48.7188,-16.1827,1017.5506,227,720,1045.127,3,19614,3,42839,16728,138199,63396,261162,26823,287984,18427,15628.0,5,37513019,288,5,9,26,0,2,7,117,12,57,2,1,0,7,15,3,11,5,1,8,0,0,1,34,1,1,1,1,33724584,67091904,2838,1426,0,0,0,0,3
3,13,0,141100,141040,60,31061,19057,12004,82956,1354,5567,7618,8905,53516,5996,25200,700872,0.628,-48.8844,-1.7235,10.1203,3389,1218,1610.651,4,156292,6,140464,58610,468129,486872,1154075,95180,1249255,151934,8222.0,1,0,931,4,2,43,0,1,27,500,16,31,6,1,1,22,16,2,155,33,15,56,0,0,0,0,2,4,2,4,76181384,800078483,5277,25661,0,0,0,0,2
4,10,0,22690,22690,0,7694,6667,1027,18464,176,856,1233,1539,11979,2681,1862,7502,0.698,-45.4462,-19.1558,644.7399,1230,1716,1817.067,4,23223,6,113825,31003,172333,86081,403241,26994,430235,23574,18250.0,5,0,621,18,1,40,0,1,20,303,62,30,9,6,4,28,27,2,15,19,9,27,0,0,0,0,2,2,2,2,44974716,371922572,6928,2953,0,0,0,0,4


In [7]:
# Calculate the correlation matrix
corr = df.corr()

# Filter the correlation matrix to show only the correlations with the 'IDHM' column that are above the threshold
threshold = 0.1  # You can adjust this threshold
filtered_corr = corr['IDHM'][corr['IDHM'].abs() >= threshold]

# Extract the column names
columns_with_high_corr = filtered_corr.index.tolist()

# Print the list of column names with desired correlation
print(columns_with_high_corr)

# Drop all columns that are not in the list of columns with high correlation
df = df[columns_with_high_corr]
df.head()

['STATE', 'IBGE_RES_POP', 'IBGE_RES_POP_BRAS', 'IBGE_DU', 'IBGE_DU_URBAN', 'IBGE_DU_RURAL', 'IBGE_POP', 'IBGE_1', 'IBGE_1-4', 'IBGE_5-9', 'IBGE_10-14', 'IBGE_15-59', 'IBGE_60+', 'IBGE_PLANTED_AREA', 'IBGE_CROP_PRODUCTION_$', 'IDHM', 'LONG', 'LAT', 'ALT', 'PAY_TV', 'FIXED_PHONES', 'AREA', 'CATEGORIA_TUR', 'ESTIMATED_POP', 'RURAL_URBAN', 'GVA_AGROPEC', 'GVA_INDUSTRY', 'GVA_SERVICES', 'GVA_PUBLIC', 'GVA_TOTAL', 'TAXES', 'GDP', 'POP_GDP', 'GDP_CAPITA', 'GVA_MAIN', 'MUN_EXPENDIT', 'COMP_TOT', 'COMP_A', 'COMP_B', 'COMP_C', 'COMP_E', 'COMP_F', 'COMP_G', 'COMP_H', 'COMP_I', 'COMP_L', 'COMP_M', 'COMP_N', 'COMP_O', 'COMP_P', 'COMP_Q', 'COMP_R', 'COMP_S', 'HOTELS', 'BEDS', 'Pr_Agencies', 'Pu_Agencies', 'Pr_Bank', 'Pu_Bank', 'Cars', 'Motorcycles', 'Wheeled_tractor', 'UBER', 'MAC', 'WAL-MART', 'POST_OFFICES']


Unnamed: 0,STATE,IBGE_RES_POP,IBGE_RES_POP_BRAS,IBGE_DU,IBGE_DU_URBAN,IBGE_DU_RURAL,IBGE_POP,IBGE_1,IBGE_1-4,IBGE_5-9,IBGE_10-14,IBGE_15-59,IBGE_60+,IBGE_PLANTED_AREA,IBGE_CROP_PRODUCTION_$,IDHM,LONG,LAT,ALT,PAY_TV,FIXED_PHONES,AREA,CATEGORIA_TUR,ESTIMATED_POP,RURAL_URBAN,GVA_AGROPEC,GVA_INDUSTRY,GVA_SERVICES,GVA_PUBLIC,GVA_TOTAL,TAXES,GDP,POP_GDP,GDP_CAPITA,GVA_MAIN,MUN_EXPENDIT,COMP_TOT,COMP_A,COMP_B,COMP_C,COMP_E,COMP_F,COMP_G,COMP_H,COMP_I,COMP_L,COMP_M,COMP_N,COMP_O,COMP_P,COMP_Q,COMP_R,COMP_S,HOTELS,BEDS,Pr_Agencies,Pu_Agencies,Pr_Bank,Pu_Bank,Cars,Motorcycles,Wheeled_tractor,UBER,MAC,WAL-MART,POST_OFFICES
0,8,6876,6876,2137,1546,591,5300,69,318,438,517,3542,416,319,1843,0.708,-49.4405,-16.7588,893.6015,360,842,147.256,0,8583,6,6201,27991,74750,36915,145858,20554,166412,8053,20665.0,5,28227690,284,5,1,56,2,29,110,26,4,2,10,12,4,6,6,1,5,0,0,0,0,0,0,2158,1246,0,0,0,0,1
1,10,6704,6704,2328,1481,847,4154,38,207,260,351,2709,589,4479,18017,0.689,-47.3968,-18.4876,753.1249,77,296,881.064,4,6972,3,50525,25918,62689,28084,167215,12873,180089,7037,25592.0,5,17909274,476,6,6,30,2,34,190,70,28,4,15,29,2,9,14,6,19,0,0,0,0,0,0,2227,1142,0,0,0,0,1
2,8,15757,15609,4655,3233,1422,10656,139,650,894,1087,6896,990,10307,33085,0.689,-48.7188,-16.1827,1017.5506,227,720,1045.127,3,19614,3,42839,16728,138199,63396,261162,26823,287984,18427,15628.0,5,37513019,288,5,9,26,2,7,117,12,57,0,7,15,3,11,5,1,8,1,34,1,1,1,1,2838,1426,0,0,0,0,3
3,13,141100,141040,31061,19057,12004,82956,1354,5567,7618,8905,53516,5996,25200,700872,0.628,-48.8844,-1.7235,10.1203,3389,1218,1610.651,4,156292,6,140464,58610,468129,486872,1154075,95180,1249255,151934,8222.0,1,0,931,4,2,43,1,27,500,16,31,1,22,16,2,155,33,15,56,0,0,2,4,2,4,5277,25661,0,0,0,0,2
4,10,22690,22690,7694,6667,1027,18464,176,856,1233,1539,11979,2681,1862,7502,0.698,-45.4462,-19.1558,644.7399,1230,1716,1817.067,4,23223,6,113825,31003,172333,86081,403241,26994,430235,23574,18250.0,5,0,621,18,1,40,1,20,303,62,30,4,28,27,2,15,19,9,27,0,0,2,2,2,2,6928,2953,0,0,0,0,4


In [8]:
# Dropping outliers

for column in df.columns:
    # Calculate the quartiles
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    
    # Filter the outliers
    df = df[(df[column] >= Q1 - 1.5 * IQR) & (df[column] <= Q3 + 1.5 * IQR)]
    
df.head()

Unnamed: 0,STATE,IBGE_RES_POP,IBGE_RES_POP_BRAS,IBGE_DU,IBGE_DU_URBAN,IBGE_DU_RURAL,IBGE_POP,IBGE_1,IBGE_1-4,IBGE_5-9,IBGE_10-14,IBGE_15-59,IBGE_60+,IBGE_PLANTED_AREA,IBGE_CROP_PRODUCTION_$,IDHM,LONG,LAT,ALT,PAY_TV,FIXED_PHONES,AREA,CATEGORIA_TUR,ESTIMATED_POP,RURAL_URBAN,GVA_AGROPEC,GVA_INDUSTRY,GVA_SERVICES,GVA_PUBLIC,GVA_TOTAL,TAXES,GDP,POP_GDP,GDP_CAPITA,GVA_MAIN,MUN_EXPENDIT,COMP_TOT,COMP_A,COMP_B,COMP_C,COMP_E,COMP_F,COMP_G,COMP_H,COMP_I,COMP_L,COMP_M,COMP_N,COMP_O,COMP_P,COMP_Q,COMP_R,COMP_S,HOTELS,BEDS,Pr_Agencies,Pu_Agencies,Pr_Bank,Pu_Bank,Cars,Motorcycles,Wheeled_tractor,UBER,MAC,WAL-MART,POST_OFFICES
39,16,5098,5098,1536,1100,436,3594,46,198,265,379,2100,606,2442,6319,0.599,-42.6592,-5.7955,260.4377,145,77,112.392,0,5148,3,1981,1454,6406,18108,27949,1571,29519,5069,5824.0,1,12532965,39,0,0,1,0,3,22,0,0,0,0,0,2,1,1,0,9,0,0,0,0,0,0,282,1185,0,0,0,0,1
54,10,2709,2706,875,364,511,1099,13,40,69,98,716,163,15,28,0.649,-44.6381,-22.1711,1115.5199,162,92,161.356,5,2683,3,5039,1801,8576,14932,30348,1088,31436,2756,11406.0,1,11323018,35,2,0,4,0,0,19,3,0,0,0,1,2,2,2,0,0,0,0,0,0,0,0,479,332,0,0,0,0,1
63,14,5239,5239,1567,639,928,2062,40,132,162,197,1294,237,516,47,0.578,-36.0606,-7.7415,518.6663,32,65,305.988,0,5473,3,3994,1383,6694,25987,38057,900,38957,5475,7115.0,1,13445071,37,1,0,5,0,0,16,1,0,0,0,1,2,6,1,0,3,0,0,0,0,0,0,297,274,0,0,0,0,1
82,14,2366,2366,676,353,323,1231,22,113,119,128,714,135,255,170,0.548,-36.0061,-6.9047,444.7022,72,17,220.248,0,2488,3,1417,675,4344,16145,22580,574,23154,2488,9306.0,1,12540881,18,0,0,1,0,0,3,0,0,0,0,1,2,1,0,0,10,0,0,0,0,0,0,198,104,0,0,0,0,1
101,9,11063,11063,2632,1118,1515,3154,31,204,316,399,1872,332,2045,4230,0.549,-45.4647,-4.1623,55.1921,302,44,524.373,0,8066,3,16154,2119,11951,30369,60594,1600,62194,10210,6091.0,1,21254157,22,0,0,0,0,0,18,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,196,1008,0,0,0,0,1


In [9]:
# Split the data into features and target
X = df.drop(columns=['IDHM'])
y = df['IDHM']

In [10]:
# Normalizing the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)

# Show new data in a DataFrame
X = pd.DataFrame(X, columns=df.columns[:-1])
X.head()

Unnamed: 0,STATE,IBGE_RES_POP,IBGE_RES_POP_BRAS,IBGE_DU,IBGE_DU_URBAN,IBGE_DU_RURAL,IBGE_POP,IBGE_1,IBGE_1-4,IBGE_5-9,IBGE_10-14,IBGE_15-59,IBGE_60+,IBGE_PLANTED_AREA,IBGE_CROP_PRODUCTION_$,IDHM,LONG,LAT,ALT,PAY_TV,FIXED_PHONES,AREA,CATEGORIA_TUR,ESTIMATED_POP,RURAL_URBAN,GVA_AGROPEC,GVA_INDUSTRY,GVA_SERVICES,GVA_PUBLIC,GVA_TOTAL,TAXES,GDP,POP_GDP,GDP_CAPITA,GVA_MAIN,MUN_EXPENDIT,COMP_TOT,COMP_A,COMP_B,COMP_C,COMP_E,COMP_F,COMP_G,COMP_H,COMP_I,COMP_L,COMP_M,COMP_N,COMP_O,COMP_P,COMP_Q,COMP_R,COMP_S,HOTELS,BEDS,Pr_Agencies,Pu_Agencies,Pr_Bank,Pu_Bank,Cars,Motorcycles,Wheeled_tractor,UBER,MAC,WAL-MART
0,0.291441,0.110396,0.110524,0.407495,1.49982,-0.576276,1.138288,0.471102,0.471377,0.416617,0.979784,1.08514,2.587282,0.338767,0.401446,-0.280723,0.639919,-0.160272,1.141667,0.510078,-0.839646,-0.602987,0.062797,0.0,-0.783839,-0.209622,-0.552694,-0.48927,-0.721552,0.26286,-0.686274,0.015759,-0.968216,0.0,0.575906,0.917817,-0.60457,0.0,-0.133831,0.0,2.075313,0.742579,-0.625306,-0.562115,0.0,-0.667779,-1.309598,-0.33347,-0.094613,1.29766,0.0,1.414596,0.0,0.0,0.0,0.0,0.0,0.0,0.243768,2.628548,0.0,0.0,0.0,0.0,0.0
1,-0.558657,-0.909955,-0.911084,-0.745302,-0.792999,-0.401853,-0.912512,-0.9819,-1.117106,-1.032692,-0.921484,-0.854253,-0.671461,-0.689207,-0.550853,-0.739979,-2.603997,3.741002,1.381092,0.834479,-0.676344,1.801486,-0.9701,0.0,-0.309815,0.022471,0.089869,-0.936456,-0.559949,-0.368967,-0.561117,-0.939271,0.860037,0.0,0.412555,0.585786,2.548999,0.0,1.756127,0.0,-0.699759,0.407985,2.203227,-0.562115,0.0,-0.667779,0.027716,-0.33347,0.635798,3.125751,0.0,-1.158339,0.0,0.0,0.0,0.0,0.0,0.0,1.682946,-0.652881,0.0,0.0,0.0,0.0,0.0
2,0.008075,0.170618,0.170745,0.46156,0.063693,0.567937,-0.120961,0.20692,-0.192166,-0.34501,-0.251642,-0.044304,-0.127112,-0.477005,-0.547977,1.250656,0.254427,1.017884,-0.449807,0.250557,-0.193975,-0.602987,0.198981,0.0,-0.471802,-0.257111,-0.467414,0.620107,-0.040651,-0.614896,-0.070087,0.183394,-0.545379,0.0,0.699047,0.751802,0.972214,0.0,2.386113,0.0,-0.699759,0.073391,0.317538,-0.562115,0.0,-0.667779,0.027716,-0.33347,3.557443,1.29766,0.0,-0.300694,0.0,0.0,0.0,0.0,0.0,0.0,0.35335,-0.876003,0.0,0.0,0.0,0.0,0.0
3,0.008075,-1.056452,-1.056296,-1.092362,-0.827267,-0.839073,-0.804013,-0.585627,-0.383186,-0.66297,-0.718501,-0.857055,-0.877432,-0.587553,-0.529358,1.263304,0.420192,0.680427,0.113546,-0.787528,-0.47993,-0.602987,-1.05181,0.0,-0.871265,-0.73066,-1.163278,-0.765664,-1.083221,-1.041346,-1.101831,-1.049927,0.172231,0.0,0.576975,-0.825348,-0.60457,0.0,-0.133831,0.0,-0.699759,-1.376516,-0.625306,-0.562115,0.0,-0.667779,0.027716,-0.33347,-0.094613,-0.530431,0.0,1.700478,0.0,0.0,0.0,0.0,0.0,0.0,-0.369892,-1.529981,0.0,0.0,0.0,0.0,0.0
4,-0.70034,2.658072,2.658139,2.318941,1.555895,1.933086,0.776624,-0.189353,0.5317,0.793733,1.115106,0.765644,0.571716,0.170615,0.085224,-0.931814,0.963447,-1.096696,3.352829,-0.203605,0.534372,-0.602987,1.285513,0.0,1.413131,0.235167,1.089248,1.2371,1.4775,0.300795,1.447007,2.138458,-0.880766,0.0,1.753327,-0.493316,-0.60457,0.0,-0.763817,0.0,-0.699759,0.296454,-0.625306,-0.562115,0.0,-0.667779,-1.309598,-0.33347,-0.825024,-0.530431,0.0,-0.586576,0.0,0.0,0.0,0.0,0.0,0.0,-0.384503,1.947642,0.0,0.0,0.0,0.0,0.0


In [11]:
# Calculating PCA
from sklearn.decomposition import PCA

n_components = 6
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)

# Seeing pca results
explained_variance = pca.explained_variance_ratio_
explained_variance

array([0.33023695, 0.09235606, 0.07334953, 0.06688054, 0.04159309,
       0.03526826])

In [12]:
# Separating the data using cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

# Define the model
model = LinearRegression()

# Perform cross-validation
scores = cross_val_score(model, X_pca, y, cv=7)  # 5-fold cross-validation

# Print the cross-validation scores
print("Cross-validation scores:", scores)
print("Mean cross-validation score:", scores.mean())

Cross-validation scores: [0.44563968 0.34987063 0.54646844 0.50556823 0.61533868 0.22208942
 0.4984617 ]
Mean cross-validation score: 0.4547766820697001


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

In [14]:
# Split dados

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [16]:
# Normalização de Escalas 

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [31]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Knn regressão
knn = KNeighborsRegressor(n_neighbors=7)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

# Avaliar o desempenho
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Erro Médio Absoluto (MAE): {mae:.2f}")
print(f"Erro Quadrático Médio (MSE): {mse:.2f}")
print(f"Coeficiente de Determinação (R²): {r2:.2f}")




Erro Médio Absoluto (MAE): 0.02
Erro Quadrático Médio (MSE): 0.00
Coeficiente de Determinação (R²): 0.61
