# Classificação com Adult set


## Importando Pacotes e settings

In [None]:
import pandas as pd
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Set the default matplotlib figure size used by the plots in this notebook
plt.rcParams["figure.figsize"] = (10, 6)

## Coletando as bases de dados de treino e teste

In [None]:
# Both CSV files are read with the same options: '?' is treated as a missing
# value and the first column is used as the row index.
read_options = dict(na_values='?', index_col=0)
adult_train = pd.read_csv("/kaggle/input/adult-pmr3508/train_data.csv", **read_options)
adult_test = pd.read_csv("/kaggle/input/adult-pmr3508/test_data.csv", **read_options)

## Visualizando algumas informações do Dataset

In [None]:
# Show the shape (rows, columns) of the training data
adult_train.shape

In [None]:
# Show summary information about the training features
adult_train.info()

In [None]:
# Show the shape (rows, columns) of the test data
adult_test.shape

In [None]:
# Show the first rows of the training data
adult_train.head()

In [None]:
# Show the last rows of the training data
adult_train.tail()

In [None]:
# Recode the income target for easier handling: "<=50K" -> 0, anything else -> 1.
#
# The guard makes this cell safe to re-run: once the column is numeric, the
# comparison with "<=50K" would be False everywhere and an unguarded re-run
# would silently overwrite every label with 1.
if not pd.api.types.is_numeric_dtype(adult_train["income"]):
    adult_train["income"] = np.where(adult_train["income"] == "<=50K", 0, 1)

In [None]:
# Checking the change
adult_train.head()

## Verificação de dados faltantes

In [None]:
# Count missing values per column in the training data
adult_train.isnull().sum()

In [None]:
# Count missing values per column in the test data
adult_test.isnull().sum()

Os dados faltantes identificados acima serão tratados mais adiante, após a análise exploratória.

## Análise exploratória dos dados

Gráfico de correlação para analisar a relação entre as features

In [None]:
# Correlation heatmap between the numeric features (including the 0/1 income).
# The frame still holds string columns at this point, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0, so the correlation is computed
# on the numeric columns only.
plt.figure(figsize = (20,10))
numeric_corr = adult_train.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, cmap='BuPu', annot = True, fmt = '.2f')
plt.show()

Nota-se que, dentre as variáveis numéricas, a coluna "fnlwgt" praticamente não apresenta correlação com income; portanto, ela será retirada.

In [None]:
# Remove the 'fnlwgt' column from the training data.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second execution no longer raises KeyError.
adult_train = adult_train.drop(columns='fnlwgt', errors='ignore')

In [None]:
# Checking the change
adult_train.head()

In [None]:
# Remove the 'fnlwgt' column from the test data.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second execution no longer raises KeyError.
adult_test = adult_test.drop(columns='fnlwgt', errors='ignore')

In [None]:
# Bar chart with the number of people in each income class
income_counts = adult_train["income"].value_counts()
_ = income_counts.plot.bar(color = "purple")
_ = plt.title("N° de pessoas por tipo de income")

### Analisando relações entre algumas variáveis e o income

In [None]:
# Draw the same bar catplot (feature on y, income on x) for every feature
# inspected in this section. A single loop replaces nine copy-pasted cells
# that differed only in the feature name.
income_related_features = [
    "age",
    "sex",
    "workclass",
    "education",
    "marital.status",
    "occupation",
    "relationship",
    "race",
    "hours.per.week",
]
for feature in income_related_features:
    sns.catplot(y=feature, x="income", kind="bar", data=adult_train)

Tratando dados faltantes

In [None]:
# Training data: rows with missing values are simply discarded.
nadult_train = adult_train.dropna()

# Test data: rows must NOT be dropped, otherwise the final submission would
# have no prediction for those Ids. Missing entries are instead filled with
# the most frequent value of the same column in the cleaned training data.
missing_cols = [col for col in adult_test.columns if adult_test[col].isna().any()]
fill_values = {col: nadult_train[col].mode().iloc[0] for col in missing_cols}
nadult_test = adult_test.fillna(value=fill_values) if fill_values else adult_test.copy()

In [None]:
# Checking the change: missing values per column after cleaning the training data
nadult_train.isnull().sum()

## Classificador

Seleção de atributos

Transformando dados não numéricos para numéricos

In [None]:
# Convert the non-numeric columns to integer codes.
#
# One encoder per column is shared by train and test, so a given category
# always receives the same code in both frames. Fitting separate encoders on
# each frame can assign different codes to the same category. Columns that
# are already numeric are left untouched instead of being replaced by
# frame-specific rank codes.
numadult_train = nadult_train.copy()
numadult_test = nadult_test.copy()
for col in nadult_train.select_dtypes(exclude="number").columns:
    in_test = col in nadult_test.columns
    # The label vocabulary is built from both frames so that a category
    # present only in the test data does not make transform() fail.
    labels = pd.concat([nadult_train[col], nadult_test[col]]) if in_test else nadult_train[col]
    encoder = preprocessing.LabelEncoder().fit(labels)
    numadult_train[col] = encoder.transform(nadult_train[col])
    if in_test:
        numadult_test[col] = encoder.transform(nadult_test[col])

In [None]:
# Features used as model input, and the income column used as the target
train_feature_names = [
    "age",
    "workclass",
    "education.num",
    "marital.status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital.gain",
    "capital.loss",
    "hours.per.week",
]
Xadult_train = numadult_train[train_feature_names]
Yadult_train = numadult_train["income"]

In [None]:
# Select from the test data the same feature columns used for training
test_feature_names = [
    "age",
    "workclass",
    "education.num",
    "marital.status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital.gain",
    "capital.loss",
    "hours.per.week",
]
Xadult_test = numadult_test[test_feature_names]

In [None]:
# Show the first rows of the selected training features
Xadult_train.head()

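Os mesmos atributos foram selecionados acima para os dados de teste. Em seguida, busca-se o melhor valor de k (de 10 a 30) por validação cruzada com 5 folds nos dados de treino.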

In [None]:
# Evaluate KNN for every k from 10 to 30 (inclusive) with 5-fold
# cross-validation on the training data, printing the mean accuracy per k.
for k in range(10, 31):
    knn = KNeighborsClassifier(n_neighbors=k)
    fold_scores = cross_val_score(
        knn,
        Xadult_train,
        Yadult_train,
        cv=5,
        scoring='accuracy',
    )
    mean_score = fold_scores.mean()
    print(f"k = {k}, acc = {mean_score}")

Dentre os valores testados, o melhor k encontrado foi 26, com 83,338% de acurácia; portanto, é o que será usado.

In [None]:
# Final classifier, using the k = 26 reported as best in the note above
knn = KNeighborsClassifier(n_neighbors = 26)

In [None]:
# Fit the classifier on the selected training features and the income target
knn.fit(Xadult_train, Yadult_train)

In [None]:
# Show the first rows of the training features
Xadult_train.head()

In [None]:
# Show the first rows of the test features
Xadult_test.head()

In [None]:
# Show the last rows of the test features
Xadult_test.tail()

Prevendo os valores do income da base de teste

In [None]:
# Predict the income class for every row of the test features and display the result
YPrediction = knn.predict(Xadult_test)
YPrediction

Ajustando o dataframe

In [None]:
# Turn the numeric predictions back into the original labels
# (0 -> '<=50K', anything else -> '>50K').
incomes = ['<=50K' if predicted == 0 else '>50K' for predicted in YPrediction]

# Submission frame: one row per test row, with the test index as 'Id'
Adultsubmission = pd.DataFrame({
    'Id': Xadult_test.index,
    'income': incomes,
})
Adultsubmission.head()

In [None]:
# Show the last rows of the submission frame
Adultsubmission.tail()

In [None]:
# Save the submission as CSV, without writing the DataFrame index
Adultsubmission.to_csv('Adultsubmission.csv', index = False)