### Imports

In [None]:
#!pip install pycaret
!pip install datacleaner

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 1000)
#import pycaret
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from datacleaner import autoclean
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from google.colab import drive
drive.mount('/content/drive')


### Loading data

In [None]:
# Load the training split from the mounted Google Drive folder.
# Alternative location, kept for reference:
#df_train= pd.read_csv('/content/drive/MyDrive/hackapucsp2023-enem/train.csv')
df_train= pd.read_csv('/content/drive/MyDrive/Data Science/hackathon/hackapucsp2023-enem/train.csv')

In [None]:
# Load the test split from the mounted Google Drive folder.
# Alternative location, kept for reference:
#df_test= pd.read_csv('/content/drive/MyDrive/test.csv')
df_test= pd.read_csv('/content/drive/MyDrive/Data Science/hackathon/hackapucsp2023-enem/test.csv')



### Análise Exploratória

In [None]:
# Show the column labels of the training frame.
df_train.columns

In [None]:
# Count the training rows for each value of SG_UF_ESC.
df_train['SG_UF_ESC'].value_counts()

In [None]:
# Mean final grade (MF) of the rows whose essay score is 900 or more.
df_train.loc[df_train['NU_NOTA_REDACAO'] >= 900, 'MF'].mean()

In [None]:
def piechart(x):
  """Show a pie chart of how the rows of ``x`` are distributed over ``SG_UF_ESC``.

  Args:
    x: DataFrame with an ``SG_UF_ESC`` column.

  The figure is displayed with ``fig.show()``; nothing is returned.
  """
  # Computed once and reused for both the slice sizes and the slice labels.
  contagem = x['SG_UF_ESC'].value_counts()
  fig = px.pie(x, values=contagem, names=contagem.index, title='distribuição por estado')
  fig.show()

In [None]:
# Highest income bracket: Q006 answer 'Q' (family income above R$22,000.00).
ricos = df_train[df_train['Q006'].isin(['Q'])]
# Lowest income brackets: Q006 answers 'A' or 'B' (family income of R$1,100.00 or less).
pobres = df_train[df_train['Q006'].isin(['A', 'B'])]


In [None]:
# Pie chart of the SG_UF_ESC distribution within the highest-income subset.
piechart(ricos)

In [None]:
# Pie chart of the SG_UF_ESC distribution within the lowest-income subset.
piechart(pobres)

In [None]:
# Bar chart with the number of training rows per SG_UF_ESC value, one colour per bar.
# The counts are computed once and reused for the x, y and colour arguments.
contagem_estados = df_train['SG_UF_ESC'].value_counts()
fig = px.bar(df_train, x=contagem_estados.index, y=contagem_estados, color=contagem_estados.index)
fig.show()

#### Diferença de médias entre pobres e ricos


In [None]:
# Mean final grade (MF) of the lowest-income subset.
pobres['MF'].mean()

In [None]:
# Mean final grade (MF) of the highest-income subset.
ricos['MF'].mean()

In [None]:
def bargraph(df, xlabel, ylabel):
  """Show a bar chart with one coloured bar per entry of a mapping.

  Args:
    df: mapping whose keys become the x positions (and colours) and whose
      values become the bar heights.
    xlabel: text shown for the x axis.
    ylabel: text shown for the y axis.

  Note: a later cell defines ``bargraph`` again, so that later definition is
  the one in effect once the whole notebook has run.
  """
  rotulos = {'x': xlabel, 'y': ylabel}
  chaves = df.keys()
  fig = px.bar(x=chaves, y=df.values(), color=chaves, labels=rotulos)
  fig.show()

In [None]:
# Mean final grade (MF) per state, keyed by the SG_UF_ESC value.
lista_de_estados= df_train['SG_UF_ESC'].unique()
# A single grouped pass replaces one boolean filter of the whole frame per state.
# sort=False keeps the states in order of first appearance, like iterating over unique().
# Rows with a missing SG_UF_ESC are left out instead of producing a NaN entry.
medias_estado = df_train.groupby('SG_UF_ESC', sort=False)['MF'].mean().to_dict()

In [None]:
# Order the (state, mean MF) pairs by ascending mean.
medias =sorted(medias_estado.items(), key=lambda x:x[1])

In [None]:
# Turn the sorted pairs back into a dict; insertion order keeps the ranking.
medias = dict(medias)

In [None]:
def bargraph(df, xlabel, ylabel):
  """Show a bar chart of a mapping on a fully transparent background.

  Args:
    df: mapping whose keys become the x positions (and colours) and whose
      values become the bar heights.
    xlabel: text shown for the x axis.
    ylabel: text shown for the y axis.

  The figure is displayed with ``fig.show()``; nothing is returned.
  """
  transparente = 'rgba(0, 0, 0, 0)'
  fig = px.bar(x=df.keys(), y=df.values(), color=df.keys(), labels={'x': xlabel, 'y': ylabel})
  # Both the plotting area and the surrounding paper are made transparent.
  fig.update_layout(plot_bgcolor=transparente, paper_bgcolor=transparente)
  fig.show()

In [None]:
# Bar chart of the mean MF per state, from the lowest to the highest mean.
bargraph(medias, 'ESTADOS','NOTA ENEM')

#### Média final (MF) pela renda

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of the final grade (MF) for each Q006 income bracket.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df_train, x="Q006", y="MF", ax=ax)
ax.set_title("Boxplot das notas pela renda")

In [None]:
# Display the training frame.
df_train

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Line plot of the final grade (MF) against the per-capita income.
# NOTE(review): the 'Renda per capita' column is only created further down, in the
# feature-engineering cell (df_train['Renda per capita'] = round(Q006 / Q005)); on a
# fresh kernel this cell has to run after that one.
plt.figure(figsize=(10, 8))
sns.lineplot(x="Renda per capita", y = "MF", data = df_train)
plt.title("Média final pela renda")
# Write the figure to media.png in the current working directory.
plt.savefig('media.png')

#### Quantidade de pessoas que não possuem internet em casa

In [None]:
# Count the answers to Q025 (internet access at home).
df_train['Q025'].value_counts()

Diferença das médias entre quem tem e quem não tem internet em casa

In [None]:
# Mean final grade (MF) by internet access: Q025 == 'A' means no internet, 'B' means internet.
sem_internet = df_train['Q025'] == 'A'
com_internet = df_train['Q025'] == 'B'

media_sem_internet = df_train.loc[sem_internet, 'MF'].mean()
media_com_internet = df_train.loc[com_internet, 'MF'].mean()

# Positive when candidates with internet score higher on average.
diferenca_medias_internet = media_com_internet - media_sem_internet

diferenca_medias_internet

In [None]:
def media_coluna(coluna_aspas_duplas, df=None):
  """Return the mean final grade (``MF``) for each distinct value of a column.

  Args:
    coluna_aspas_duplas: name of the column to group by.
    df: frame holding that column and ``MF``. Defaults to the notebook-level
      ``df_train`` so the existing one-argument calls keep working.

  Returns:
    A list of ``(value, mean MF)`` tuples sorted by ascending mean. Missing
    values of the column are not listed.
  """
  if df is None:
    df = df_train
  # One grouped pass replaces a boolean filter of the whole frame per value.
  medias = df.groupby(coluna_aspas_duplas)['MF'].mean()
  # value_counts() order is used as the pre-sort order so that ties keep it.
  pares = [(valor, medias.loc[valor]) for valor in df[coluna_aspas_duplas].value_counts().index]
  return sorted(pares, key=lambda par: par[1])

#### Quantidade de banheiros

In [None]:
media_coluna("Q008") # Mean MF per answer to Q008 (number of bathrooms in the candidate's home)

In [None]:
media_coluna("Q003") # Mean MF per answer to Q003 (father's occupation)

#### Escolaridade pai / mãe

In [None]:
media_coluna("Q001") # Mean MF per answer to Q001 (father's schooling)

In [None]:
media_coluna("Q002") # Mean MF per answer to Q002 (mother's schooling)

#### Pessoas por residência


In [None]:
# Mean final grade (MF) per number of people living in the household (Q005),
# ordered by ascending mean.
pessoas_por_residencia = dict(media_coluna("Q005"))
# The x axis shows household sizes, so it is labelled accordingly rather than as states.
bargraph(pessoas_por_residencia, 'pessoas na residência', 'nota enem')

In [None]:
# Overlaid density curves of the five ENEM scores, saved as a PNG.
color = "#2E3037"
background_color = "Dark"
score_targets = ["NU_NOTA_CN", "NU_NOTA_CH", "NU_NOTA_LC", "NU_NOTA_MT", "NU_NOTA_REDACAO"]
names = ["Ciências da Natureza", "Ciências Humanas", "Linguagens", "Matemática", "Redação"]

if background_color == "White":
    plt.style.use("default")
    plt.figure(figsize=(15, 8))
else:
    # Dark theme: paint both the figure and the axes with the custom colour.
    plt.style.use('dark_background')
    plt.figure(figsize=(15, 8), facecolor=color)
    ax = plt.gca()
    ax.set_facecolor(color)

# sns.distplot is deprecated; kdeplot draws the same filled density curve
# (no histogram) for each score column.
for col, name in zip(score_targets, names):
    sns.kdeplot(x=df_train[col], fill=True, linewidth=2, label=name)

plt.legend(names)
plt.xlabel("Nota")
plt.ylabel("Densidade")
plt.savefig("Densidade_notas_por_disciplina.png")

#### Divisão de renda





In [None]:
renda_familiar = media_coluna("Q006") # Mean MF per family-income bracket (Q006)

### Separando os dados em features e target

In [None]:
# Columns removed from both frames before modelling (see the drop(colunas_retirar, axis=1)
# calls below): the final grade, the per-exam scores/answers/keys, the essay scores,
# the row ID and the derived CONCEITO / DESEMPENHO labels.
colunas_retirar = ['MF','TP_PRESENCA_CN', 'TP_PRESENCA_CH',
       'TP_PRESENCA_LC', 'TP_PRESENCA_MT', 'CO_PROVA_CN', 'CO_PROVA_CH',
       'CO_PROVA_LC', 'CO_PROVA_MT', 'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC',
       'NU_NOTA_MT', 'TX_RESPOSTAS_CN', 'TX_RESPOSTAS_CH', 'TX_RESPOSTAS_LC',
       'TX_RESPOSTAS_MT', 'TX_GABARITO_CN', 'TX_GABARITO_CH',
       'TX_GABARITO_LC', 'TX_GABARITO_MT',
       'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3', 'NU_NOTA_COMP4',
       'NU_NOTA_COMP5', 'NU_NOTA_REDACAO', 'ID', 'CONCEITO', 'DESEMPENHO']

In [None]:
def Transformar_int(feature, df=None):
    """Cast one column of a frame to ``int`` in place.

    Args:
        feature: name of the column to convert.
        df: frame to modify. Defaults to the notebook-level ``df_train`` so the
            existing one-argument calls keep working; pass another frame
            (for example the test split) to reuse the same helper.

    Raises:
        ValueError: propagated from ``astype(int)`` when a value cannot be
            converted.
    """
    if df is None:
        df = df_train
    df[feature] = df[feature].astype(int)

In [None]:
# Replace the letter-coded answers by numeric strings, cast them to int and derive
# per-person features for the training frame.

# Q006 (family income): bracket letter -> amount in R$.
faixas_renda = {"A": "0", "B": "1100", "C": "1350", "D": "2000", "E": "2550", "F": "3100",
                "G": "3850", "H": "4950", "I": "5950", "J": "7150", "K": "8250", "L": "9350",
                "M": "10450", "N": "12100", "O": "14800", "P": "18750", "Q": "22000"}
# Q009, Q022 and Q024: answer letter -> quantity.
faixas_quantidade = {"A": "0", "B": "1", "C": "2", "D": "3", "E": "4"}

df_train['Q006'] = df_train['Q006'].replace(faixas_renda)
for coluna in ('Q009', 'Q024', 'Q022'):
    df_train[coluna] = df_train[coluna].replace(faixas_quantidade)

for coluna in ('Q005', 'Q006', 'Q009', 'Q022', 'Q024'):
    Transformar_int(coluna)

# Ratios by the number of people in the household (Q005), rounded to whole numbers.
df_train['Renda per capita'] = (df_train['Q006'] / df_train['Q005']).round()
#df_train['Pessoas por quarto']= round(df_train['Q005']/df_train['Q009'])
df_train['Computadores por pessoa'] = (df_train['Q024'] / df_train['Q005']).round()
df_train['Celulares por pessoa'] = (df_train['Q022'] / df_train['Q005']).round()

In [None]:
# Same preparation as for the training frame, applied to the test frame: replace the
# letter-coded answers by numeric strings, cast them to int and derive per-person features.

def Transformar_int_test(feature):
    """Cast one column of ``df_test`` to ``int`` in place."""
    df_test[feature] = df_test[feature].astype(int)

# Q006 (family income): bracket letter -> amount in R$.
faixas_renda_test = {"A": "0", "B": "1100", "C": "1350", "D": "2000", "E": "2550", "F": "3100",
                     "G": "3850", "H": "4950", "I": "5950", "J": "7150", "K": "8250", "L": "9350",
                     "M": "10450", "N": "12100", "O": "14800", "P": "18750", "Q": "22000"}
# Q009, Q022 and Q024: answer letter -> quantity.
faixas_quantidade_test = {"A": "0", "B": "1", "C": "2", "D": "3", "E": "4"}

df_test['Q006'] = df_test['Q006'].replace(faixas_renda_test)
for coluna in ('Q009', 'Q024', 'Q022'):
    df_test[coluna] = df_test[coluna].replace(faixas_quantidade_test)

for coluna in ('Q005', 'Q006', 'Q009', 'Q022', 'Q024'):
    Transformar_int_test(coluna)

# Ratios by the number of people in the household (Q005), rounded to whole numbers.
df_test['Renda per capita'] = (df_test['Q006'] / df_test['Q005']).round()
#df_test['Pessoas por quarto']= round(df_test['Q005']/df_test['Q009'])
df_test['Computadores por pessoa'] = (df_test['Q024'] / df_test['Q005']).round()
df_test['Celulares por pessoa'] = (df_test['Q022'] / df_test['Q005']).round()

In [None]:
# Training features: everything except the columns listed in colunas_retirar,
# passed through datacleaner's autoclean.
features = autoclean(df_train.drop(columns=colunas_retirar))

In [None]:
# Classification target: the DESEMPENHO label.
# NOTE(review): df_train['DESEMPENHO'] is assigned in a cell near the end of the notebook
# (built from MF with a 540 cut-off), so that cell must be executed before this one.
target = df_train['DESEMPENHO']
#target = le.fit_transform(target)

Adicionando o conceito no conjunto de teste


In [None]:
# Map each final grade (MF) of the test set to a letter concept:
#   MF <= 500 -> 'E', (500, 600] -> 'D', (600, 700] -> 'C', (700, 800] -> 'B', MF > 800 -> 'A'.
# pd.cut yields exactly one entry per row (NaN when MF is missing), so the list always
# has the same length as the frame; a chain of independent `if`s appends nothing for a
# missing MF and the assignment below would then fail on the length mismatch.
limites = [float('-inf'), 500, 600, 700, 800, float('inf')]
lista = pd.cut(df_test['MF'], bins=limites, labels=['E', 'D', 'C', 'B', 'A']).astype(object).tolist()

df_test['CONCEITO'] = lista

Dividindo o conjunto de teste em features e target

In [None]:
# Test features and target, built the same way as for the training frame.
# NOTE(review): autoclean is called independently on the training and the test features;
# if it encodes text columns per call, the same category could receive different codes in
# the two frames. datacleaner also offers autoclean_cv(train, test) - worth confirming.
x_test = autoclean(df_test.drop(columns=colunas_retirar))
y_test = df_test['DESEMPENHO']

In [None]:
# Label encoder: learn the class -> integer mapping from the training target only and
# reuse that same mapping for the test target, so a given class always gets the same code.
# (x_test was already cleaned when it was created, so it is not cleaned again here.)
le = preprocessing.LabelEncoder()

x_train = features
y_train = le.fit_transform(target)
y_test = le.transform(df_test['DESEMPENHO'])

### Standard scaler

In [None]:
# Standardise the training features; the fitted scaler keeps the training statistics.
scaler = StandardScaler()
X2 = scaler.fit_transform(x_train)

In [None]:
# Scale the test features with the statistics learned from the training features.
# The scaler must not be fitted again on the test set: that would put the two sets on
# different scales and leak test information into the preprocessing.
X3=scaler.transform(x_test)

#### Minmax

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scaling learned from the training features and applied unchanged to the test
# features. The instance gets its own name so that it does not shadow the MinMaxScaler class.
minmax_scaler = MinMaxScaler()
X_1= minmax_scaler.fit_transform(x_train)
X_2= minmax_scaler.transform(x_test)

### Treinando o modelo

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             precision_score, recall_score)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Alternative classifiers, kept for reference:
#clf = DecisionTreeClassifier()
#clf = RandomForestClassifier(max_depth=2, random_state=0)

# Gradient boosting with 100 depth-1 trees, trained on the prepared training split.
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
clf.fit(x_train, y_train)

# Predict the test split and keep the accuracy for the summary printed in a later cell.
y_pred = clf.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

# Per-class precision, recall and F1.
print(classification_report(y_test, y_pred))

In [None]:
# Macro-averaged precision and recall, printed together with the accuracy.
media_macro = {"average": "macro"}
precision = precision_score(y_test, y_pred, **media_macro)
recall = recall_score(y_test, y_pred, **media_macro)
print(f"Accuracy:{accuracy}, precision: {precision}, Recall: {recall} ")

0.5486037667919264


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the classifier on the test split, with rows and columns in the
# order of the classes known to the fitted model.
rotulos = clf.classes_
cm = confusion_matrix(y_test, y_pred, labels=rotulos)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rotulos)
disp.plot(values_format='')

In [None]:
# Side-by-side table of the real and the predicted labels (first 50 rows shown).
testemodel = y_test.copy()
teste_modelo = pd.DataFrame(testemodel, columns=['Real']).assign(**{'Previsão': y_pred})
#teste_modelo.drop(columns=['Class'], inplace=True)
teste_modelo.head(50)

STD = 0.5014063589928219

In [None]:
# Feature importances of the fitted classifier, most important first.
d = dict(zip(features.columns, clf.feature_importances_))
sorted(d.items(), key=lambda par: par[1], reverse=True)

### Feature selection

In [None]:
# Inspect the encoded training target.
y_train

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination with a logistic regression: keep 20 features,
# discarding 5 per iteration.
selecao_atributos = {}
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=20, step=5, verbose=5)
rfe_selector.fit(features, y_train)
rfe_support = rfe_selector.get_support()
# The mask is applied to `features`, the frame the selector was fitted on. `x_train` is
# overwritten by later cells, so indexing it here would fail when this cell is re-run.
rfe_feature_1 = features.loc[:,rfe_support].columns.tolist()

In [None]:
# Store the RFE-selected column names under the 'RFE' key.
selecao_atributos['RFE'] = rfe_feature_1

In [None]:
# Show the selected column names.
selecao_atributos['RFE']

In [None]:
# Keep only the RFE-selected columns for training. They are taken from `features`
# (the full cleaned training frame) so that re-running this cell always gives the same
# result, whatever `x_train` currently holds.
x_train = features[selecao_atributos['RFE']]
x_train

In [None]:
# Keep the same RFE-selected columns in the test features.
x_test = x_test[selecao_atributos['RFE']]

### PCA

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
fit = pca.fit(features)

x_train = fit.transform(features)

In [None]:
#pca no teste

fit = pca.fit(x_test)
x_test = fit.transform(x_test)

In [None]:
# Inspect the training features after the PCA projection.
x_train

### mlp


In [None]:
from sklearn.neural_network import MLPClassifier
# Small multi-layer perceptron (hidden layers of 4 and 3 units, tanh activation).
# random_state fixes the weight initialisation so that a re-run gives the same model.
mlp = MLPClassifier(hidden_layer_sizes=(4,3), max_iter=500, activation='tanh', random_state=0)
mlp.fit(x_train, y_train)

In [None]:
# Predict the test labels with the fitted MLP.
pred = mlp.predict(x_test)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 of the MLP predictions on the test set.
print(classification_report(y_test,pred))

### testes

In [None]:
# Inspect the current training features.
x_train

In [None]:
# Hand-picked subset of columns to keep.
variaveis_manter = ['TP_COR_RACA', 'Q001', 'Q002', 'Q003', 'TP_SEXO', 'TP_ST_CONCLUSAO', 'CO_UF_ESC', 'TP_LINGUA', 'Q006', 'TP_ESCOLA']

In [None]:
# Keep only the hand-picked columns. Several columns are selected by passing a list of
# labels; bare comma-separated labels inside the brackets are read as a single tuple key
# and raise KeyError.
df = df_train[variaveis_manter]

In [None]:
# Overall mean of the final grade (MF) in the training set.
df_train['MF'].mean()

In [None]:
# Binary performance label from the final grade of the training set:
# 1 when MF >= 540 ('Desempenho Razoavel'), 0 otherwise ('Baixo Desempenho').
# The vectorised comparison yields exactly one label per row; a missing MF compares
# False and is therefore labelled 0, so the list always matches the frame's length.
lista = (df_train['MF'] >= 540).astype(int).tolist()

In [None]:
# Attach the binary labels built in the previous cell as the DESEMPENHO column.
# NOTE(review): the modelling cells earlier in the notebook read df_train['DESEMPENHO'],
# so on a fresh kernel this cell has to run before them.
df_train['DESEMPENHO'] = lista

In [None]:
# Binary performance label from the final grade of the test set:
# 1 when MF >= 540 ('Desempenho Razoavel'), 0 otherwise ('Baixo Desempenho').
# The vectorised comparison yields exactly one label per row; a missing MF compares
# False and is therefore labelled 0, so the list always matches the frame's length.
lista = (df_test['MF'] >= 540).astype(int).tolist()

In [None]:
# Attach the binary labels built in the previous cell as the DESEMPENHO column.
# NOTE(review): the modelling cells earlier in the notebook read df_test['DESEMPENHO'],
# so on a fresh kernel this cell has to run before them.
df_test['DESEMPENHO'] = lista

In [None]:
# Display the training frame, now including the DESEMPENHO column.
df_train