In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import statsmodels.api as sm

In [None]:
# Load the dataset from a CSV file located relative to the working directory.
dados = pd.read_csv('Student_Performance.csv')

In [None]:
# Preview the first rows to check that the file was parsed as expected.
dados.head()

In [None]:
# Descriptive statistics of the dataset, rounded to two decimal places.
dados.describe().round(2)

In [None]:
# Column dtypes and non-null counts.
dados.info()

In [None]:
# Encode the Yes/No column as the integers 1/0 so it can be used as a regressor.
# A whole-value mapping is used instead of substring replacement; the 1/0 keys
# make the cell idempotent (re-running it on the already-converted column is a no-op).
# Any unexpected value maps to NaN and makes astype(int) raise instead of being
# silently kept.
codigos_atividade = {'Yes': 1, 'No': 0, 1: 1, 0: 0}

dados['Extracurricular Activities'] = (
    dados['Extracurricular Activities']
    .map(codigos_atividade)
    .astype(int)
)

In [None]:
# Re-check the dtypes after 'Extracurricular Activities' was converted to integers.
dados.info()

In [None]:
# Preview the first rows again, now with the encoded column.
dados.head()

In [None]:
# Box plot of the target variable ('Performance Index').
ax = sns.boxplot(data = dados['Performance Index'])

In [None]:
# Distribution of the target variable: density-normalised histogram with a KDE
# overlay. `sns.distplot` is deprecated, so the equivalent `histplot` call is used
# (stat='density' and cut=3 reproduce distplot's defaults).
ax = sns.histplot(x=dados['Performance Index'], kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# One scatter panel per explanatory variable, each plotted against the target.
variaveis_explicativas = [
    'Hours Studied',
    'Previous Scores',
    'Extracurricular Activities',
    'Sleep Hours',
    'Sample Question Papers Practiced',
]
ax = sns.pairplot(dados, x_vars=variaveis_explicativas, y_vars='Performance Index')

In [None]:
# Explanatory variables and target for the linear model (original scale).
colunas_modelo = [
    'Hours Studied',
    'Previous Scores',
    'Extracurricular Activities',
    'Sleep Hours',
    'Sample Question Papers Practiced',
]
X = dados[colunas_modelo]
y = dados['Performance Index']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2811,
)

# Fit a linear regression on the training portion.
modelo = LinearRegression()
modelo.fit(X_train, y_train)

In [None]:
# Informal check: R² of the fitted model on the training data.
r2_treino = modelo.score(X_train, y_train)
print(f'R²: {r2_treino:.2f}')

R²: 0.99


In [None]:
# Predictions of the fitted model for the held-out test rows.
y_previsto = modelo.predict(X_test)

In [None]:
# More informal checks: error metrics on the test split.
# The MSE is computed once and reused for the RMSE. `np.round` is used instead of
# the `.round()` method so the cell works whether scikit-learn returns NumPy
# scalars or plain Python floats (which have no `.round` method).
eqm_bruto = metrics.mean_squared_error(y_test, y_previsto)

EQM = np.round(eqm_bruto, 2)            # mean squared error
REQM = np.round(np.sqrt(eqm_bruto), 2)  # root mean squared error
R2 = np.round(metrics.r2_score(y_test, y_previsto), 2)

pd.DataFrame([EQM, REQM, R2], index=['EQM', 'REQM', 'R²'], columns=['Métricas'])

In [None]:
# Descriptive statistics again, this time rounded to one decimal place.
dados.describe().round(1)

In [None]:
# The informal checks gave very good results; now move on to the formal test.
#
# Each entry maps a new log column to (source column, offset added before the log).
# The offset of 1 is applied to the two columns the original code shifted by +1
# before taking the log.
colunas_log = {
    'log_Performance': ('Performance Index', 0),
    'log_Previous Scores': ('Previous Scores', 0),
    'log_Extracurricular Activities': ('Extracurricular Activities', 1),
    'log_Sleep Hours': ('Sleep Hours', 0),
    'log_Question Practiced': ('Sample Question Papers Practiced', 1),
}

for coluna_destino, (coluna_origem, deslocamento) in colunas_log.items():
    dados[coluna_destino] = np.log(dados[coluna_origem] + deslocamento)

In [None]:
# Log-scale regressors and target for the second model.
# NOTE(review): 'Hours Studied' is part of the linear model above but has no log
# counterpart here — confirm the omission is intentional.
regressores_log = [
    'log_Previous Scores',
    'log_Extracurricular Activities',
    'log_Sleep Hours',
    'log_Question Practiced',
]
X_log = dados[regressores_log]
y_log = dados['log_Performance']

# Same test fraction and seed as the split used for the linear model.
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(
    X_log,
    y_log,
    test_size=0.3,
    random_state=2811,
)

# The existing estimator object is refit in place: from this cell on, `modelo`
# holds the log-scale fit, not the original linear one.
modelo.fit(X_train_log, y_train_log)
y_previsto_log = modelo.predict(X_test_log)

In [None]:
# Add an explicit constant column so the OLS fit includes an intercept.
X_train_com_const = sm.add_constant(X_train_log)

regressao_ols = sm.OLS(y_train_log, X_train_com_const, hasconst=True)
modelo_statsmodels = regressao_ols.fit()

# Show the model summary
print(modelo_statsmodels.summary())

In [None]:
# Take the first test row (kept as a one-row DataFrame) and predict it.
entrada = X_test_log.iloc[:1]

# The model predicts the log of the target; exponentiate to undo the log.
previsao_log = modelo.predict(entrada)[0]
np.exp(previsao_log)

In [None]:
# Show the full original row for the sample held in `entrada`, for comparison with
# the prediction. The row label is taken from `entrada` rather than hard-coded, so
# it stays correct if the split seed, the test size or the data change.
# NOTE(review): the previous literal 750 is presumed to be that same row — confirm.
dados.loc[entrada.index]

In [None]:
# Table of fitted parameters: the intercept is exponentiated, the slope
# coefficients are shown as fitted.
index = [
    'Intercepto',
    'Previous Scores',
    'Extracurricular Activities',
    'Sleep Hours',
    'Question Practiced',
]
intercepto_exp = np.exp(modelo.intercept_)
valores_parametros = np.append(intercepto_exp, modelo.coef_)
pd.DataFrame(data=valores_parametros, index=index, columns=['Parâmetros'])

In [None]:
# Predicted vs. actual values for the log model's test split.
# Both axes are brought back to the original scale with exp(); the previous version
# plotted log-scale predictions against original-scale targets, so the two axes were
# not comparable. Axis labels now name the variable actually plotted.
previsao_escala_original = np.exp(y_previsto_log)
real_escala_original = np.exp(y_test_log)

ax = sns.scatterplot(x=previsao_escala_original, y=real_escala_original)
ax.figure.set_size_inches(12, 6)
ax.set_title('Previsão X Real', fontsize=18)
ax.set_xlabel('Performance Index - Previsão', fontsize=14)
ax.set_ylabel('Performance Index - Real', fontsize=14)
ax

In [None]:
# Residuals of the log model on the test split, computed as predicted − observed.
# NOTE(review): this is the opposite sign of the common observed − predicted
# convention — confirm it is intended.
residuos = y_previsto_log - y_test_log

In [None]:
# Residuals against the predictions (predictions converted back with exp()).
notas_previstas = np.exp(y_previsto_log)

ax = sns.scatterplot(x=notas_previstas, y=residuos, s=150)
figura = ax.get_figure()
figura.set_size_inches(20, 8)

ax.set_xlabel('Notas - Previsão', fontsize=14)
ax.set_ylabel('Resíduos', fontsize=14)
ax.set_title('Resíduos X Previsão', fontsize=18)
ax

In [None]:
# Squared residuals against the predictions (predictions converted back with exp()).
# Title and x label corrected: the y axis shows squared residuals, and the x axis
# shows predicted scores (the old "Kills" label did not describe this data).
ax = sns.scatterplot(x=np.exp(y_previsto_log), y=residuos**2, s = 150)
ax.figure.set_size_inches(20, 8)
ax.set_title('Resíduos² X Previsão', fontsize=18)
ax.set_xlabel('Notas - Previsão', fontsize=14)
ax.set_ylabel('Resíduos²', fontsize=14)
ax

In [None]:
# Distribution of the residuals: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated, so the equivalent `histplot` call is used
# (stat='density' and cut=3 reproduce distplot's defaults). The x label now
# describes the plotted quantity instead of the unrelated "log do Preço".
ax = sns.histplot(x=residuos, kde=True, stat='density', kde_kws={'cut': 3})
ax.figure.set_size_inches(12, 6)
ax.set_title('Distribuição de Frequências dos Resíduos', fontsize=18)
ax.set_xlabel('Resíduos (log do Performance Index)', fontsize=14)
ax