In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Problem definition:

We want to predict a student's post-test result.

<h2>Evaluation:</h2>

The actual post-test results are in the dataset; our model needs to be accurate when its predictions are compared with them.
 

<h2>Data Description:</h2>

* **school** - Name of the school the student is enrolled in.
* **school_setting** - The location of the school.
* **school_type** - The type of school. Either public or non-public.
* **classroom** - The type of classroom.
* **teaching_method** - Teaching methods: Either experimental or Standard.
* **n_student** - Number of students in the class.
* **student_id** - A unique ID for each student.
* **gender** - The gender of the students: male or female.
* **lunch** - Whether a student qualifies for free/subsidized lunch or not.
* **pretest** - The pretest score of the students out of 100.
* **posttest** - The posttest score of the students; this is the value we want to predict.

In [None]:
# Load the student test-score dataset from the Kaggle input directory
scores_csv_path = '/kaggle/input/predict-test-scores-of-students/test_scores.csv'
score_data = pd.read_csv(scores_csv_path)

# Preview the first rows
score_data.head()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike
score_data.describe(include='all')

In [None]:
# Per-student change between the two tests (posttest minus pretest)
score_results = score_data[['student_id', 'pretest', 'posttest']]
score_variation = score_results['posttest'].sub(score_results['pretest'])

# Summary statistics of the change
score_variation.describe()

Looking at the distribution below, it is possible to see an improvement between the pre-test and post-test scores.

In [None]:
# Compare the pretest and posttest score distributions on a single axis.
# The original call to sns.color_palette("tab10") was dropped: it only returns a
# palette object, and its result was discarded, so it never affected the plot.
plt.figure(figsize=(8,4))
plt.title('Score Variation')
g = sns.kdeplot(score_data['pretest'], color='orange')
h = sns.kdeplot(score_data['posttest'])
# Set the x label after plotting so the explicit label is the final one
plt.xlabel('Test Score')
# The legend covers both curves, so its title must not name only one of them;
# labels are listed in plotting order (pretest first, then posttest)
plt.legend(title='Test', labels=['Pre test','Post test'])

The post-test shows an increase in the mean score; let's explore which variables have the most impact on this improvement.

# School Setting analysis

In [None]:
# Pretest score distribution per school setting, split by teaching method
fig, ax = plt.subplots(figsize=(12,6))
ax.set_ylim(0,105)
sns.stripplot(
    data=score_data, x='school_setting', y='pretest', hue='teaching_method',
    jitter=0.25, dodge=True, palette='dark', ax=ax,
)
ax.set_ylabel('Test Score')
ax.set_xlabel('')
ax.set_title('Pre test Score comparison per School setting ')

In [None]:
# Posttest score distribution per school setting, split by teaching method
fig, ax = plt.subplots(figsize=(12,6))
ax.set_ylim(0,105)
sns.stripplot(
    data=score_data, x='school_setting', y='posttest', hue='teaching_method',
    alpha=0.45, jitter=0.25, dodge=True, palette='dark', ax=ax,
)
ax.set_ylabel('Test Score')
ax.set_xlabel('')
ax.set_title('Post test Score per School setting ')

In [None]:
# Comparison: overlay pretest (opaque) and posttest (translucent) scores on one axis
fig, ax = plt.subplots(figsize=(14,7))
ax.set_ylim(0,105)

# Arguments common to both layers
shared_kwargs = dict(data=score_data, x='school_setting', hue='teaching_method', dodge=True, ax=ax)
g = sns.stripplot(y='pretest', alpha=1, jitter=0.25, palette='dark', **shared_kwargs)
h = sns.stripplot(y='posttest', alpha=0.2, jitter=0.35, **shared_kwargs)

ax.set_ylabel('Test Score')
ax.set_xlabel('')
ax.legend(title='Method [Pre - Post]', loc=8)
ax.set_title('Test Score comparison per School setting ')

With this graph, we can see that all school settings showed improvements. Urban schools with the experimental methodology have a wider spread in the pre-test results.

# Socio-economic Analysis

In [None]:
# 'lunch' is the only socio-economic variable used here: pretest scores by lunch status, split by teaching method
sns.stripplot(x='lunch', y='pretest', hue='teaching_method', jitter=0.25, dodge=True, data=score_data, palette='dark')

In [None]:
# Posttest scores by lunch status, split by teaching method (semi-transparent markers)
sns.stripplot(x='lunch', y='posttest', hue='teaching_method', alpha=0.5, jitter=0.25, dodge=True, data=score_data, palette='dark')

In [None]:
# Overlay pretest (opaque) and posttest (translucent) scores by lunch status and teaching method
fig, ax = plt.subplots(figsize=(14,7))
ax.set_ylim(0,105)

# Arguments common to both layers
shared_kwargs = dict(data=score_data, x='lunch', hue='teaching_method', dodge=True, ax=ax)
sns.stripplot(y='pretest', jitter=0.25, palette='dark', **shared_kwargs)
sns.stripplot(y='posttest', alpha=0.5, jitter=0.35, **shared_kwargs)

Students who do not have a meal subsidy had better overall scores in both tests.

In [None]:
# Overlay pretest (opaque) and posttest (translucent) scores by school setting and school type
fig, ax = plt.subplots(figsize=(12,6))
ax.set_ylim(0,105)

# Arguments common to both layers
shared_kwargs = dict(data=score_data, x='school_setting', hue='school_type', dodge=True, ax=ax)
sns.stripplot(y='pretest', jitter=0.25, palette='dark', **shared_kwargs)
sns.stripplot(y='posttest', alpha=0.5, jitter=0.35, **shared_kwargs)

ax.set_ylabel('Test Score')
ax.set_xlabel('')
ax.set_title('Pre-post score comparison for type and setting')


In [None]:
# Overlay pretest (opaque) and posttest (translucent) scores by lunch status and school type
fig, ax = plt.subplots(figsize=(14,7))
ax.set_ylim(0,105)

# Arguments common to both layers
shared_kwargs = dict(data=score_data, x='lunch', hue='school_type', dodge=True, ax=ax)
sns.stripplot(y='pretest', jitter=0.25, palette='dark', **shared_kwargs)
sns.stripplot(y='posttest', alpha=0.5, jitter=0.35, **shared_kwargs)


In [None]:
# Overlay pretest (opaque) and posttest (translucent) scores by class size and school type
fig, ax = plt.subplots(figsize=(14,7))
ax.set_ylim(0,105)

# Arguments common to both layers
shared_kwargs = dict(data=score_data, x='n_student', hue='school_type', dodge=True, ax=ax)
sns.stripplot(y='pretest', jitter=0.25, palette='dark', **shared_kwargs)
sns.stripplot(y='posttest', alpha=0.5, jitter=0.35, **shared_kwargs)


Public schools have more students per class, and this can have an impact on the test scores.

In [None]:
# Overlay pretest (opaque) and posttest (translucent) scores by gender and school type
fig, ax = plt.subplots(figsize=(14,7))
ax.set_ylim(0,105)

# Arguments common to both layers
shared_kwargs = dict(data=score_data, x='gender', hue='school_type', dodge=True, ax=ax)
sns.stripplot(y='pretest', jitter=0.25, palette='dark', **shared_kwargs)
sns.stripplot(y='posttest', alpha=0.5, jitter=0.35, **shared_kwargs)

Gender does not appear to have an impact on the scores, since the distributions are similar for both genders.

# Feature selection:

Features that have an impact on the overall score:
* **School details**: Paid urban schools and public suburban schools present better post-test scores and a similar distribution.
* **Methodology**: Standard (traditional) teaching has a memorization-focused approach, while "Experimental" methodologies tend to focus on meaningful learning. Schools with experimental methodologies present a better score improvement.
* **Number of classmates in the same classroom**: Teachers with fewer students tend to help the class develop; crowded classes have worse results.
* **Socio-economic situation**: Students who do not qualify for a subsidized lunch have better results in both tests.


<h3>To watch:</h3>
- The pretest result may reveal how well the curriculum is aligned with the standardized test; further investigation could clarify this relationship.


In [None]:
# Candidate predictors plus both test scores, used for the correlation study
hypotesis_columns = [
    'school_setting', 'school_type', 'teaching_method',
    'n_student', 'lunch', 'pretest', 'posttest',
]
score_data_hypotesis = score_data[hypotesis_columns]

# Expand the selected columns into indicator (dummy) columns
score_data_dummies = pd.get_dummies(score_data_hypotesis)

In [None]:
# Pairwise correlation between all columns of the dummy-encoded frame
test_data_correlation = score_data_dummies.corr()


In [None]:
# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(14,7))
sns.heatmap(test_data_correlation, annot=True, ax=ax)

In [None]:
# Predictor columns for the models; 'posttest' is intentionally not included here
score_features = score_data[['school_setting','school_type','teaching_method','n_student','lunch','pretest']]

# Evaluation metrics

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score

In [None]:
def evaluating(y_act,y_pred):
    """Compute, print and return regression metrics for a set of predictions.

    Parameters
    ----------
    y_act : actual target values, passed as the first argument to each metric.
    y_pred : predicted values, passed as the second argument to each metric.

    Returns
    -------
    dict
        Unrounded metric values under the keys 'MAE', 'MSE' and 'r2'.
        The printed report shows the same values rounded to 3 decimals.
    """
    eval_values = {
        'MAE': mean_absolute_error(y_act, y_pred),
        'MSE': mean_squared_error(y_act, y_pred),
        'r2': r2_score(y_act, y_pred),
    }

    # Same positional arguments as before, so the printed layout is unchanged
    print(
        'Results:\n Mean absolute error = ', round(eval_values['MAE'], 3),
        '\n Mean squared error = ', round(eval_values['MSE'], 3),
        '\n R2 = ', round(eval_values['r2'], 3)
    )

    return eval_values


### Adjusting data

In [None]:
# Feature matrix: the selected predictors expanded into indicator (dummy) columns
X = pd.get_dummies(score_features)

# Target kept as a single-column DataFrame
Y = score_data[['posttest']]

X.head(3)

# Train-test split

In [None]:
# Hold out 35% of the rows for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size =0.35, random_state=5)

# Linear Regression

In [None]:
# Linear model trained on all features, including the pretest score
reg_linear = LinearRegression().fit(x_train, y_train)
y_pred = reg_linear.predict(x_test)

# Error metrics on the held-out test set
multi_eval = evaluating(y_test, y_pred)

# Note: this score is computed on the training data, not on the test set
reg_linear.score(x_train, y_train)

In [None]:
# Columns used for the "no pretest" experiment; 'pretest' is deliberately absent from this list
hypotesis_features = [
    'n_student',
    'school_setting_Rural', 'school_setting_Suburban', 'school_setting_Urban',
    'school_type_Non-public', 'school_type_Public',
    'teaching_method_Experimental', 'teaching_method_Standard',
    'lunch_Does not qualify', 'lunch_Qualifies for reduced/free lunch',
]

# Apply the same column selection to both splits
x_train_hypotesis = x_train[hypotesis_features]
x_test_hypotesis = x_test[hypotesis_features]

In [None]:
# Linear model trained without the pretest score
# (rebinds reg_linear, replacing the model fitted with the pretest feature)
reg_linear = LinearRegression().fit(x_train_hypotesis, y_train)
y_pred = reg_linear.predict(x_test_hypotesis)

# Error metrics on the held-out test set
multi_eval = evaluating(y_test, y_pred)

# Note: this score is computed on the training data, not on the test set
reg_linear.score(x_train_hypotesis, y_train)

# Bayesian Ridge Regression

In [None]:
# Bayesian ridge model trained on all features; the target is passed as a 1-D Series
bay_rid_reg = BayesianRidge().fit(x_train, y_train['posttest'])
y_pred = bay_rid_reg.predict(x_test)

# Error metrics on the held-out test set
multi_eval = evaluating(y_test, y_pred)

# Note: this score is computed on the training data, not on the test set
bay_rid_reg.score(x_train, y_train)

# Logistic Regression

In [None]:
# NOTE(review): LogisticRegression is a classifier, so each distinct posttest value
# is treated as a class label here rather than as a point on a continuous scale.
# Confirm this is intended before comparing it with the regressors above.
log_reg = LogisticRegression(solver='liblinear').fit(x_train, y_train['posttest'])
y_log_pred = log_reg.predict(x_test)

# Same regression metrics as for the other models, computed on the predicted labels
log_eval = evaluating(y_test, y_log_pred)

# NOTE(review): for a classifier, score() reports mean accuracy (not R2), and it is
# computed on the training data here
log_reg.score(x_train, y_train)

# Conclusion

- The results show that students who do not receive any type of lunch subsidy have better grades.
- Public schools have more students per classroom.
- Gender does not have an impact on the test score.
- The linear regression model has a high accuracy for this dataset, since the pre-test score is strongly correlated with the post-test score.
