In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Predicting test scores of students
We will train a machine learning model to predict students' test scores from the other features in the dataset.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error

## Loading the Dataset
We will use the `student_id` as the index of the dataset. 

In [None]:
# Read the test-score CSV and use the `student_id` column as the row index.
df = pd.read_csv(
    '/kaggle/input/predict-test-scores-of-students/test_scores.csv',
    index_col='student_id',
)

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Summarise the frame: column dtypes and non-null counts.
df.info()

In [None]:
# Number of distinct values in each column (used below to choose an encoding per column).
df.nunique()

In [None]:
# Preview only the first rows of the string-typed (object) columns instead of
# rendering the whole frame as cell output.
df.select_dtypes(['object']).head()

# Exploratory Data Analysis 

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were removed in 3.8, so pick whichever pastel style this
# installation provides and fall back to the default style if neither exists.
pastel_style = next(
    (name for name in ('seaborn-v0_8-pastel', 'seaborn-pastel') if name in plt.style.available),
    'default',
)
plt.style.use(pastel_style)

# Distribution of the target variable; the trailing semicolon hides the repr of `.set()`.
sns.histplot(df.posttest).set(title='Posttest Distribution');

In [None]:
# Average posttest score per gender. groupby sorts the group labels, so the
# colours are assigned to the bars in that sorted order.
# The title previously contained a stray Greek tau ("posτtest"); fixed to "posttest".
df.groupby(by='gender')['posttest'].mean().plot(kind='bar', title='Mean posttest results by gender', color=['pink', 'lightblue'])

In [None]:
# Average posttest score for each teaching method, drawn as a bar chart.
mean_by_teaching_method = df.groupby('teaching_method')['posttest'].mean()
mean_by_teaching_method.plot.bar(
    title='Mean posttest by teaching method',
    color=['pink', 'lightblue'],
)


In [None]:
# Average posttest score for each school setting, drawn as a bar chart.
mean_by_school_setting = df.groupby('school_setting')['posttest'].mean()
mean_by_school_setting.plot.bar(
    title='Mean posttest by school setting',
    color=['lightgreen', 'pink', 'lightblue'],
)

In [None]:
# Line plot of the average posttest score against class size (`n_student`).
mean_by_class_size = df.groupby('n_student')['posttest'].mean()
mean_by_class_size.plot.line(title='Mean posttest by number of students in class')

In [None]:

# Same aggregation as the line plot, shown as a scatter: one point per class size.
class_size_means = df.groupby('n_student')['posttest'].mean().reset_index()
class_size_means.plot.scatter(
    x='n_student',
    y='posttest',
    title='Mean posttest by number of students in class',
    color='salmon',
)


# Data Preprocessing

In [None]:
# Count the missing entries in every column.
df.isna().sum(axis=0)

There are no missing values, so no imputation is needed.

## Categorical Features
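We will split the `object` columns into `one_hot_features` and `ordinal_encoding_features` based on the number of unique values they contain: low-cardinality columns are one-hot encoded, and high-cardinality columns are ordinal encoded.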

In [None]:
# Names of every non-object column in the full frame.
# NOTE(review): this is computed on `df`, so it still contains the target
# column if that column is numeric — confirm before using it as a feature list.
numerical_columns = df.select_dtypes(exclude=['object']).columns.tolist()
numerical_columns

In [None]:
# Object columns with fewer than 10 distinct values are one-hot encoded.
one_hot_features = [
    column
    for column, values in df.select_dtypes(['object']).items()
    if values.nunique() < 10
]
one_hot_features

In [None]:
# Object columns with 10 or more distinct values are ordinal encoded.
# The comparison is `>= 10` (not `> 10`) so that, together with the `< 10`
# rule used for the one-hot list, every object column lands in exactly one
# group; a column with exactly 10 values would otherwise be passed through
# to the regressor as raw strings.
ordinal_encoding_features = [
    column
    for column, values in df.select_dtypes(['object']).items()
    if values.nunique() >= 10
]
ordinal_encoding_features

Making the Preprocessor Pipeline 

In [None]:
# Column-wise preprocessing:
#   * low-cardinality object columns  -> one-hot vectors (unseen categories become all-zero rows)
#   * high-cardinality object columns -> integer codes
#   * every remaining column          -> passed through unchanged
# OrdinalEncoder does not accept handle_unknown='ignore'; its valid options are
# 'error' and 'use_encoded_value'. Unseen categories are mapped to -1, a code
# that cannot collide with the non-negative codes learned during fit.
preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot', OneHotEncoder(handle_unknown='ignore'), one_hot_features),
        (
            'ordinal_encoding',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            ordinal_encoding_features,
        ),
    ],
    remainder='passthrough',
)

Create the `model` Pipeline

In [None]:
# Full estimator: preprocessing followed by a random forest regressor.
# A fixed random_state makes the fitted forest (and therefore the scores and
# feature importances reported below) reproducible across runs.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42)),
])

# Feature and Target selection

In [None]:
# Features are every column except the target; the target is `posttest`.
X = df.drop(columns='posttest')
y = df['posttest']

# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# split identical on every run so the reported metrics are reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Fitting and Evaluation

In [None]:
# Fit the pipeline on the training split, then report its score on the held-out split.
# `fit` returns the fitted pipeline, so the two calls can be chained.
model.fit(x_train, y_train).score(x_test, y_test)

In [None]:
# Predictions of the fitted pipeline on the held-out test features.
model_preds = model.predict(x_test)

In [None]:
# Collect the evaluation metrics for the held-out split in one dictionary.
test_mae = mean_absolute_error(y_test, model_preds)
test_mse = mean_squared_error(y_test, model_preds)
test_r2 = model.score(x_test, y_test)

metrics = dict(Mean_Absolute_Error=test_mae, Mean_Squared_Error=test_mse)
metrics['R^2'] = test_r2
metrics

In [None]:
# Cross-validate the whole pipeline on the training split, using scikit-learn's
# default splitting strategy and the pipeline's own `score` method.
cross_val_score(estimator=model, X=x_train, y=y_train)

We will recover the transformed feature names from the fitted Pipeline so that they can be matched to the model's feature importances.

In [None]:
# Number of features the fitted forest saw (one importance value per transformed column).
model.named_steps['model'].feature_importances_.shape

In [None]:
# Names of the columns produced by the fitted one-hot encoder.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement, so use it when available and
# fall back to the old method on older installations.
# NOTE: this rebinds `one_hot_features` from input column names to the expanded
# encoder output names; re-run the notebook from the top before rebuilding
# the preprocessor.
fitted_one_hot = model.named_steps['preprocessor'].named_transformers_['one_hot']
if hasattr(fitted_one_hot, 'get_feature_names_out'):
    one_hot_features = list(fitted_one_hot.get_feature_names_out())
else:
    one_hot_features = list(fitted_one_hot.get_feature_names())

In [None]:
# Non-object columns of the training features; these are the columns the
# preprocessor passes through untouched.
numerical_columns = x_train.select_dtypes(exclude=['object']).columns.tolist()
numerical_columns

Because of how the `ColumnTransformer` is constructed, the output features are ordered as `one_hot_features`, then `ordinal_encoding_features`, and finally `numerical_columns` (the passthrough remainder).

In [None]:
# Concatenate the three name lists in the order the preprocessor emits its columns.
all_feature_names = [*one_hot_features, *ordinal_encoding_features, *numerical_columns]
features = np.array(all_feature_names)
features.shape

# Feature Importance 

In [None]:
# Plotting helper: horizontal bar chart of the most important features.
def plot_features(columns, importances, n=20):
    """Draw a horizontal bar chart of the top feature importances.

    Parameters
    ----------
    columns : sequence
        Feature names, one per importance value.
    importances : sequence
        Importance value for each entry of ``columns``.
    n : int, default 20
        Upper slice bound applied to the descending ranking, i.e. how many
        of the highest-ranked rows are plotted.

    Returns
    -------
    None
        The chart is drawn as a side effect through seaborn.
    """
    ranking = pd.DataFrame({"features": columns, "feature_importance": importances})
    ranking = ranking.sort_values("feature_importance", ascending=False)
    ranking = ranking.reset_index(drop=True)
    top_rows = ranking.iloc[:n]

    axis = sns.barplot(data=top_rows, x="feature_importance", y="features", orient="h")
    axis.set(title='Feature Importance')

In [None]:
# Plot the fitted forest's importances against the reconstructed feature names.
forest_importances = model.named_steps['model'].feature_importances_
plot_features(features, forest_importances)