# A.) Data Visualization and Clustering

# Introduction to the dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

The dataset contains performance data for students from two high schools, gathered from a multiple-choice questionnaire.

1. school - student's school (GP or MS)
2. sex - student's gender (female or male)
3. age - student's age (15-22)
4. address - student's residence (rural or urban)
5. famsize - family size (greater than 3, or 3 or fewer)
6. Pstatus - parents' cohabitation status (together or apart)
7. Medu - mother's highest education level (0 - none, 1 - primary (4th grade), 2 - 5th-9th grade, 3 - high school, 4 - higher education)
8. Fedu - father's highest education level (0 - none, 1 - primary (4th grade), 2 - 5th-9th grade, 3 - high school, 4 - higher education)
9. Mjob - mother's job (teacher, health, public service, at home, other)
10. Fjob - father's job (teacher, health, public service, at home, other)
11. reason - reason for choosing the school (close to home, school reputation, course, other)
12. guardian - student's guardian (mother, father, other)
13. traveltime - travel time (1 - <15 minutes, 2 - 15 to 30 minutes, 3 - 0.5-1 hour, 4 - >1 hour)
14. studytime - weekly study time (1 - <2 hours, 2 - 2-5 hours, 3 - 5-10 hours, 4 - >10 hours)
15. failures - number of previous failures (n if 1 <= n < 3, otherwise 4)
16. schoolsup - extra educational support (yes or no)
17. famsup - family support (yes or no)
18. paid - paid extra classes in course subjects (yes or no)
19. activities - extracurricular activities (yes or no)
20. nursery - attended nursery school (yes or no)
21. higher - wants to attend higher education (yes or no)
22. internet - has internet at home (yes or no)
23. romantic - in a relationship (yes or no)
24. famrel - quality of family relationships (1 - very bad, 5 - excellent)
25. freetime - free time (1 - very little, 5 - very high)
26. goout - going out with friends (1 - very little, 5 - very high)
27. 28. Dalc, Walc - weekday and weekend alcohol consumption (1 - very little, 5 - very high)
29. health - current health status (1 - very bad, 5 - very good)
30. absences - school absences (0-93)

31. G1 - first semester grade (0-20)
32. G2 - second semester grade (0-20)
33. G3 - final grade (0-20)

In [None]:
# Load the student records from 'data.csv' (path relative to the working
# directory) into `df`, the frame every later cell reads and modifies.
df = pd.read_csv('data.csv')

In [None]:
# Collapse the three period grades (G1, G2, G3) into one annual average.
# The average is later mapped to five levels:
# 'fail' - 0-9, 'sufficient' - 10-11, 'satisfactory' - 12-13, 'good' - 14-15, 'excellent' - 16-20

def create_average():
    """Add an 'annual_grades_avg' column to the global ``df``.

    The new column holds the row-wise mean of 'G1', 'G2' and 'G3'.
    """
    grade_columns = ['G1', 'G2', 'G3']
    row_means = df.loc[:, grade_columns].mean(axis='columns')
    df['annual_grades_avg'] = row_means

create_average()

In [None]:
def five_level_classification(data=None):
    """Label each annual grade average with one of five grade levels.

    Adds an 'annual_grades_evaluation' column of plain strings, derived from
    'annual_grades_avg' with these bins (upper bound inclusive):

        fail          [0,    9.5]
        sufficient    (9.5,  11.5]
        satisfactory  (11.5, 13.5]
        good          (13.5, 15.5]
        excellent     (15.5, 20]

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to label in place. Defaults to the notebook-wide ``df``.

    Notes
    -----
    Averages that are missing or outside 0-20 are left as NaN. Previously
    such values (including an average of exactly 0) received the bin code -1,
    which indexed the *last* level and labelled them 'excellent'.
    """
    frame = df if data is None else data

    edges = [0, 9.5, 11.5, 13.5, 15.5, 20]
    levels = ['fail', 'sufficient', 'satisfactory', 'good', 'excellent']

    new_column = 'annual_grades_evaluation'
    # include_lowest=True closes the first bin on the left, so an average of
    # exactly 0 is a 'fail' instead of falling outside every bin.
    graded = pd.cut(frame['annual_grades_avg'], bins=edges, labels=levels,
                    include_lowest=True)
    # Store plain strings (object dtype) rather than a categorical: later
    # cells pick this column up through select_dtypes(include='object').
    frame[new_column] = graded.astype(object)

# Label every student in df; this adds the 'annual_grades_evaluation' column.
five_level_classification()

In [None]:
# Preview the first rows, including the two derived grade columns.
df.head()

In [None]:
# (rows, columns). Originally 395 records with 33 attributes; the two derived
# columns added above ('annual_grades_avg', 'annual_grades_evaluation') are
# included in the column count reported here.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

# Data Visualization

In [None]:
def plot_categorical_insight(categorical_columns):
    """Draw three diagnostic plots for every categorical column.

    One row of axes is created per column of ``categorical_columns``:

    * left   - number of students per grade level, split by the column's categories
    * middle - number of students per category
    * right  - box plot of the annual grade average per category

    Parameters
    ----------
    categorical_columns : pandas.DataFrame
        Only its column names are used; the plotted values are always read
        from the global ``df``.
    """
    nrows, ncols = categorical_columns.shape[1], 3
    if nrows == 0:
        # plt.subplots rejects a grid with zero rows; there is nothing to draw.
        return

    # squeeze=False keeps `axes` two-dimensional, so a single-column input
    # still yields one row of three axes instead of a flat array.
    _, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(25, 85), squeeze=False)

    # Iterating a DataFrame yields its column names.
    for row_axes, column in zip(axes, categorical_columns):
        sns.countplot(data=df, x='annual_grades_evaluation', hue=column, ax=row_axes[0])
        sns.countplot(data=df, x=column, ax=row_axes[1])
        sns.boxplot(data=df, x=column, y='annual_grades_avg', ax=row_axes[2])


# Every text-valued column except the derived grade label, which is what the
# other columns are compared against rather than a feature itself.
columns = df.select_dtypes(include='object')
columns = columns.drop('annual_grades_evaluation', axis=1)
plot_categorical_insight(columns)

- One school outperforms the other (even in terms of failures)
- Girls fail more often than boys
- Rural students perform better
- Those engaged in extracurricular activities perform better

In [None]:
# Display order of the grade levels, worst to best.
levels = ['fail', 'sufficient', 'satisfactory', 'good', 'excellent']

def plot_grades_to_self():
    """Show the grade-level distribution next to the spread of averages per level.

    Left: number of students in each level. Right: box plot of
    'annual_grades_avg' within each level. Both panels read the global ``df``
    and follow the order given by the global ``levels``.
    """
    _, (count_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
    shared = dict(data=df, x='annual_grades_evaluation', order=levels)
    sns.countplot(ax=count_ax, **shared)
    sns.boxplot(y='annual_grades_avg', ax=box_ax, **shared)

plot_grades_to_self()

Clearly, many students have failed.

In [None]:
# Correlate only the numeric (and boolean) columns. Text columns cannot be
# correlated, and DataFrame.corr() raises on them from pandas 2.0 onwards;
# selecting the columns explicitly behaves the same on every pandas version.
corr = df.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(30, 30))
sns.heatmap(corr, annot=True, cmap='crest')

- All three grades are highly positively correlated with each other.
- Parents' highest education levels are moderately positively correlated.
- Weekend and weekday alcohol consumption are positively correlated. Those who consume a lot on weekends do so on weekdays too.
- Failures, unsurprisingly, are negatively correlated with grades; why would better grades follow more failures?

In [None]:
# Pairwise plots of the grade average against three candidate drivers.
pairplot_columns = ['annual_grades_avg', 'absences', 'studytime', 'failures']
dfd = df.loc[:, pairplot_columns]
sns.pairplot(data=dfd);

Regarding clustering, the plot of 'absences' and 'annual_grades_avg' is more interesting to me, so I will continue with this.

# Preprocessing, Data Cleaning

There are no missing values.

In [None]:
# One boolean per column: True if that column has at least one missing value.
df.isnull().any()

From this point, we will examine the averaged grade column.

In [None]:
# The period grades are now summarised by 'annual_grades_avg'.
column_to_drop = ['G1', 'G2', 'G3']
df = df.drop(columns=column_to_drop)

Removing extreme values based on the interquartile range.

In [None]:
def detect_outliers(columns, data=None, whisker=1.5):
    """Return the index labels of rows that are IQR outliers in any column.

    A value is an outlier when it lies outside
    ``[Q1 - whisker * IQR, Q3 + whisker * IQR]`` of its own column. Missing
    values fail both comparisons and are therefore reported as outliers too.

    Parameters
    ----------
    columns : iterable of str
        Names of the numeric columns to screen.
    data : pandas.DataFrame, optional
        Frame to screen. Defaults to the notebook-wide ``df``.
    whisker : float, optional
        Multiple of the interquartile range tolerated beyond the quartiles.

    Returns
    -------
    numpy.ndarray
        Sorted, de-duplicated index labels, suitable for ``data.drop(labels)``.
        Empty when ``columns`` is empty or nothing is flagged.
    """
    frame = df if data is None else data

    flagged = []
    for column in columns:
        values = frame[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1  # interquartile range

        inside = (values >= q1 - whisker * iqr) & (values <= q3 + whisker * iqr)
        # Collect index *labels*, not row positions: the result is passed to
        # DataFrame.drop, which works on labels. The two only coincide while
        # the frame still has its original 0..n-1 index.
        flagged.append(frame.index[(~inside).to_numpy()].to_numpy())

    if not flagged:
        # np.concatenate raises on an empty list.
        return frame.index[:0].to_numpy()
    return np.unique(np.concatenate(flagged))

# Screen three numeric columns and discard every row flagged in at least one.
numerical_columns = ['age', 'absences', 'annual_grades_avg']
outlier_indices = detect_outliers(numerical_columns)

print(f'Number of outliers: {len(outlier_indices)}')

df = df.drop(index=outlier_indices)

Converting all variables to float type.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Make every column float-valued.
le = LabelEncoder()
for i in df.columns:
    if not pd.api.types.is_numeric_dtype(df[i]):
        # Text categories -> integer codes, numbered in sorted label order.
        df[i] = le.fit_transform(df[i])
    # Numeric columns keep their real values. Label-encoding them as well
    # would replace each value by its rank among the distinct values
    # (e.g. absences 0, 2, 4 -> 0, 1, 2) and distort the distances the
    # clustering below relies on.
    df[i] = df[i].astype(float)
df.dtypes

# Clustering

K-means clustering is a centroid-based algorithm: data points are grouped around centroids. When a point is assigned to a cluster, the only thing that matters is which centroid is closest to it in Euclidean distance.

In [None]:
# Cluster the students in the (absences, annual average) plane into five groups.
kmeans = KMeans(n_clusters=5).fit(df[['absences', 'annual_grades_avg']])

# One row per centroid: [absences, annual_grades_avg].
kmeans.cluster_centers_

In [None]:
def plot_result(kmeans):
    """Scatter the students coloured by cluster and overlay the centroids.

    Students are read from the global ``df`` ('absences' on x,
    'annual_grades_avg' on y) and coloured by ``kmeans.labels_``; the
    centroids from ``kmeans.cluster_centers_`` are drawn as black dots.
    """
    centers = kmeans.cluster_centers_
    plt.figure(figsize=(6, 4))
    sns.scatterplot(x=df['absences'], y=df['annual_grades_avg'],
                    hue=kmeans.labels_, palette="Set1", marker='+')
    sns.scatterplot(x=centers[:, 0], y=centers[:, 1], marker='o', s=100, c=['black'])

plot_result(kmeans)

In [None]:
# Elbow plot: K-means inertia for k = 1..19 on (absences, annual average).
# Each candidate model gets its own name so the loop does not overwrite the
# 5-cluster `kmeans` fitted earlier in the notebook.
features = df[['absences', 'annual_grades_avg']]
err = []
for i in range(1, 20):
    candidate = KMeans(n_clusters=i).fit(features)
    err.append([candidate.n_clusters, candidate.inertia_])

# Column 0: number of clusters, column 1: inertia.
err = np.asarray(err)
sns.lineplot(x=err[:, 0], y=err[:, 1])
sns.scatterplot(x=err[:, 0], y=err[:, 1])

The plot suggests that 5 clusters seem reasonable.

In [None]:
# Density-based clustering on all columns of df.
db = DBSCAN(eps=5, min_samples=10)
db.fit(df)
labels = db.labels_
# The label -1 is not counted as a cluster.
n_clusters_ = len(set(labels) - {-1})
n_clusters_

sns.scatterplot(data=df, x='absences', y='annual_grades_avg', hue=labels, legend='full', palette='deep')

# Regression and Classification Systems

Splitting the dataset into training and test sets in a 70% training and 30% test ratio; with a smaller test share, the test set would not be large enough and the estimated accuracy would be unreliable.

In [None]:
# 70/30 split of df. No random_state is passed, so the split (and every score
# computed from it) differs between runs.
train, test = train_test_split(df, test_size=0.3)

Normalizing the data using MinMaxScaler. Each feature is scaled individually so that, on the training set, it falls within the specified range.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each column's minimum and maximum from the training rows, then rescale them.
scaler = MinMaxScaler()
scaler.fit(train)
train = pd.DataFrame(scaler.transform(train), columns=train.columns)

In [None]:
# Reuse the scaler fitted on the training set. Fitting a second scaler on the
# test rows would put the two sets on different scales and let test-set
# statistics leak into the evaluation.
test = pd.DataFrame(scaler.transform(test), columns=test.columns)

Function for fitting a classification model and reporting its performance results.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold as KFold   # K-fold keresztérvényesítéshez
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV


def classification_model(model, data, predictors, outcome, param_grid):
    """Fit ``model`` and print its accuracy measured three ways.

    Printed, in order:

    1. ``Accuracy`` - accuracy on the same rows the model was fitted on.
    2. ``Cross-Validation Score`` - mean accuracy over a repeated stratified
       K-fold cross-validation (4 folds, 2 repeats).
    3. ``BEST SCORE`` - best mean 4-fold accuracy found by a grid search over
       ``param_grid``.

    Parameters
    ----------
    model : estimator
        Object providing ``fit``, ``predict`` and ``score``.
    data : pandas.DataFrame
        Frame holding both the predictor and the outcome columns.
    predictors : list of str
        Names of the feature columns.
    outcome : str
        Name of the target column.
    param_grid : dict or list of dict
        Search space handed to ``GridSearchCV``.

    Returns
    -------
    None
        ``model`` is left fitted on all of ``data`` so the caller can inspect
        it afterwards. The grid-search result is only printed, not returned.
    """
    features = data[predictors]
    target = data[outcome]

    # Fit on everything and score on the same rows.
    model.fit(features, target)
    predictions = model.predict(features)
    accuracy = metrics.accuracy_score(target, predictions)
    print(f"Accuracy : {accuracy:.3%}")

    # Repeated stratified K-fold cross-validation: 4 folds, 2 repeats.
    kf = KFold(n_splits=4, n_repeats=2)
    scores = []
    for fit_rows, holdout_rows in kf.split(features, target):
        # Train on the fold's training rows, score on its held-out rows.
        model.fit(features.iloc[fit_rows, :], target.iloc[fit_rows])
        scores.append(model.score(features.iloc[holdout_rows, :],
                                  target.iloc[holdout_rows]))

    print(f"Cross-Validation Score : {np.mean(scores):.3%}")

    # The loop left the model fitted on the last fold only; refit on all rows
    # so it can be referenced outside this function.
    model.fit(features, target)

    # Hyper-parameter tuning.
    grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring="accuracy", cv=4)
    grid.fit(features, target)
    print(f"BEST SCORE: {grid.best_score_:.3%}")

In [None]:
# NOTE(review): this estimator is never fitted; `model` is rebound to a
# LogisticRegression in the next cell.
model = LinearRegression()

In [None]:
model = LogisticRegression()
outcome_var = 'higher'
predictor_var = ['address']
# Two independent searches: with/without L2 regularisation, then the
# regularisation strength C. `None` switches the penalty off; the string
# 'none' was deprecated in scikit-learn 1.2 and removed in 1.4.
param_grid = [{'penalty': [None, 'l2']},
              {'C': [1, 10, 100, 1000]}]
classification_model(model, train, predictor_var, outcome_var, param_grid)

Bias

In [None]:
# Intercept (bias term) of the logistic regression fitted above.
model.intercept_

The coefficient of the features in the decision function.

In [None]:
# One weight per predictor; the only predictor here is 'address'.
model.coef_

The format of the trained model here is a decision tree. A decision tree is a tree where each internal node represents a feature. The children of a node represent the possible values of that feature. The leaves contain class labels.

In [None]:
# Decision tree on four background features; the target is still
# `outcome_var` as set in the logistic-regression cell.
model = DecisionTreeClassifier()
predictor_var = ['school', 'sex', 'address', 'activities']
max_depths = [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40, 50, 70, 90, 120, 150]
param_grid = [{
    'criterion': ['gini', 'entropy'],
    'max_depth': max_depths,
}]
classification_model(model, train, predictor_var, outcome_var, param_grid)

The essence of a random forest is that, unlike the simple decision tree method, it builds several different decision trees and provides the final result by combining their outcomes (averaging or majority vote). Since decision trees are very sensitive to changes in the training dataset, each tree is trained on a bootstrap sample: a random sample drawn with replacement, in which some records are repeated and others are left out. In addition, a different random subset of the features is used during the creation of each decision tree.

In [None]:
model = RandomForestClassifier(n_estimators=100)
predictor_var = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery']
# 'auto' is no longer offered for max_features: for a classifier it was an
# alias of 'sqrt', so it only evaluated every candidate twice, and it was
# removed in scikit-learn 1.3.
param_grid = [{'n_estimators': [5, 20, 50, 100],
               'max_features': ['sqrt'],
               'max_depth': [int(x) for x in np.linspace(10, 110, num=11)],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4]}]
classification_model(model, train, predictor_var, outcome_var, param_grid)

In general, it can be stated that for this dataset, the random forest provides better accuracy than the decision tree and logistic regression.