In [None]:
import duckdb
# import sklearn
import dask.dataframe as dd
import pandas as pd

pd.set_option('future.no_silent_downcasting', True)
pd.options.display.max_columns = None       # type: ignore
pd.set_option('display.precision', 2)

In [None]:
# Connect to the project's DuckDB database.
# The notebook only ever reads from it, so the connection is opened read-only:
# this avoids holding the read-write lock on the file while the (long) model
# searches below are running, and rules out accidental writes.
conn = duckdb.connect(database="../database/bdt.duckdb", read_only=True)

# Pull the whole student_perf table into an in-memory pandas DataFrame
query = "SELECT * FROM student_perf"
student_perf_df = conn.execute(query).fetchdf()

# Wrap the pandas DataFrame in a Dask DataFrame split into 3 partitions
student_perf_dask = dd.from_pandas(student_perf_df, npartitions=3)

# Preview the first rows; head() already returns a concrete pandas frame,
# so no explicit compute() call is needed here.
student_perf_dask.head()

In [None]:
# Show the dtype of every column in the Dask DataFrame.
student_perf_dask.dtypes

In [None]:
# Summary statistics of the dataset; compute() materialises the Dask result
# so the table is rendered as the cell output.
student_perf_dask.describe().compute()

In [None]:
import seaborn as sns

# Pairwise correlations of every column except AVG_G, drawn as an annotated heatmap.
corr_df = student_perf_dask.drop('AVG_G', axis=1)
corr_matrix = corr_df.corr().compute()

# Apply the seaborn theme with a 12x12 default figure size before drawing.
sns.set_theme(rc={'figure.figsize': (12, 12)})
sns.heatmap(
    corr_matrix,
    cmap="PiYG",
    square=True,
    annot=True,
    fmt=".2f",
    annot_kws={"fontsize": 7},
)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, make_scorer, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import joblib

In [None]:
# Feature matrix without the target ('passed') and without AVG_G; both X and y
# are materialised from Dask into in-memory pandas objects for scikit-learn.
X = student_perf_dask.drop(['passed','AVG_G'], axis=1).compute()
y = student_perf_dask['passed'].compute()

# One seed shared by the train/test split and every stochastic estimator so
# that a fresh "Restart & Run All" reproduces the same scores and saved models.
RANDOM_STATE = 420

# Split the data into training and testing sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Define the models and their hyperparameters.
# Each entry maps a display name to the estimator ('model') and the grid of
# hyperparameter values ('params') that GridSearchCV will search over.
models = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
        'params': {
            'C': [0.01, 0.1, 1, 10, 100],
            'solver': ['liblinear']
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7]
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
    },
    'KNeighbors': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance']
        }
    },
    'NaiveBayes': {
        'model': GaussianNB(),
        # Empty grid: the estimator is cross-validated once with its defaults.
        'params': {}
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(random_state=RANDOM_STATE),
        'params': {
            'max_depth': [None, 10, 20, 30]
        }
    },
    'MLPClassifier': {
        'model': MLPClassifier(max_iter=1000, random_state=RANDOM_STATE),
        'params': {
            'hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'activation': ['tanh', 'relu'],
            'solver': ['adam'],
            # NOTE: scikit-learn only uses `momentum` with solver='sgd', so this
            # entry does not affect the 'adam' fits; kept so the reported
            # best-params dict keeps the same keys.
            'momentum': [0.9]
        }
    }
}

# Define the F1 scorer (and an accuracy scorer for optional use)
f1_scorer = make_scorer(f1_score)
acc_scorer = make_scorer(accuracy_score)

# GridSearchCV for each model
def evaluate_model(model, params, X_train, y_train, X_test, y_test,
                   cv=5, scoring="f1", n_jobs=None):
    """Tune ``model`` with a cross-validated grid search and check it on the test split.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn classifier to tune.
    params : dict
        Hyperparameter grid handed to ``GridSearchCV`` (may be empty).
    X_train, y_train
        Data used for the cross-validated search and the final refit.
    X_test, y_test
        Held-out data used only to build the confusion matrix.
    cv : int, default 5
        Number of cross-validation folds (the previously hard-coded value).
    scoring : str or callable, default "f1"
        Metric used to rank parameter settings. The built-in "f1" scorer is
        the binary F1 score, i.e. the same metric as ``make_scorer(f1_score)``.
    n_jobs : int or None, default None
        Parallelism forwarded to ``GridSearchCV``; ``None`` keeps it serial.

    Returns
    -------
    tuple
        ``(best_estimator, best_params, best_cv_score, confusion_matrix)`` where
        ``best_cv_score`` is the mean cross-validated score of the winning
        setting and the confusion matrix is computed on the test split.
    """
    grid_search = GridSearchCV(model, params, cv=cv, scoring=scoring, n_jobs=n_jobs)
    grid_search.fit(X_train, y_train)

    # best_estimator_ has been refit on the full training split with the best parameters.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    conf_mat = confusion_matrix(y_test, y_pred)
    return best_model, grid_search.best_params_, grid_search.best_score_, conf_mat

# model evaluation: tune every candidate, plot its test-set confusion matrix,
# and persist the tuned estimator to disk.
results = []
for model_name, model_info in models.items():
    best_model, best_params, best_score, conf_mat = evaluate_model(
        model_info['model'], model_info['params'], X_train, y_train, X_test, y_test
    )
    results.append({
        'Model': model_name,
        'Best Params': best_params,
        # Mean cross-validated F1 of the best parameter setting (not a test-set score).
        'Best F1-Score': best_score,
        'Confusion Matrix': conf_mat
    })

    # Plot Confusion Matrix on its own figure/axes instead of the implicit
    # pyplot "current figure", so each model's plot is self-contained.
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_title(f'Confusion Matrix: {model_name}')
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')
    plt.show()
    # Release the figure so figures do not pile up in memory across iterations.
    plt.close(fig)

    # save the model
    joblib.dump(best_model, f'best_{model_name}_model_without_AVG_G.pkl')

# show results; lift the column-width limit so the parameter dicts are not truncated
results_df = pd.DataFrame(results)
with pd.option_context('display.max_colwidth', None):
    print(results_df[['Model', 'Best Params', 'Best F1-Score']])

In [None]:
# Feature matrix that drops only the target ('passed'), i.e. AVG_G stays in as
# a feature this time; X and y are materialised from Dask into pandas objects.
# NOTE(review): if 'passed' is derived from AVG_G this is target leakage -
# presumably intended as a comparison against the run without AVG_G; confirm.
X = student_perf_dask.drop('passed', axis=1).compute()
y = student_perf_dask['passed'].compute()

# One seed shared by the train/test split and every stochastic estimator so
# that a fresh "Restart & Run All" reproduces the same scores and saved models.
RANDOM_STATE = 420

# Split the data into training and testing sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Define the models and their hyperparameters.
# Each entry maps a display name to the estimator ('model') and the grid of
# hyperparameter values ('params') that GridSearchCV will search over.
models = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
        'params': {
            'C': [0.01, 0.1, 1, 10, 100],
            'solver': ['liblinear']
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
    },
    'KNeighbors': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance']
        }
    },
    'NaiveBayes': {
        'model': GaussianNB(),
        # Empty grid: the estimator is cross-validated once with its defaults.
        'params': {}
    },
    'MLPClassifier': {
        'model': MLPClassifier(max_iter=1000, random_state=RANDOM_STATE),
        'params': {
            'hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'activation': ['tanh', 'relu'],
            'solver': ['adam'],
            # NOTE: scikit-learn only uses `momentum` with solver='sgd', so this
            # entry does not affect the 'adam' fits; kept so the reported
            # best-params dict keeps the same keys.
            'momentum': [0.9]
        }
    }
}

# Define the F1 scorer (and an accuracy scorer for optional use)
f1_scorer = make_scorer(f1_score)
acc_scorer = make_scorer(accuracy_score)

# GridSearchCV for each model
# NOTE: this re-declares the helper of the same name from the previous
# modelling cell; consolidate into a single definition when refactoring.
def evaluate_model(model, params, X_train, y_train, X_test, y_test,
                   cv=5, scoring="f1", n_jobs=None):
    """Tune ``model`` with a cross-validated grid search and check it on the test split.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn classifier to tune.
    params : dict
        Hyperparameter grid handed to ``GridSearchCV`` (may be empty).
    X_train, y_train
        Data used for the cross-validated search and the final refit.
    X_test, y_test
        Held-out data used only to build the confusion matrix.
    cv : int, default 5
        Number of cross-validation folds (the previously hard-coded value).
    scoring : str or callable, default "f1"
        Metric used to rank parameter settings. The built-in "f1" scorer is
        the binary F1 score, i.e. the same metric as ``make_scorer(f1_score)``.
    n_jobs : int or None, default None
        Parallelism forwarded to ``GridSearchCV``; ``None`` keeps it serial.

    Returns
    -------
    tuple
        ``(best_estimator, best_params, best_cv_score, confusion_matrix)`` where
        ``best_cv_score`` is the mean cross-validated score of the winning
        setting and the confusion matrix is computed on the test split.
    """
    grid_search = GridSearchCV(model, params, cv=cv, scoring=scoring, n_jobs=n_jobs)
    grid_search.fit(X_train, y_train)

    # best_estimator_ has been refit on the full training split with the best parameters.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    conf_mat = confusion_matrix(y_test, y_pred)
    return best_model, grid_search.best_params_, grid_search.best_score_, conf_mat

# model evaluation: tune every candidate, plot its test-set confusion matrix,
# and persist the tuned estimator to disk.
results = []
for model_name, model_info in models.items():
    best_model, best_params, best_score, conf_mat = evaluate_model(
        model_info['model'], model_info['params'], X_train, y_train, X_test, y_test
    )
    results.append({
        'Model': model_name,
        'Best Params': best_params,
        # Mean cross-validated F1 of the best parameter setting (not a test-set score).
        'Best F1-Score': best_score,
        'Confusion Matrix': conf_mat
    })

    # Plot Confusion Matrix on its own figure/axes instead of the implicit
    # pyplot "current figure", so each model's plot is self-contained.
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_title(f'Confusion Matrix: {model_name}')
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')
    plt.show()
    # Release the figure so figures do not pile up in memory across iterations.
    plt.close(fig)

    # save the model
    joblib.dump(best_model, f'best_{model_name}_model_with_AVG_G.pkl')

# show results; lift the column-width limit so the parameter dicts are not truncated
results_df = pd.DataFrame(results)
with pd.option_context('display.max_colwidth', None):
    print(results_df[['Model', 'Best Params', 'Best F1-Score']])

In [None]:
# Release the DuckDB connection opened in the data-loading cell.
conn.close()