In [24]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.models import Sequential

class TagHolder:

    def __init__(self, _id, name):
        self.id = _id
        self.name = name

    def __str__(self):
        return self.name


def get_tags():
    connection = get_engine()
    if not connection:
        return []

    try:
        with connection.cursor() as cursor:
            cursor.execute("SELECT * from datasetTag")
            tags = cursor.fetchall()

            tag_holder = []
            for tag in tags:
                tag_holder.append(TagHolder(tag[0], tag[1]))

            return tag_holder
    except Exception as e:
        return []
    finally:
        if connection.is_connected():
            connection.close()
            
            
from typing import Union

import mysql.connector
from mysql.connector.abstracts import MySQLConnectionAbstract
from mysql.connector.pooling import PooledMySQLConnection

def get_engine():
    # Create a MySQL connection using mysql.connector
    connection = mysql.connector.connect(
        host='monorail.proxy.rlwy.net',
        port=45826,
        user='root',
        password='VoUeejgBIkMgYiPmYHxMFsIXffwxCKBK',
        database='railway'
    )
    return connection





In [25]:
def discretize_weights(weight):
    if weight < 50:
        return 0
    if 50 <= weight < 85:
        return 1
    if 85 <= weight <= 100:
        return 2

In [26]:
def fetch_data_as_dataframe(connection, query: str) -> pd.DataFrame:
    cursor = connection.cursor()
    cursor.execute(query)
    result_set = cursor.fetchall()
    column_names = cursor.column_names
    cursor.close()
    df = pd.DataFrame(result_set, columns=column_names)
    return df.dropna()

In [27]:
def discretize_weights(weight):
    if weight < 50:
        return 0
    if 50 <= weight < 95:
        return 1
    if 85 <= weight <= 100:
        return 2

In [28]:
query = f"""
                        SELECT ((grade-1)/4)*100 as weighted, attr_A, attr_B, attr_C, attr_E, attr_F, attr_H, attr_G, attr_I, attr_L, attr_M, attr_N, attr_O, attr_Q1, attr_Q2,
                                attr_Q3, attr_Q4, attr_EX, attr_AX, attr_TM, attr_IN, attr_SC,
                                cfit,
                                CASE when course = 'BSCS' then 1 else 0 end as course_bscs,
                                CASE when course = 'BSIT' then 1 else 0 end as course_bsit
                        FROM students
                        INNER JOIN assessments s on s.student_id = students.Id WHERE tagID in (5);
                        """
df = fetch_data_as_dataframe(get_engine(), query)
df = pd.get_dummies(df, columns=['cfit'])
df.dropna(inplace=True)

In [29]:
final_y = df['weighted'].apply(discretize_weights)
df.head()

Unnamed: 0,weighted,attr_A,attr_B,attr_C,attr_E,attr_F,attr_H,attr_G,attr_I,attr_L,...,attr_IN,attr_SC,course_bscs,course_bsit,cfit_A,cfit_AA,cfit_BA,cfit_H,cfit_L,cfit_M
0,0.0,4,4,5,3,5,4,6,5,7,...,4,5,1,0,False,False,False,False,True,False
1,60.000002,5,6,7,1,5,6,5,7,5,...,3,6,1,0,False,True,False,False,False,False
2,57.499999,2,5,6,4,1,2,7,4,7,...,4,7,1,0,True,False,False,False,False,False
3,64.999998,5,6,5,4,7,6,6,7,9,...,4,6,1,0,False,True,False,False,False,False
4,0.0,6,4,3,1,4,2,7,8,5,...,1,6,1,0,False,False,True,False,False,False


In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['weighted']), df['weighted'], test_size=0.2)

In [31]:
# success classifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
smote = SMOTE()
success_y = df['weighted'].apply(discretize_weights).apply(lambda x: x==2)
x_s, y_s = smote.fit_resample(df.drop(columns=['weighted']), success_y)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(x_s, y_s, test_size=0.2)

In [32]:
final_df = df.copy()
final_df_features = np.asarray(final_df.drop(columns=['weighted']), np.float64)

In [33]:
from keras.src.optimizers import Adam
from sklearn.metrics import accuracy_score

# Convert data types
X_train_s = np.asarray(X_train_s).astype(np.float64)
y_train_s = np.asarray(y_train_s).astype(np.int16)
X_test_s = np.asarray(X_test_s).astype(np.float64)
y_test_s = np.asarray(y_test_s).astype(np.int16)

# Create the model
model = Sequential([
    InputLayer(shape=(X_train_s.shape[1],)),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')  # Single output unit for binary classification
])

# Compile the model
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Fit the model to the training data
model.fit(X_train_s, y_train_s, epochs=30, batch_size=32, validation_split=0.2, verbose=1, )

# Make predictions
y_pred_prob = model.predict(X_test_s)
y_pred = (y_pred_prob > 0.5).astype(int)  # Convert probabilities to class labels
final_df['success'] = model.predict(final_df_features)

# Calculate accuracy
accuracy = accuracy_score(y_test_s, y_pred)
f'Accuracy: {accuracy}'

Epoch 1/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5208 - loss: 0.6932 - val_accuracy: 0.6200 - val_loss: 0.6362
Epoch 2/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7396 - loss: 0.5285 - val_accuracy: 0.8933 - val_loss: 0.3122
Epoch 3/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8581 - loss: 0.3315 - val_accuracy: 0.8267 - val_loss: 0.4414
Epoch 4/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8806 - loss: 0.2691 - val_accuracy: 0.9733 - val_loss: 0.1477
Epoch 5/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8928 - loss: 0.2678 - val_accuracy: 0.9267 - val_loss: 0.1841
Epoch 6/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9473 - loss: 0.1475 - val_accuracy: 0.9467 - val_loss: 0.1316
Epoch 7/30
[1m19/19[0m [32m━━━━━━━━━━

'Accuracy: 0.9840425531914894'

In [34]:
final_df['success']

0      1.210673e-09
1      5.100537e-10
2      9.249924e-18
3      3.563413e-20
4      1.651953e-19
           ...     
473    2.493636e-22
474    3.318062e-19
475    2.226829e-16
476    5.163840e-25
477    1.539102e-21
Name: success, Length: 478, dtype: float32

In [35]:
df.head()

Unnamed: 0,weighted,attr_A,attr_B,attr_C,attr_E,attr_F,attr_H,attr_G,attr_I,attr_L,...,attr_IN,attr_SC,course_bscs,course_bsit,cfit_A,cfit_AA,cfit_BA,cfit_H,cfit_L,cfit_M
0,0.0,4,4,5,3,5,4,6,5,7,...,4,5,1,0,False,False,False,False,True,False
1,60.000002,5,6,7,1,5,6,5,7,5,...,3,6,1,0,False,True,False,False,False,False
2,57.499999,2,5,6,4,1,2,7,4,7,...,4,7,1,0,True,False,False,False,False,False
3,64.999998,5,6,5,4,7,6,6,7,9,...,4,6,1,0,False,True,False,False,False,False
4,0.0,6,4,3,1,4,2,7,8,5,...,1,6,1,0,False,False,True,False,False,False


In [36]:

from imblearn.over_sampling import SMOTE

In [37]:
# success classifier
from sklearn.ensemble import RandomForestClassifier

success_y = df['weighted'].apply(discretize_weights).apply(lambda x: x==0)
smote = SMOTE()
x_f, y_f = smote.fit_resample(df.drop(columns=['weighted']), success_y)
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(x_f, y_f, test_size=0.2)
from keras.src.optimizers import Adam
from sklearn.metrics import accuracy_score

# Convert data types
X_train_f = np.asarray(X_train_f).astype(np.float64)
y_train_f = np.asarray(y_train_f).astype(np.int16)
X_test_f = np.asarray(X_test_f).astype(np.float64)
y_test_f = np.asarray(y_test_f).astype(np.int16)

# Create the model
model = Sequential([
    InputLayer(shape=(X_train_s.shape[1],)),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')  # Single output unit for binary classification
])

# Compile the model
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Fit the model to the training data
model.fit(X_train_f, y_train_f, epochs=30, batch_size=32, validation_split=0.2, verbose=1)

# Make predictions
y_pred_prob = model.predict(X_test_f)
y_pred = (y_pred_prob > 0.5).astype(int)  # Convert probabilities to class labels

# Calculate accuracy
accuracy = accuracy_score(y_test_f, y_pred)

f'Accuracy: {accuracy}'

Epoch 1/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.4991 - loss: 0.6943 - val_accuracy: 0.4516 - val_loss: 0.6926
Epoch 2/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4973 - loss: 0.6920 - val_accuracy: 0.4919 - val_loss: 0.6774
Epoch 3/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5986 - loss: 0.6594 - val_accuracy: 0.6935 - val_loss: 0.5972
Epoch 4/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6913 - loss: 0.6093 - val_accuracy: 0.6855 - val_loss: 0.6057
Epoch 5/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7121 - loss: 0.5606 - val_accuracy: 0.7581 - val_loss: 0.5239
Epoch 6/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7664 - loss: 0.4788 - val_accuracy: 0.7258 - val_loss: 0.5416
Epoch 7/30
[1m16/16[0m [32m━━━━━━━━━━

'Accuracy: 0.8774193548387097'

In [38]:
model.predict(X_test_f)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 511us/step


array([[8.5423923e-01],
       [7.4791950e-01],
       [7.2102457e-02],
       [5.0588393e-01],
       [2.0435487e-01],
       [9.8909944e-01],
       [1.6512875e-01],
       [9.0861762e-01],
       [9.8315251e-01],
       [4.7380516e-01],
       [4.0790934e-02],
       [1.5984176e-02],
       [9.9997431e-01],
       [2.3534641e-01],
       [3.3130821e-02],
       [8.9954859e-01],
       [2.8571910e-01],
       [8.3596945e-01],
       [2.1064974e-01],
       [4.9152400e-02],
       [7.1566072e-05],
       [5.0732934e-01],
       [3.7646151e-01],
       [8.6651576e-01],
       [9.9656636e-01],
       [5.0785784e-02],
       [9.1364908e-01],
       [7.7267343e-01],
       [1.1730728e-03],
       [8.5731989e-01],
       [9.0538979e-01],
       [6.6513062e-01],
       [9.8844671e-01],
       [4.9823260e-01],
       [9.5919144e-01],
       [3.2348987e-01],
       [2.7368451e-02],
       [9.9982673e-01],
       [7.9853777e-03],
       [9.8301989e-01],
       [5.0751655e-03],
       [3.984173

In [39]:
final_df['failure'] = model.predict(final_df_features)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 435us/step


In [40]:
final_df

Unnamed: 0,weighted,attr_A,attr_B,attr_C,attr_E,attr_F,attr_H,attr_G,attr_I,attr_L,...,course_bscs,course_bsit,cfit_A,cfit_AA,cfit_BA,cfit_H,cfit_L,cfit_M,success,failure
0,0.000000,4,4,5,3,5,4,6,5,7,...,1,0,False,False,False,False,True,False,1.210673e-09,0.757611
1,60.000002,5,6,7,1,5,6,5,7,5,...,1,0,False,True,False,False,False,False,5.100537e-10,0.000065
2,57.499999,2,5,6,4,1,2,7,4,7,...,1,0,True,False,False,False,False,False,9.249924e-18,0.012181
3,64.999998,5,6,5,4,7,6,6,7,9,...,1,0,False,True,False,False,False,False,3.563413e-20,0.204300
4,0.000000,6,4,3,1,4,2,7,8,5,...,1,0,False,False,True,False,False,False,1.651953e-19,0.999433
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473,0.000000,4,4,4,4,3,2,7,7,9,...,0,1,False,False,False,False,True,False,2.493636e-22,0.742487
474,61.811566,5,4,5,9,5,5,6,5,8,...,0,1,False,False,False,True,False,False,3.318062e-19,0.006182
475,56.534958,5,4,6,2,3,5,4,5,7,...,0,1,False,False,False,False,True,False,2.226829e-16,0.045137
476,77.499998,5,3,5,7,4,6,3,3,7,...,0,1,True,False,False,False,False,False,5.163840e-25,0.008712


In [41]:
df_corr = final_df[['success', 'failure']]

In [42]:
# success classifier
from sklearn.ensemble import RandomForestClassifier

success_y = df['weighted'].apply(discretize_weights).apply(lambda x: x == 1)
smote = SMOTE()
x_b, y_b = smote.fit_resample(df.drop(columns=['weighted']), success_y)
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(x_b, y_b, test_size=0.2)
from keras.src.optimizers import Adam
from sklearn.metrics import accuracy_score

# Convert data types
X_train_b = np.asarray(X_train_f).astype(np.float64)
y_train_b = np.asarray(y_train_f).astype(np.int16)
X_test_b = np.asarray(X_test_f).astype(np.float64)
y_test_b = np.asarray(y_test_f).astype(np.int16)

# Create the model
model = Sequential([
    InputLayer(shape=(X_train_s.shape[1],)),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')  # Single output unit for binary classification
])

# Compile the model
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Fit the model to the training data
model.fit(X_train_f, y_train_f, epochs=30, batch_size=32, validation_split=0.2, verbose=1)

# Make predictions
y_pred_prob = model.predict(X_test_f)
y_pred = (y_pred_prob > 0.5).astype(int)  # Convert probabilities to class labels

# Calculate accuracy
accuracy = accuracy_score(y_test_f, y_pred)

f'Accuracy: {accuracy}'

Epoch 1/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5232 - loss: 0.6985 - val_accuracy: 0.4516 - val_loss: 0.7007
Epoch 2/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4904 - loss: 0.6867 - val_accuracy: 0.6290 - val_loss: 0.6748
Epoch 3/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5333 - loss: 0.6847 - val_accuracy: 0.6532 - val_loss: 0.6547
Epoch 4/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6894 - loss: 0.5944 - val_accuracy: 0.7016 - val_loss: 0.5986
Epoch 5/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7638 - loss: 0.5078 - val_accuracy: 0.6129 - val_loss: 0.6569
Epoch 6/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7142 - loss: 0.5489 - val_accuracy: 0.6532 - val_loss: 0.6462
Epoch 7/30
[1m16/16[0m [32m━━━━━━━━━━

'Accuracy: 0.8903225806451613'

In [43]:
result = model.predict(final_df_features)
df_corr['pass'] = result

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 454us/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_corr['pass'] = result


In [44]:
df_corr.corr()

Unnamed: 0,success,failure,pass
success,1.0,-0.117463,-0.110536
failure,-0.117463,1.0,0.799235
pass,-0.110536,0.799235,1.0


In [45]:
df_corr

Unnamed: 0,success,failure,pass
0,1.210673e-09,0.757611,9.992988e-01
1,5.100537e-10,0.000065,2.185780e-08
2,9.249924e-18,0.012181,3.329029e-02
3,3.563413e-20,0.204300,1.163027e-02
4,1.651953e-19,0.999433,9.994544e-01
...,...,...,...
473,2.493636e-22,0.742487,9.963731e-01
474,3.318062e-19,0.006182,6.388703e-03
475,2.226829e-16,0.045137,3.432620e-03
476,5.163840e-25,0.008712,1.506115e-02


In [46]:
merged_input_features = pd.concat([df_corr, pd.DataFrame(final_df_features)], axis=1)

In [47]:
merged_input_features

Unnamed: 0,success,failure,pass,0,1,2,3,4,5,6,...,19,20,21,22,23,24,25,26,27,28
0,1.210673e-09,0.757611,9.992988e-01,4.0,4.0,5.0,3.0,5.0,4.0,6.0,...,4.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,5.100537e-10,0.000065,2.185780e-08,5.0,6.0,7.0,1.0,5.0,6.0,5.0,...,3.0,6.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,9.249924e-18,0.012181,3.329029e-02,2.0,5.0,6.0,4.0,1.0,2.0,7.0,...,4.0,7.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,3.563413e-20,0.204300,1.163027e-02,5.0,6.0,5.0,4.0,7.0,6.0,6.0,...,4.0,6.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.651953e-19,0.999433,9.994544e-01,6.0,4.0,3.0,1.0,4.0,2.0,7.0,...,1.0,6.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473,2.493636e-22,0.742487,9.963731e-01,4.0,4.0,4.0,4.0,3.0,2.0,7.0,...,4.0,7.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
474,3.318062e-19,0.006182,6.388703e-03,5.0,4.0,5.0,9.0,5.0,5.0,6.0,...,8.0,7.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
475,2.226829e-16,0.045137,3.432620e-03,5.0,4.0,6.0,2.0,3.0,5.0,4.0,...,3.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
476,5.163840e-25,0.008712,1.506115e-02,5.0,3.0,5.0,7.0,4.0,6.0,3.0,...,7.0,5.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [48]:
from sklearn.preprocessing import OneHotEncoder

y_one_hot = pd.get_dummies(final_y)
final_model = Sequential([
    InputLayer(shape=(merged_input_features.shape[1],)),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(3, activation='sigmoid')],
)
final_model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
final_model.fit(merged_input_features, y_one_hot, epochs=30, batch_size=32, validation_split=0.2, verbose=1)
y_pred_prob = model.predict(X_test_f)

Epoch 1/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.4928 - loss: 1.0082 - val_accuracy: 0.8542 - val_loss: 0.5289
Epoch 2/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7551 - loss: 0.7027 - val_accuracy: 0.8542 - val_loss: 0.4721
Epoch 3/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7558 - loss: 0.6531 - val_accuracy: 0.8542 - val_loss: 0.4505
Epoch 4/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7474 - loss: 0.5985 - val_accuracy: 0.8542 - val_loss: 0.4263
Epoch 5/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7810 - loss: 0.5167 - val_accuracy: 0.8854 - val_loss: 0.4340
Epoch 6/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8280 - loss: 0.4540 - val_accuracy: 0.8542 - val_loss: 0.5264
Epoch 7/30
[1m12/12[0m [32m━━━━━━━━━━

In [49]:
final_model.predict(merged_input_features)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


array([[0.96430635, 0.62509644, 0.01957279],
       [0.23072335, 0.9993999 , 0.01820329],
       [0.2668395 , 0.99964947, 0.01078043],
       ...,
       [0.26993436, 0.99980867, 0.00683836],
       [0.42017525, 0.99950963, 0.00650586],
       [0.3345277 , 0.9981437 , 0.02482418]], dtype=float32)

In [50]:
y_one_hot

Unnamed: 0,0,1,2
0,True,False,False
1,False,True,False
2,False,True,False
3,False,True,False
4,True,False,False
...,...,...,...
473,True,False,False
474,False,True,False
475,False,True,False
476,False,True,False


In [51]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K

In [72]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense
from tensorflow.keras.optimizers import Adam
import numpy as np

# Assuming final_y and merged_input_features are already defined

# One-hot encode the target variable
y_one_hot = pd.get_dummies(final_y)

# Define the model creation function
def create_model(input_shape):
    model = Sequential([
        InputLayer(shape=(input_shape,)),
        Dense(128, activation='relu'),
        Dense(128, activation='relu'),
        Dense(128, activation='relu'),
        Dense(128, activation='relu'),
        Dense(128, activation='relu'),
        Dense(128, activation='relu'),
        Dense(128, activation='relu'),
        Dense(128, activation='relu'),
        Dense(3, activation='softmax')
    ])
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Set up k-fold cross-validation
n_splits = 5
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Perform k-fold cross-validation
fold_no = 1
acc_per_fold = []
loss_per_fold = []

for train, test in kfold.split(merged_input_features, y_one_hot):
    model = create_model(merged_input_features.shape[1])
    
    history = model.fit(
        merged_input_features.iloc[train], 
        y_one_hot.iloc[train], 
        epochs=30, 
        batch_size=32, 
        validation_data=(merged_input_features.iloc[test], y_one_hot.iloc[test]),
        verbose=1
    )
    
    scores = model.evaluate(merged_input_features.iloc[test], y_one_hot.iloc[test], verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    
    acc_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])
    
    fold_no += 1



# Train the final model on all data
final_model = create_model(merged_input_features.shape[1])
final_model.fit(merged_input_features, y_one_hot, epochs=30, batch_size=32, verbose=1)
# Print average scores
print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
    print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')
print('------------------------------------------------------------------------')



Epoch 1/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.6416 - loss: 0.7694 - val_accuracy: 0.7812 - val_loss: 0.6404
Epoch 2/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7849 - loss: 0.6308 - val_accuracy: 0.7812 - val_loss: 0.5808
Epoch 3/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7772 - loss: 0.6095 - val_accuracy: 0.7812 - val_loss: 0.5703
Epoch 4/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7802 - loss: 0.6020 - val_accuracy: 0.7812 - val_loss: 0.5540
Epoch 5/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7785 - loss: 0.5749 - val_accuracy: 0.7812 - val_loss: 0.5621
Epoch 6/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8071 - loss: 0.4612 - val_accuracy: 0.8125 - val_loss: 0.4751
Epoch 7/30
[1m12/12[0m [32m━━━━━━━━━━

In [73]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from imblearn.over_sampling import SMOTE
import mysql.connector

def run_analysis(tag_ids):
    def get_engine():
        return mysql.connector.connect(
            host='monorail.proxy.rlwy.net',
            port=45826,
            user='root',
            password='VoUeejgBIkMgYiPmYHxMFsIXffwxCKBK',
            database='railway'
        )

    def fetch_data_as_dataframe(connection, query: str) -> pd.DataFrame:
        cursor = connection.cursor()
        cursor.execute(query)
        result_set = cursor.fetchall()
        column_names = cursor.column_names
        cursor.close()
        df = pd.DataFrame(result_set, columns=column_names)
        return df.dropna()

    def discretize_weights(weight):
        if weight < 50:
            return 0
        if 50 <= weight < 95:
            return 1
        if 95 <= weight <= 100:
            return 2

    # Modify the SQL query to use the provided tag_ids
    tag_ids_str = ', '.join(map(str, tag_ids))
    query = f"""
    SELECT ((grade-1)/4)*100 as weighted, attr_A, attr_B, attr_C, attr_E, attr_F, attr_H, attr_G, attr_I, attr_L, attr_M, attr_N, attr_O, attr_Q1, attr_Q2,
            attr_Q3, attr_Q4, attr_EX, attr_AX, attr_TM, attr_IN, attr_SC,
            cfit,
            CASE when course = 'BSCS' then 1 else 0 end as course_bscs,
            CASE when course = 'BSIT' then 1 else 0 end as course_bsit
    FROM students
    INNER JOIN assessments s on s.student_id = students.Id WHERE tagID in ({tag_ids_str});
    """

    df = fetch_data_as_dataframe(get_engine(), query)
    df = pd.get_dummies(df, columns=['cfit'])
    df.dropna(inplace=True)

    final_y = df['weighted'].apply(discretize_weights)
    final_df = df.copy()
    final_df_features = np.asarray(final_df.drop(columns=['weighted']), np.float64)

    def train_binary_classifier(X, y):
        smote = SMOTE()
        X_resampled, y_resampled = smote.fit_resample(X, y)
        X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2)

        model = Sequential([
            InputLayer(shape=(X_train.shape[1],)),
            Dense(128, activation='relu'),
            Dense(128, activation='relu'),
            Dense(128, activation='relu'),
            Dense(128, activation='relu'),
            Dense(128, activation='relu'),
            Dense(1, activation='sigmoid')
        ])

        model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
        model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2, verbose=0)
        print('Train model initial')
        return model.predict(final_df_features)

    df_corr = pd.DataFrame()
    df_corr['success'] = train_binary_classifier(df.drop(columns=['weighted']), final_y.apply(lambda x: x == 2))
    df_corr['failure'] = train_binary_classifier(df.drop(columns=['weighted']), final_y.apply(lambda x: x == 0))
    df_corr['pass'] = train_binary_classifier(df.drop(columns=['weighted']), final_y.apply(lambda x: x == 1))

    merged_input_features = pd.concat([df_corr, pd.DataFrame(final_df_features)], axis=1)

    y_one_hot = pd.get_dummies(final_y)

    def create_model(input_shape):
        model = Sequential([
            InputLayer(shape=(input_shape,)),
            Dense(128, activation='relu'),
            Dense(128, activation='relu'),
            Dense(128, activation='relu'),
            Dense(128, activation='relu'),
            Dense(3, activation='softmax')
        ])
        model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    n_splits = 5
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    acc_per_fold = []
    loss_per_fold = []

    for fold_no, (train, test) in enumerate(kfold.split(merged_input_features, y_one_hot), 1):
        model = create_model(merged_input_features.shape[1])
        
        history = model.fit(
            merged_input_features.iloc[train], 
            y_one_hot.iloc[train], 
            epochs=30, 
            batch_size=32, 
            validation_data=(merged_input_features.iloc[test], y_one_hot.iloc[test]),
            verbose=0
        )
        
        scores = model.evaluate(merged_input_features.iloc[test], y_one_hot.iloc[test], verbose=0)
        print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
        
        acc_per_fold.append(scores[1] * 100)
        loss_per_fold.append(scores[0])

    print('------------------------------------------------------------------------')
    print('Score per fold')
    for i in range(0, len(acc_per_fold)):
        print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
    print('------------------------------------------------------------------------')
    print('Average scores for all folds:')
    print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
    print(f'> Loss: {np.mean(loss_per_fold)}')
    print('------------------------------------------------------------------------')

    final_model = create_model(merged_input_features.shape[1])
    final_model.fit(merged_input_features, y_one_hot, epochs=30, batch_size=32, verbose=0)

    return final_model, merged_input_features

# Example usage:
# model, features = run_analysis([5, 6, 7])  # Replace with your desired tag IDs

In [74]:
merged_input_features

Unnamed: 0,success,failure,pass,0,1,2,3,4,5,6,...,19,20,21,22,23,24,25,26,27,28
0,1.210673e-09,0.757611,9.992988e-01,4.0,4.0,5.0,3.0,5.0,4.0,6.0,...,4.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,5.100537e-10,0.000065,2.185780e-08,5.0,6.0,7.0,1.0,5.0,6.0,5.0,...,3.0,6.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,9.249924e-18,0.012181,3.329029e-02,2.0,5.0,6.0,4.0,1.0,2.0,7.0,...,4.0,7.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,3.563413e-20,0.204300,1.163027e-02,5.0,6.0,5.0,4.0,7.0,6.0,6.0,...,4.0,6.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.651953e-19,0.999433,9.994544e-01,6.0,4.0,3.0,1.0,4.0,2.0,7.0,...,1.0,6.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473,2.493636e-22,0.742487,9.963731e-01,4.0,4.0,4.0,4.0,3.0,2.0,7.0,...,4.0,7.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
474,3.318062e-19,0.006182,6.388703e-03,5.0,4.0,5.0,9.0,5.0,5.0,6.0,...,8.0,7.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
475,2.226829e-16,0.045137,3.432620e-03,5.0,4.0,6.0,2.0,3.0,5.0,4.0,...,3.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
476,5.163840e-25,0.008712,1.506115e-02,5.0,3.0,5.0,7.0,4.0,6.0,3.0,...,7.0,5.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [75]:
df

Unnamed: 0,weighted,attr_A,attr_B,attr_C,attr_E,attr_F,attr_H,attr_G,attr_I,attr_L,...,attr_IN,attr_SC,course_bscs,course_bsit,cfit_A,cfit_AA,cfit_BA,cfit_H,cfit_L,cfit_M
0,0.000000,4,4,5,3,5,4,6,5,7,...,4,5,1,0,False,False,False,False,True,False
1,60.000002,5,6,7,1,5,6,5,7,5,...,3,6,1,0,False,True,False,False,False,False
2,57.499999,2,5,6,4,1,2,7,4,7,...,4,7,1,0,True,False,False,False,False,False
3,64.999998,5,6,5,4,7,6,6,7,9,...,4,6,1,0,False,True,False,False,False,False
4,0.000000,6,4,3,1,4,2,7,8,5,...,1,6,1,0,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473,0.000000,4,4,4,4,3,2,7,7,9,...,4,7,0,1,False,False,False,False,True,False
474,61.811566,5,4,5,9,5,5,6,5,8,...,8,7,0,1,False,False,False,True,False,False
475,56.534958,5,4,6,2,3,5,4,5,7,...,3,5,0,1,False,False,False,False,True,False
476,77.499998,5,3,5,7,4,6,3,3,7,...,7,5,0,1,True,False,False,False,False,False


In [76]:
y_one_hot

Unnamed: 0,0,1,2
0,True,False,False
1,False,True,False
2,False,True,False
3,False,True,False
4,True,False,False
...,...,...,...
473,True,False,False
474,False,True,False
475,False,True,False
476,False,True,False


In [77]:
final_model.predict(merged_input_features)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


array([[9.5701861e-01, 4.2186756e-02, 7.9461688e-04],
       [3.4556751e-07, 9.9999964e-01, 1.9087590e-08],
       [2.3443113e-07, 9.9999976e-01, 5.2051754e-09],
       ...,
       [6.1188825e-06, 9.9999356e-01, 3.8790984e-07],
       [1.9204290e-06, 9.9999809e-01, 4.3936126e-08],
       [1.0541964e-01, 8.9349461e-01, 1.0858263e-03]], dtype=float32)