In [133]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

repo_path = "/content/cit19_project"

if os.path.exists(repo_path): #check if repo_path exists. If it does, then it pulls the main branch from our git repo.
    %cd {repo_path}
    !git pull origin main
else:
    %cd /content
    !git clone https://github.com/XebastianePitogo/cit19_project.git
    %cd cit19_project #If none, colab will git clone the repo in path </content> and change directory inside repo.

print("\ncheck ls:")
!ls #list all folder and files inside repo to confirm it exists.

/content/cit19_project
From https://github.com/XebastianePitogo/cit19_project
 * branch            main       -> FETCH_HEAD
Already up to date.

check ls:
 preprocessed_df.csv   README.md  'RSTUDIO_CC19 RESEARCH'   Student-Employability-Datasets.csv


In [134]:
#%cd 'RSTUDIO_CC19 RESEARCH/'
df = pd.read_csv("preprocessed_df.csv")

In [135]:
df.head()

Unnamed: 0.1,Unnamed: 0,GENERAL.APPEARANCE,MANNER.OF.SPEAKING,PHYSICAL.CONDITION,MENTAL.ALERTNESS,SELF.CONFIDENCE,ABILITY.TO.PRESENT.IDEAS,COMMUNICATION.SKILLS,CLASS_factor,CLASS_encoded.CLASS_factorEmployable,CLASS_encoded.CLASS_factorLessEmployable
0,1,4,5,4,5,5,5,5,Employable,1,0
1,2,4,4,4,4,4,4,3,Employable,1,0
2,3,4,3,3,3,3,3,2,LessEmployable,0,1
3,4,3,3,3,2,3,3,3,LessEmployable,0,1
4,5,4,4,3,3,4,4,3,Employable,1,0


In [136]:
# Step 1: Model Selection

# Objective: Based on the nature of the data and the problem type (e.g., classification, or regression), choose the best model for the problem.

# Tasks:

# 1. Understand the Problem
# The task to be done is a classification problem as it predicts whether the student is employable or less employable (<*CLASS> is target variable, separated to factor and encoded for different models.)

#         We will utilize classification metrics for classification as it is more appropriate in our model.
#         Classification: Accuracy, Precision, Recall, F1-score
#
# 2. Choose Candidate Models
#     Consider the dataset size, feature complexity, and interpretability.

#     Classification:
#         Start with Logistic Regression or Decision Trees for interpretability.
#         If these models underperform, test Random Forest, Support Vector Machines (SVM), or k-NN.

# 3. Consider Model Complexity

# Objective: Decide whether to use a simple model (for interpret`ability and efficiency) or a complex model (for better accuracy on complex datasets).
#
#     We will use simple models first then follow up with more complex if the simple models underperform.

#     Compare bias-variance tradeoff:
#         Simple models (Logistic Regression, Decision Trees) are fast but may underfit.
#         Complex models (Random Forest, XGBoost, Neural Networks) capture more details but may overfit.
#         We will check evaluation metrics of each model to check which one is the most appropriate
#         Our dataset is small and structured, a Decision Tree might be sufficient.
#         We will use Random Forest or XGBoost if the simpler model underperforms or underfit.

In [137]:
# Step 2: Model Training

# Objective: Train the model(s) on the prepared dataset and tune their hyperparameters (if applicable).

# Tasks:

#     Train-Test Split:
#         Split the data into training and testing sets. Typically, the split ratio is 80/20 or 70/30 (training/testing).

# Define features (exclude the target variables which are *CLASSES)
X = df.iloc[:, :-3]

# Define target (assuming CLASS_factor is the main target)
#y = df["CLASS_factor"]

y = df.iloc[:, -2:].values #select one hot encoded target variable column

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

#     Train the Model:
#         Train the selected model on the training data.

## Using Logistric Regression
y_Logistic = np.argmax(y, axis=1) # Y needs to be in categorical integer instead of encoded, so convert it back first.
X_train, X_test, y_train, y_test = train_test_split(X, y_Logistic, test_size=0.2, random_state=42, stratify=y_Logistic)

In [138]:
LRmodel = LogisticRegression(max_iter=500, solver="lbfgs")
LRmodel.fit(X_train, y_train)

y_pred = LRmodel.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.2f}")

Logistic Regression Accuracy: 0.60


In [139]:
CLFmodel = DecisionTreeClassifier()
CLFmodel.fit(X_train,y_train)

y_pred = CLFmodel.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.2f}")

Logistic Regression Accuracy: 0.79


In [140]:
# We firstly used the simple models as our dataset has few features, small dataset size
# and has meaningful output based from our input for interpretability,
# in decision trees(all likert scaled features will eventually determine if a student is employable or less employable)

# Only the Logistic regression underperforms from using both simple models
# as the industry standard typically ranges from 70% to 90%.

In [141]:
#     Hyperparameter Tuning (if applicable):
#         Use GridSearchCV or RandomizedSearchCV for tuning hyperparameters.

In [142]:
!ls

 preprocessed_df.csv   README.md  'RSTUDIO_CC19 RESEARCH'   Student-Employability-Datasets.csv
