In [159]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tabulate import tabulate


In [160]:
# Load the dataset
file_path = r"C:\Users\psoha\Downloads\cleaned_and_transformed.csv"
data = pd.read_csv(file_path)

data


Unnamed: 0,School Name,City,AP Classes?,Dual Enrollment?,Offers Electives?,Math Score,English Score,National Rank,AZ Rank,Racial%-White,Racial%-Black,Racial%-Native,Racial%-Hispanic,Racial%-Asian,Racial%-Other,school_type
0,Arizona Lutheran Academy,Phoenix,Yes,Yes,Yes,28,28,3002,25,57.0,6.0,10.0,15.0,12.0,0,Public
1,Arizona Preparatory Academy,Phoenix,Yes,Yes,Yes,17,22,19568,310,12.0,11.0,1.0,71.0,1.0,2,Public
2,Arizona School For The Arts,Phoenix,Yes,Yes,Yes,27,30,481,145,63.0,4.0,1.0,22.0,5.0,5,Public
3,Arizona Virtual Academy,Phoenix,Yes,Yes,Yes,19,32,3635,980,69.0,6.0,2.0,15.0,1.0,6,Public
4,Mesquite High School,Gilbert,Yes,Yes,Yes,80,75,1841,58,60.0,20.0,10.0,10.0,5.0,5,Private
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,Franklin Police and Fire High School,Phoenix,Yes,Yes,Yes,43,65,4136,65,3.1,0.7,1.7,93.4,1.0,0,Private
200,Mountain Ridge High School,Glendale,yes,yes,yes,51,55,2046,28,69.0,2.0,1.0,13.0,10.0,5,Public
201,Raymond S. Kellis,Glendale,yes,yes,yes,48,43,7392,126,24.0,8.0,1.0,57.0,5.0,5,Private
202,Sandra Day O'Connor High School,Glendale,yes,yes,yes,70,69,2622,41,74.0,2.0,1.0,14.0,5.0,4,Private


In [161]:
# Convert 'Math Score' to numeric type, replacing errors with NaN
data['Math Score'] = pd.to_numeric(data['Math Score'], errors='coerce')

# Define performance levels based on Math Score
data['Performance_Level'] = 'Low Performance'
data.loc[data['Math Score'] > 80, 'Performance_Level'] = 'High Performance'

data.drop(['School Name', 'City'], axis=1, inplace=True)

In [162]:
data


Unnamed: 0,AP Classes?,Dual Enrollment?,Offers Electives?,Math Score,English Score,National Rank,AZ Rank,Racial%-White,Racial%-Black,Racial%-Native,Racial%-Hispanic,Racial%-Asian,Racial%-Other,school_type,Performance_Level
0,Yes,Yes,Yes,28.0,28,3002,25,57.0,6.0,10.0,15.0,12.0,0,Public,Low Performance
1,Yes,Yes,Yes,17.0,22,19568,310,12.0,11.0,1.0,71.0,1.0,2,Public,Low Performance
2,Yes,Yes,Yes,27.0,30,481,145,63.0,4.0,1.0,22.0,5.0,5,Public,Low Performance
3,Yes,Yes,Yes,19.0,32,3635,980,69.0,6.0,2.0,15.0,1.0,6,Public,Low Performance
4,Yes,Yes,Yes,80.0,75,1841,58,60.0,20.0,10.0,10.0,5.0,5,Private,Low Performance
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,Yes,Yes,Yes,43.0,65,4136,65,3.1,0.7,1.7,93.4,1.0,0,Private,Low Performance
200,yes,yes,yes,51.0,55,2046,28,69.0,2.0,1.0,13.0,10.0,5,Public,Low Performance
201,yes,yes,yes,48.0,43,7392,126,24.0,8.0,1.0,57.0,5.0,5,Private,Low Performance
202,yes,yes,yes,70.0,69,2622,41,74.0,2.0,1.0,14.0,5.0,4,Private,Low Performance


In [163]:
# Convert 'Yes' and 'No' values to 1 and 0
binary_columns = ['AP Classes?', 'Dual Enrollment?', 'Offers Electives?']
for col in binary_columns:
    data[col] = (data[col] == 'Yes').astype(int)

print("Converted binary columns successfully!")
data[binary_columns].head()


Converted binary columns successfully!


Unnamed: 0,AP Classes?,Dual Enrollment?,Offers Electives?
0,1,1,1
1,1,1,1
2,1,1,1
3,1,1,1
4,1,1,1


In [164]:
# Apply one-hot encoding to categorical variables
data = pd.get_dummies(data, columns=['school_type'])

print("Applied one-hot encoding.")
data.head()


Applied one-hot encoding.


Unnamed: 0,AP Classes?,Dual Enrollment?,Offers Electives?,Math Score,English Score,National Rank,AZ Rank,Racial%-White,Racial%-Black,Racial%-Native,Racial%-Hispanic,Racial%-Asian,Racial%-Other,Performance_Level,school_type_Private,school_type_Public
0,1,1,1,28.0,28,3002,25,57.0,6.0,10.0,15.0,12.0,0,Low Performance,False,True
1,1,1,1,17.0,22,19568,310,12.0,11.0,1.0,71.0,1.0,2,Low Performance,False,True
2,1,1,1,27.0,30,481,145,63.0,4.0,1.0,22.0,5.0,5,Low Performance,False,True
3,1,1,1,19.0,32,3635,980,69.0,6.0,2.0,15.0,1.0,6,Low Performance,False,True
4,1,1,1,80.0,75,1841,58,60.0,20.0,10.0,10.0,5.0,5,Low Performance,True,False


In [165]:
# Define numerical features
num_features = ['Math Score', 'English Score', 'National Rank', 'AZ Rank', 
                'Racial%-White', 'Racial%-Black', 'Racial%-Native', 
                'Racial%-Hispanic', 'Racial%-Asian', 'Racial%-Other']

# Convert to numeric (if needed) and handle errors
for col in num_features:
    data[col] = pd.to_numeric(data[col], errors='coerce')  # Converts invalid values to NaN
    data[col].fillna(data[col].median(), inplace=True)  # Fill NaN with column median

# Apply StandardScaler
scaler = StandardScaler()
data[num_features] = scaler.fit_transform(data[num_features])



In [166]:
# Define features and target variable
X = data.drop('Performance_Level', axis=1)
y = data['Performance_Level']

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)


Shape of X: (204, 15)
Shape of y: (204,)


In [170]:
# Define number of folds for Stratified K-Fold Cross-Validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Define classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB()
}


In [171]:
for clf_name, clf in classifiers.items():
    accuracies, precisions, recalls, f1_scores = [], [], [], []
    
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train the model
        clf.fit(X_train, y_train)
        
        # Predict on the test set
        y_pred = clf.predict(X_test)
        
        # Calculate evaluation metrics
        accuracies.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred, average='weighted'))
        recalls.append(recall_score(y_test, y_pred, average='weighted'))
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    # Store average performance metrics
    evaluation_metrics['Model'].append(clf_name)
    evaluation_metrics['Accuracy'].append(np.mean(accuracies))
    evaluation_metrics['Precision'].append(np.mean(precisions))
    evaluation_metrics['Recall'].append(np.mean(recalls))
    evaluation_metrics['F1-score'].append(np.mean(f1_scores))

In [172]:
# Convert metrics dictionary to a DataFrame
df_results = pd.DataFrame(evaluation_metrics)

# Print formatted table of model performance
print("\nModel Performance Comparison:\n")
print(tabulate(df_results, headers='keys', tablefmt='fancy_grid'))


Model Performance Comparison:

╒════╤═════════════════════╤════════════╤═════════════╤══════════╤════════════╕
│    │ Model               │   Accuracy │   Precision │   Recall │   F1-score │
╞════╪═════════════════════╪════════════╪═════════════╪══════════╪════════════╡
│  0 │ Logistic Regression │   0.941098 │    0.94262  │ 0.941098 │   0.935416 │
├────┼─────────────────────┼────────────┼─────────────┼──────────┼────────────┤
│  1 │ Decision Tree       │   1        │    1        │ 1        │   1        │
├────┼─────────────────────┼────────────┼─────────────┼──────────┼────────────┤
│  2 │ Random Forest       │   1        │    1        │ 1        │   1        │
├────┼─────────────────────┼────────────┼─────────────┼──────────┼────────────┤
│  3 │ Gradient Boosting   │   1        │    1        │ 1        │   1        │
├────┼─────────────────────┼────────────┼─────────────┼──────────┼────────────┤
│  4 │ SVM                 │   0.936341 │    0.935047 │ 0.936341 │   0.922311 │
├────┼──