In [None]:
!pip install matplotlib_venn 

In [None]:
import os
from itertools import combinations

import matplotlib.pyplot as plt
import matplotlib_venn as venn
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_curve,
)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, learning_curve
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier


# Load Data
# The CSV location can be overridden through the COURSE_DATA_CSV environment
# variable so the notebook also runs on machines other than the author's; when
# the variable is unset the original absolute path is used.
DATA_CSV_PATH = os.environ.get(
    "COURSE_DATA_CSV",
    "/Users/pavitraaritas/Documents/PavitraAritasRaghunathSharma-CS438/Data/Data_CourseCompatibility.csv",
)
df = pd.read_csv(DATA_CSV_PATH)

display(df.head())
df.info()

In [None]:
# How many students fall into each Suitable_1 (BP course) class.
bp_class_counts = df['Suitable_1'].value_counts()
print("\nClass distribution for Suitable_1 (BP course):")
print(bp_class_counts)

In [None]:
# Clean & Preprocess Data

# Define feature and target columns for BP course (Suitable_1).
# categorical_cols are one-hot encoded further down in this cell; numerical_cols
# are used as numeric features ('StudyHours' is appended to the list once it has
# been parsed into a number).
categorical_cols = ['Semester', 'LearningStyle', 'CareerTrack', 'Reason_1']
numerical_cols = ['OverallCourseRating', 'InterestLevel_1']
target_col = 'Suitable_1'

def convert_study_hours(hours):
    """Convert a free-text study-hours answer into a single number of hours.

    Recognised forms (case-insensitive, an optional 'hours' suffix is removed):
      * "a-b"  -> midpoint of the range, (a + b) / 2
      * "n+"   -> n
      * "<n"   -> n - 1, but never below 0
      * "n"    -> n

    Parameters
    ----------
    hours : Any
        Raw cell value; missing values (None / NaN) are accepted.

    Returns
    -------
    float
        The parsed number of hours, or ``np.nan`` when the value is missing or
        cannot be parsed. The function never raises on malformed text.
    """
    if pd.isna(hours):
        return np.nan

    text = str(hours).lower().strip()
    if 'hours' in text:
        text = text.replace('hours', '').strip()

    try:
        if '-' in text:
            # Unpacking also raises ValueError for inputs such as "1-2-3".
            low, high = text.split('-')
            return (float(low) + float(high)) / 2
        if '+' in text:
            return float(text.replace('+', ''))
        if '<' in text:
            # "<n" is approximated as n - 1; clamp so "<0.5" cannot go negative.
            return max(float(text.replace('<', '')) - 1, 0.0)
        return float(text)
    except ValueError:
        # Unparseable answers are treated as missing and imputed later.
        return np.nan

# Parse the free-text StudyHours answers into numeric hours.
df['StudyHours'] = df['StudyHours'].apply(convert_study_hours)
if 'StudyHours' not in numerical_cols:
    numerical_cols.append('StudyHours')

# Handle Missing Values
# Rows without a label cannot be used for supervised learning. The explicit
# .copy() makes the filtered frame independent, so the column assignments below
# operate on a real frame rather than a potential view.
df = df.dropna(subset=[target_col]).copy()

# Categorical gaps -> most frequent value; numerical gaps -> column mean.
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

for col in numerical_cols:
    df[col] = df[col].fillna(df[col].mean())

print("\nUnique values in Suitable_1 before mapping:")
print(df[target_col].unique())

# Map 'Yes'/'No' to 1/0 only while the column is still textual. If this cell is
# re-run, the column is already numeric and mapping it again would turn every
# label into NaN and drop all rows.
if not pd.api.types.is_numeric_dtype(df[target_col]):
    df[target_col] = df[target_col].map({'Yes': 1, 'No': 0})
# Labels other than 'Yes'/'No' became NaN above and are discarded here.
df = df.dropna(subset=[target_col]).copy()

# One-hot encode the categoricals (first level dropped as the reference level).
df_encoded = pd.get_dummies(df[categorical_cols], drop_first=True)

X = pd.concat([df[numerical_cols], df_encoded], axis=1)
y = df[target_col]

# Cap extreme study-hour values at 50 in the feature matrix only.
X['StudyHours'] = X['StudyHours'].clip(upper=50)

print("\nMissing values in X after preprocessing:")
print(X.isna().sum())
print("\nPreprocessed Features (first 5 rows):")
display(X.head())
print("\nTarget (first 5 rows):")
print(y.head())


In [None]:
# Data Splitting for Tuning

# 20% is held out as the test set; 25% of the remaining 80% (20% overall)
# becomes the validation set, leaving 60% for training. Both splits are
# stratified on the label.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=42
)

for split_label, split_X in (("\nTraining", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_label} set size: {len(split_X)}")

In [None]:
# Hyperparameter Tuning

# Logistic Regression search space: regularisation strength and penalty type,
# both supported by the liblinear solver.
lr_param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)
lr_grid = GridSearchCV(
    LogisticRegression(max_iter=1000),
    lr_param_grid,
    cv=3,
    scoring='accuracy',
)
lr_grid.fit(X_train, y_train)

# best_score_ is the mean 3-fold cross-validated accuracy on the training split.
print("Best parameters for Logistic Regression:", lr_grid.best_params_)
print("Best validation accuracy:", lr_grid.best_score_)

In [None]:
# Cross-validated accuracy against C, one line per penalty type.
lr_results = pd.DataFrame(lr_grid.cv_results_)
plt.figure(figsize=(8, 5))
for penalty in ('l1', 'l2'):
    penalty_rows = lr_results.loc[lr_results['param_penalty'] == penalty]
    plt.plot(
        penalty_rows['param_C'],
        penalty_rows['mean_test_score'],
        label=f'Penalty: {penalty}',
    )
plt.xscale('log')  # the C grid spans several orders of magnitude
plt.xlabel('C (Inverse of Regularization Strength)')
plt.ylabel('Validation Accuracy')
plt.title('Logistic Regression Hyperparameter Tuning')
plt.legend()
plt.show()

In [None]:
# Tune Decision Tree
dt_param_grid = dict(
    max_depth=[2, 3, 4, 5, 6],
    min_samples_split=[2, 5, 10],
)
dt_grid = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    dt_param_grid,
    cv=3,
    scoring='accuracy',
)
dt_grid.fit(X_train, y_train)

# Best parameters and their mean cross-validated accuracy
print("Best parameters for Decision Tree:", dt_grid.best_params_)
print("Best validation accuracy:", dt_grid.best_score_)

# Cross-validated accuracy against max_depth, one line per min_samples_split
dt_results = pd.DataFrame(dt_grid.cv_results_)
plt.figure(figsize=(8, 5))
for min_samples in (2, 5, 10):
    split_rows = dt_results.loc[dt_results['param_min_samples_split'] == min_samples]
    plt.plot(
        split_rows['param_max_depth'],
        split_rows['mean_test_score'],
        label=f'min_samples_split: {min_samples}',
    )
plt.xlabel('Max Depth')
plt.ylabel('Validation Accuracy')
plt.title('Decision Tree Hyperparameter Tuning')
plt.legend()
plt.show()


In [None]:
# Progressive Modeling with Tuned Models
# results: display name -> accuracy; models: model name -> fitted estimator.
results = {}
models = {}

# Logistic Regression (Tuned): refit with the best grid-search parameters and
# score on the held-out validation split.
print("\n=== Stage 1: Tuned Logistic Regression ===")
model_lr = LogisticRegression(max_iter=1000, **lr_grid.best_params_)
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_val)
accuracy_lr = accuracy_score(y_val, y_pred_lr)

models['Logistic Regression'] = model_lr
results['Logistic Regression (Tuned)'] = accuracy_lr

print(f"Validation Accuracy: {accuracy_lr:.3f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred_lr))

cm_lr = confusion_matrix(y_val, y_pred_lr)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix (Tuned Logistic Regression)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:

# Logistic Regression with K-Fold Cross-Validation
# Runs stratified 5-fold CV over train+validation (X_temp); the test split is
# not touched.
print("\n=== Stage 2: Logistic Regression with K-Fold Cross-Validation ===")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores_lr_kfold = cross_val_score(
    model_lr, X_temp, y_temp, cv=skf, scoring='accuracy'
)
accuracy_lr_kfold = scores_lr_kfold.mean()
results['Logistic Regression (K-Fold)'] = accuracy_lr_kfold
print(f"Average Accuracy (5-Fold CV): {accuracy_lr_kfold:.3f}")
print(f"Standard Deviation: {scores_lr_kfold.std():.3f}")

In [None]:

# Decision Tree - Tuned: refit with the best grid-search parameters and score
# on the held-out validation split.
print("\n=== Stage 3: Tuned Decision Tree ===")
model_dt = DecisionTreeClassifier(random_state=42, **dt_grid.best_params_)
model_dt.fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_val)
accuracy_dt = accuracy_score(y_val, y_pred_dt)

models['Decision Tree'] = model_dt
results['Decision Tree (Tuned)'] = accuracy_dt

print(f"Validation Accuracy: {accuracy_dt:.3f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred_dt))


# Plot confusion matrix
cm_dt = confusion_matrix(y_val, y_pred_dt)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_dt, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix (Tuned Decision Tree)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# Compare Results for Validation Set
# Prints and plots every accuracy collected in `results` so far.
for model_label, model_accuracy in results.items():
    print(f"{model_label}: {model_accuracy:.3f}")

plt.figure(figsize=(8, 5))
plt.bar(list(results.keys()), list(results.values()))
plt.title("Model Accuracy Comparison (Validation Set)")
plt.ylabel("Accuracy")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Feature Importance

# Logistic regression: signed coefficient per feature.
print("\nFeature Importance (Logistic Regression):")
importances_lr = pd.Series(model_lr.coef_[0], index=X.columns)
ax_lr = importances_lr.sort_values().plot(kind='barh', figsize=(10, 6))
ax_lr.set_title("Feature Importance (Logistic Regression)")
plt.show()

# Decision tree: the estimator's feature_importances_ per feature.
print("\nFeature Importance (Decision Tree):")
importances_dt = pd.Series(model_dt.feature_importances_, index=X.columns)
ax_dt = importances_dt.sort_values().plot(kind='barh', figsize=(10, 6))
ax_dt.set_title("Feature Importance (Decision Tree)")
plt.show()

In [None]:
# Alternative 1: Logistic Regression with Polynomial Features

# Degree-2 expansion of the numerical columns only; the expansion is fitted on
# the training split and merely applied to validation and test.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_train[numerical_cols])


def _attach_poly_features(poly_values, base_features):
    """Replace the numerical columns of `base_features` with `poly_values`.

    The polynomial columns are named poly_0, poly_1, ... and placed before the
    remaining (non-numerical) columns, keeping the index of `base_features`.
    """
    poly_names = [f"poly_{i}" for i in range(poly_values.shape[1])]
    poly_frame = pd.DataFrame(poly_values, columns=poly_names, index=base_features.index)
    return pd.concat([poly_frame, base_features.drop(columns=numerical_cols)], axis=1)


X_poly_train = _attach_poly_features(X_poly, X_train)
X_poly_val = _attach_poly_features(poly.transform(X_val[numerical_cols]), X_val)
X_poly_test = _attach_poly_features(poly.transform(X_test[numerical_cols]), X_test)

lr_poly_grid = GridSearchCV(
    LogisticRegression(max_iter=1000), lr_param_grid, cv=3, scoring='accuracy'
)
lr_poly_grid.fit(X_poly_train, y_train)

model_lr_poly = LogisticRegression(max_iter=1000, **lr_poly_grid.best_params_)
model_lr_poly.fit(X_poly_train, y_train)
y_pred_lr_poly = model_lr_poly.predict(X_poly_val)
accuracy_lr_poly = accuracy_score(y_val, y_pred_lr_poly)

models['Logistic Regression (Poly)'] = model_lr_poly
results['Logistic Regression (Poly)'] = accuracy_lr_poly

print(f"Validation Accuracy: {accuracy_lr_poly:.3f}")
print("Best parameters:", lr_poly_grid.best_params_)

In [None]:
# Alternative 2: Logistic Regression with Reduced Features

# Keep the five features with the largest absolute logistic-regression coefficient.
ranked_abs_coefs = importances_lr.abs().sort_values(ascending=False)
top_features = ranked_abs_coefs.head(5).index
X_reduced_train, X_reduced_val, X_reduced_test = (
    X_train[top_features],
    X_val[top_features],
    X_test[top_features],
)

lr_reduced_grid = GridSearchCV(
    LogisticRegression(max_iter=1000), lr_param_grid, cv=3, scoring='accuracy'
)
lr_reduced_grid.fit(X_reduced_train, y_train)

model_lr_reduced = LogisticRegression(max_iter=1000, **lr_reduced_grid.best_params_)
model_lr_reduced.fit(X_reduced_train, y_train)
y_pred_lr_reduced = model_lr_reduced.predict(X_reduced_val)
accuracy_lr_reduced = accuracy_score(y_val, y_pred_lr_reduced)

models['Logistic Regression (Reduced)'] = model_lr_reduced
results['Logistic Regression (Reduced)'] = accuracy_lr_reduced

print(f"Validation Accuracy: {accuracy_lr_reduced:.3f}")
print("Best parameters:", lr_reduced_grid.best_params_)
print("Selected features:", top_features.tolist())

In [None]:
# Alternative 3: Random Forest

# Grid over forest size and tree depth, scored by 3-fold CV accuracy.
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
)
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    cv=3,
    scoring='accuracy',
)
rf_grid.fit(X_train, y_train)

model_rf = RandomForestClassifier(random_state=42, **rf_grid.best_params_)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_val)
accuracy_rf = accuracy_score(y_val, y_pred_rf)

models['Random Forest'] = model_rf
results['Random Forest'] = accuracy_rf

print(f"Validation Accuracy: {accuracy_rf:.3f}")
print("Best parameters:", rf_grid.best_params_)

In [None]:
# Plot learning curves 
def plot_learning_curve(estimator, title, X, y, cv=None, train_sizes=np.linspace(0.1, 1.0, 5)):
    """Plot a learning curve for `estimator` and print two heuristic verdicts.

    Accuracy is computed with sklearn's `learning_curve` for each requested
    training-set size. The mean training and validation scores are plotted with
    a +/- one standard deviation band, then two messages are printed, both based
    on the scores at the largest training size:

    * Fit Analysis - overfitting, underfitting or good fit.
    * Data Need - whether more training data looks likely to help.

    Parameters
    ----------
    estimator : sklearn estimator to evaluate.
    title : str used as the plot title and in the printed messages.
    X, y : features and labels passed to `learning_curve`.
    cv : cross-validation splitter or None, forwarded to `learning_curve`.
    train_sizes : training-set sizes forwarded to `learning_curve`.
    """
    train_sizes, train_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=train_sizes, scoring='accuracy'
    )
    # Aggregate across CV folds (axis 1) for every training size.
    train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
    val_mean, val_std = np.mean(val_scores, axis=1), np.std(val_scores, axis=1)

    curves = (
        (train_mean, train_std, "r", "Training score"),
        (val_mean, val_std, "g", "Validation score"),
    )

    plt.figure(figsize=(8, 5))
    for mean, std, colour, _ in curves:
        plt.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(train_sizes, mean, 'o-', color=colour, label=label)
    plt.title(title)
    plt.xlabel("Training Examples")
    plt.ylabel("Accuracy")
    plt.legend(loc="best")
    plt.show()

    # Analyze fit using the scores at the largest training size.
    final_train = train_mean[-1]
    final_val = val_mean[-1]
    overfitting = final_train > final_val + 0.1 and final_train > 0.9
    underfitting = final_train < 0.7 and final_val < 0.7
    if overfitting:
        fit_status = "Overfitting: High training accuracy, low validation accuracy."
    elif underfitting:
        fit_status = "Underfitting: Both training and validation accuracies are low."
    else:
        fit_status = "Good fit: Training and validation accuracies are close and reasonable."
    print(f"Fit Analysis - {title}: {fit_status}")

    # More data is suggested only while validation accuracy is below 0.8 and has
    # risen by more than 0.05 between the smallest and largest training size.
    still_improving = final_val < 0.8 and (final_val - val_mean[0]) > 0.05
    data_need = (
        "More training data may help: Validation accuracy is improving with more data."
        if still_improving
        else "More data may not significantly help: Validation accuracy has plateaued."
    )
    print(f"Data Need - {title}: {data_need}")


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# (estimator, plot title, features, labels) for every learning curve to draw.
learning_curve_jobs = [
    (model_lr, "Learning Curve (Logistic Regression)", X_temp, y_temp),
    (model_dt, "Learning Curve (Decision Tree)", X_temp, y_temp),
    (model_lr_poly, "Learning Curve (Logistic Regression - Poly)", X_poly_train, y_train),
    (model_lr_reduced, "Learning Curve (Logistic Regression - Reduced)", X_reduced_train, y_train),
    (model_rf, "Learning Curve (Random Forest)", X_temp, y_temp),
]
for curve_estimator, curve_title, curve_X, curve_y in learning_curve_jobs:
    plot_learning_curve(curve_estimator, curve_title, curve_X, curve_y, cv=cv)


In [None]:
# ROC Curves and AUC
# Models trained on a transformed feature set need the matching test matrix;
# every other model is scored on X_test.
roc_inputs_by_model = {
    'Logistic Regression (Poly)': X_poly_test,
    'Logistic Regression (Reduced)': X_reduced_test,
}

plt.figure(figsize=(10, 6))
for name, model in models.items():
    X_input = roc_inputs_by_model.get(name, X_test)

    # Probability of the positive class (second column of predict_proba).
    y_pred_proba = model.predict_proba(X_input)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')

# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for All Models')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Precision, Recall, and F1 on Test Set
# The metric functions are imported in the notebook's top import cell rather
# than here, so all dependencies are declared in one place.
print("\n=== Precision, Recall, and F1 on Test Set ===")

# Models trained on a transformed feature set need the matching test matrix;
# every other model is scored on X_test.
prf_inputs_by_model = {
    'Logistic Regression (Poly)': X_poly_test,
    'Logistic Regression (Reduced)': X_reduced_test,
}

for name, model in models.items():
    X_input = prf_inputs_by_model.get(name, X_test)

    y_pred = model.predict(X_input)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    print(f"{name}: Precision = {precision:.3f}, Recall = {recall:.3f}, F1 = {f1:.3f}")

In [None]:
# Compare Errors with Venn Diagram
# error_indices: model name -> set of test-row index labels it misclassified.
venn_inputs_by_model = {
    'Logistic Regression (Poly)': X_poly_test,
    'Logistic Regression (Reduced)': X_reduced_test,
}

error_indices = {}
for name, model in models.items():
    X_input = venn_inputs_by_model.get(name, X_test)

    y_pred = model.predict(X_input)
    errors = set(X_test.index[y_test != y_pred])
    error_indices[name] = errors

# Venn Diagram for three models (Logistic Regression, Decision Tree, Random Forest)
venn_model_names = ('Logistic Regression', 'Decision Tree', 'Random Forest')
plt.figure(figsize=(8, 6))
venn.venn3(
    [error_indices[model_name] for model_name in venn_model_names],
    set_labels=venn_model_names,
)
plt.title("Venn Diagram of Misclassification Errors")
plt.show()


In [None]:
# Analyse Sample Errors
print("\n=== Sample Misclassification Analysis ===")
# Take (up to) two test rows that both Logistic Regression and Decision Tree got wrong.
common_errors_set = error_indices['Logistic Regression'] & error_indices['Decision Tree']
common_errors = list(common_errors_set)[:2]
for idx in common_errors:
    single_row = X_test.loc[[idx]]  # one-row frame, the shape predict() expects
    lr_prediction = models['Logistic Regression'].predict(single_row)[0]
    dt_prediction = models['Decision Tree'].predict(single_row)[0]

    print(f"\nMisclassified Student (Index {idx}):")
    print("Features:", X_test.loc[idx].to_dict())
    print("True Label:", y_test.loc[idx])
    print("Logistic Regression Prediction:", lr_prediction)
    print("Decision Tree Prediction:", dt_prediction)
    # The same explanatory sentence is printed for every sample.
    print("Analysis: This student may have been misclassified due to low InterestLevel_1 or a mismatch between CareerTrack and course requirements.")


# Recommender System 

In [None]:
# Target columns for all courses
# Course i (1-based) uses the columns Suitable_i, InterestLevel_i and Reason_i.
course_columns = [
    {
        'target': f'Suitable_{number}',
        'interest': f'InterestLevel_{number}',
        'reason': f'Reason_{number}',
        'course': code,
    }
    for number, code in enumerate(['BP', 'BI', 'BDCN', 'PCM', 'ADML'], start=1)
]

# Report label availability for every course.
for course in course_columns:
    target_col = course['target']
    course_name = course['course']
    missing_labels = df[target_col].isna()
    print(f"\n{course_name} ({target_col}):")
    print(f"Missing values: {missing_labels.sum()} out of {len(df)} ({missing_labels.mean() * 100:.1f}%)")
    print(f"Unique values: {df[target_col].unique()}")

In [None]:
# Mapping of courses to their ideal CareerTrack
course_to_career_mapping = {
    'BP': 'Software Development',
    'BI': 'Business Analytics',
    'BDCN': 'Cybersecurity',
    'PCM': 'Project Management / Business Analysis',
    'ADML': 'Data Science / AI'
}

# One 0/1 column per course, indexed like df. A course that cannot be modelled
# gets an all-zero column.
suitability_predictions = pd.DataFrame(index=df.index)

for course in course_columns:
    target_col = course['target']
    interest_col = course['interest']
    reason_col = course['reason']
    course_name = course['course']

    if df[target_col].isna().sum() > len(df) * 0.8:
        print(f"Skipping {course_name}: Too many missing values in {target_col}")
        suitability_predictions[course_name] = 0
        continue

    temp_categorical_cols = ['Semester', 'LearningStyle', 'CareerTrack', reason_col]
    temp_numerical_cols = ['OverallCourseRating', interest_col, 'StudyHours']

    # Impute the features once, on every row, so the model is trained and
    # applied on identically prepared data.
    imputed_df = df.copy()
    for col in temp_categorical_cols:
        imputed_df[col] = imputed_df[col].fillna(imputed_df[col].mode()[0])
    for col in temp_numerical_cols:
        imputed_df[col] = imputed_df[col].fillna(imputed_df[col].mean())

    # Missing labels are treated as "not suitable".
    temp_df = imputed_df.copy()
    target_is_numeric = pd.api.types.is_numeric_dtype(temp_df[target_col])
    temp_df[target_col] = temp_df[target_col].fillna(0 if target_is_numeric else 'No')

    print(f"\nUnique values in {target_col} for {course_name}:")
    print(temp_df[target_col].unique())

    if target_is_numeric:
        # A target that an earlier cell already converted to 0/1 must not be mapped again.
        if set(temp_df[target_col].unique()).issubset({0, 1}):
            print(f"{target_col} already contains binary values [0, 1]. Skipping mapping.")
        else:
            print(f"Unexpected integer values in {target_col}: {temp_df[target_col].unique()}")
            suitability_predictions[course_name] = 0
            continue
    else:
        temp_df[target_col] = temp_df[target_col].map({'Yes': 1, 'No': 0})

    # Labels other than 'Yes'/'No' became NaN in the mapping; drop those rows.
    temp_df = temp_df.dropna(subset=[target_col])

    if temp_df.empty:
        print(f"Skipping {course_name}: No valid target values after mapping")
        suitability_predictions[course_name] = 0
        continue

    temp_y = temp_df[target_col]
    if temp_y.nunique() < 2:
        # LogisticRegression cannot be fitted on a single class.
        print(f"Skipping {course_name}: {target_col} contains a single class")
        suitability_predictions[course_name] = 0
        continue

    temp_encoded = pd.get_dummies(temp_df[temp_categorical_cols], drop_first=True)
    temp_X = pd.concat([temp_df[temp_numerical_cols], temp_encoded], axis=1)

    temp_model = LogisticRegression(max_iter=1000, class_weight='balanced')
    temp_model.fit(temp_X, temp_y)

    # Encode every row without dropping a level, then align to the training
    # columns: levels unseen in training are discarded and missing ones are 0.
    dummy_cols = [col for col in temp_X.columns if col not in temp_numerical_cols]
    full_encoded = pd.get_dummies(imputed_df[temp_categorical_cols]).reindex(
        columns=dummy_cols, fill_value=0
    )
    full_X = pd.concat([imputed_df[temp_numerical_cols], full_encoded], axis=1)
    full_X = full_X.fillna(full_X.mean())
    full_X = full_X[temp_X.columns]

    suitability_probs = temp_model.predict_proba(full_X)[:, 1]

    # Career-track adjustment, read from the CareerTrack column itself. Reading
    # it from the one-hot columns would skip the level removed by drop_first.
    ideal_career = course_to_career_mapping[course_name]
    career = imputed_df['CareerTrack'].to_numpy()
    is_ideal = career == ideal_career
    is_data_science_elsewhere = (career == 'Data Science / AI') & ~is_ideal
    suitability_probs = np.where(is_ideal, suitability_probs * 2.0, suitability_probs)
    suitability_probs = np.where(is_data_science_elsewhere, suitability_probs * 0.5, suitability_probs)
    # Doubling can exceed 1; clipping keeps a valid probability and does not
    # change the 0.5 decision below.
    suitability_probs = np.clip(suitability_probs, 0.0, 1.0)

    suitability_probs_df = pd.Series(suitability_probs, index=full_X.index)

    suitability_predictions[course_name] = (suitability_probs_df >= 0.5).astype(int)

print("\nSuitability Predictions for All Courses (first 5 rows):")
display(suitability_predictions.head())

In [None]:
# Summarize Course Profiles
# Uses course_to_career_mapping and suitability_predictions from the
# suitability-prediction cell above.

course_profiles_summary = {}

# Grid for all CareerTrack distribution plots: two columns, at least three rows
# (the original 3x2 layout), growing if there are more than six courses.
course_names = list(suitability_predictions.columns)
n_cols = 2
n_rows = max(3, -(-len(course_names) // n_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 6 * n_rows))
axes = axes.flatten()

for idx, course in enumerate(course_names):
    ax = axes[idx]
    suitable_students = suitability_predictions[course] == 1
    if suitable_students.sum() == 0:
        print(f"No students found suitable for {course}")
        course_profiles_summary[course] = "No suitable students"
        ax.text(0.5, 0.5, f"No suitable students for {course}",
                horizontalalignment='center', verticalalignment='center', fontsize=12)
        ax.set_title(f"CareerTrack Distribution for {course}", fontsize=14, pad=15)
        continue

    suitable_df = df.loc[suitable_students, ['CareerTrack', 'LearningStyle']]

    # Share (in %) of every CareerTrack among the suitable students.
    career_pct = suitable_df['CareerTrack'].value_counts(normalize=True) * 100

    # Look the ideal career up in the full distribution so it is reported
    # correctly even when it is not among the five plotted tracks.
    ideal_career = course_to_career_mapping[course]
    ideal_career_pct = career_pct.get(ideal_career, 0.0)

    career_counts = career_pct.head(5)  # only the five largest tracks are plotted

    learning_counts = suitable_df['LearningStyle'].value_counts(normalize=True) * 100
    top_learning = learning_counts.index[0]
    top_learning_pct = learning_counts.iloc[0]

    summary = f"{course} is ideal for {ideal_career} ({ideal_career_pct:.1f}%) learners. Top CareerTracks: "
    for career, pct in career_counts.head(2).items():
        summary += f"{career} ({pct:.1f}%), "
    summary = summary.rstrip(", ") + f". Top LearningStyle: {top_learning} ({top_learning_pct:.1f}%)."
    course_profiles_summary[course] = summary
    print(summary)

    bars = ax.bar(career_counts.index, career_counts.values, color='skyblue', edgecolor='black')

    # Percentage label above every bar.
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, height + 1, f'{height:.1f}%',
                ha='center', va='bottom', fontsize=10)

    ax.set_title(f"CareerTrack Distribution for {course}", fontsize=14, pad=15)
    ax.set_xlabel("CareerTrack", fontsize=12)
    ax.set_ylabel("Percentage", fontsize=12)
    ax.set_ylim(0, max(career_counts.values) + 10)  # headroom for the labels
    ax.tick_params(axis='x', rotation=45, labelsize=10)

    ax.margins(x=0.05)
    plt.setp(ax.get_xticklabels(), ha='right')

# Hide every grid cell that did not receive a course.
for unused_ax in axes[len(course_names):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# CareerTrack-Focused Recommender System

# course -> most frequent CareerTrack among the students predicted suitable for
# it, or None when no student was predicted suitable.
course_dominant_career = {}
for course in suitability_predictions.columns:
    suitable_students = suitability_predictions[course] == 1
    if not suitable_students.any():
        course_dominant_career[course] = None
        continue
    career_counts = df[suitable_students]['CareerTrack'].value_counts(normalize=True)
    course_dominant_career[course] = career_counts.index[0]

In [None]:
# Function to recommend courses based on student input

# Cache: course name -> (fitted model, feature column order), or None when the
# course cannot be modelled. Filled lazily by recommend_courses.
_recommend_courses_model_cache = {}


def _fit_course_suitability_model(course_info):
    """Fit the suitability model for one entry of `course_columns`.

    Returns (model, feature_cols), or None when the course has no usable labels
    or only a single class.
    """
    cat_cols = ['Semester', 'LearningStyle', 'CareerTrack', course_info['reason']]
    num_cols = ['OverallCourseRating', course_info['interest'], 'StudyHours']
    target = course_info['target']

    train_df = df.copy()
    for col in cat_cols:
        train_df[col] = train_df[col].fillna(train_df[col].mode()[0])
    for col in num_cols:
        train_df[col] = train_df[col].fillna(train_df[col].mean())

    # A target already converted to 0/1 must not be mapped again; missing labels
    # count as "not suitable".
    if pd.api.types.is_numeric_dtype(train_df[target]):
        train_df[target] = train_df[target].fillna(0)
    else:
        train_df[target] = train_df[target].fillna('No').map({'Yes': 1, 'No': 0})
    train_df = train_df.dropna(subset=[target])
    if train_df.empty or train_df[target].nunique() < 2:
        return None

    encoded = pd.get_dummies(train_df[cat_cols], drop_first=True)
    features = pd.concat([train_df[num_cols], encoded], axis=1)
    model = LogisticRegression(max_iter=1000, class_weight='balanced')
    model.fit(features, train_df[target])
    return model, features.columns.tolist()


def recommend_courses(student_careertrack, student_learningstyle=None):
    """Recommend courses for a student described by career track and learning style.

    Only courses whose dominant CareerTrack (see `course_dominant_career`)
    equals `student_careertrack` are considered. Every other feature of the
    student is filled with the data-set mode (categorical) or mean (numerical).

    Parameters
    ----------
    student_careertrack : str
        The student's CareerTrack.
    student_learningstyle : str, optional
        The student's LearningStyle; defaults to the most common one in `df`.

    Returns
    -------
    list of (course, probability)
        Sorted by descending suitability probability; empty when no course
        matches the career track.
    """
    matching_courses = [course for course, dominant_career in course_dominant_career.items()
                       if dominant_career == student_careertrack]

    if not matching_courses:
        print(f"No courses found matching CareerTrack '{student_careertrack}'.")
        return []

    student_data = {
        'Semester': df['Semester'].mode()[0],
        'LearningStyle': student_learningstyle if student_learningstyle else df['LearningStyle'].mode()[0],
        'CareerTrack': student_careertrack,
        'OverallCourseRating': df['OverallCourseRating'].mean(),
        'StudyHours': df['StudyHours'].mean()
    }

    for course_info in course_columns:
        reason_col = course_info['reason']
        interest_col = course_info['interest']
        student_data[reason_col] = df[reason_col].mode()[0] if reason_col in df.columns else 'Unknown'
        student_data[interest_col] = df[interest_col].mean() if interest_col in df.columns else 0.0

    student_df = pd.DataFrame([student_data])
    course_info_by_name = {info['course']: info for info in course_columns}

    recommendations = []
    for course in matching_courses:
        course_info = course_info_by_name[course]
        if course not in _recommend_courses_model_cache:
            _recommend_courses_model_cache[course] = _fit_course_suitability_model(course_info)
        fitted = _recommend_courses_model_cache[course]
        if fitted is None:
            print(f"Skipping {course}: no suitability model could be fitted.")
            continue
        model, feature_cols = fitted

        temp_categorical_cols = ['Semester', 'LearningStyle', 'CareerTrack', course_info['reason']]
        temp_numerical_cols = ['OverallCourseRating', course_info['interest'], 'StudyHours']

        # Encode without drop_first: on a one-row frame drop_first removes every
        # dummy column, which would erase the student's categorical answers.
        # Aligning to the training columns restores the reference-level encoding.
        dummy_cols = [col for col in feature_cols if col not in temp_numerical_cols]
        temp_encoded = pd.get_dummies(student_df[temp_categorical_cols]).reindex(
            columns=dummy_cols, fill_value=0
        )
        temp_X = pd.concat([student_df[temp_numerical_cols], temp_encoded], axis=1)
        temp_X = temp_X[feature_cols].fillna(0.0)
        prob = model.predict_proba(temp_X)[0][1]
        recommendations.append((course, prob))

    recommendations.sort(key=lambda x: x[1], reverse=True)
    return recommendations

In [None]:
# Example: recommend courses for a Data Science / AI student
print("\nStudent Profile: CareerTrack = 'Data Science / AI'")
recommendations = recommend_courses(student_careertrack="Data Science / AI")
for course, prob in recommendations:
    course_profile = course_profiles_summary[course]
    print(f"Course: {course}, Suitability Probability = {prob:.3f} ({course_profile})")

In [None]:
# Build models_per_course for both baseline and weighted strategies
# Each dict maps course name -> (fitted model, feature column order).
models_per_course_baseline = {}
models_per_course_weighted = {}

for course in course_columns:
    target_col = course['target']
    interest_col = course['interest']
    reason_col = course['reason']
    course_name = course['course']

    temp_df = df.copy()
    temp_categorical_cols = ['Semester', 'LearningStyle', 'CareerTrack', reason_col]
    temp_numerical_cols = ['OverallCourseRating', interest_col, 'StudyHours']

    for col in temp_categorical_cols:
        temp_df[col] = temp_df[col].fillna(temp_df[col].mode()[0])
    for col in temp_numerical_cols:
        temp_df[col] = temp_df[col].fillna(temp_df[col].mean())

    # The preprocessing cell has already converted Suitable_1 to 0/1. Mapping a
    # numeric column with {'Yes': 1, 'No': 0} would turn every label into NaN
    # and silently drop the course, so only textual targets are mapped.
    # Missing labels count as "not suitable".
    if pd.api.types.is_numeric_dtype(temp_df[target_col]):
        temp_df[target_col] = temp_df[target_col].fillna(0)
        if not set(temp_df[target_col].unique()).issubset({0, 1}):
            print(f"Skipping {course_name}: unexpected values in {target_col}")
            continue
    else:
        temp_df[target_col] = temp_df[target_col].fillna('No').map({'Yes': 1, 'No': 0})
    temp_df = temp_df.dropna(subset=[target_col])
    if temp_df.empty:
        print(f"Skipping {course_name}: no valid target values in {target_col}")
        continue
    if temp_df[target_col].nunique() < 2:
        # LogisticRegression cannot be fitted on a single class.
        print(f"Skipping {course_name}: {target_col} contains a single class")
        continue

    temp_encoded = pd.get_dummies(temp_df[temp_categorical_cols], drop_first=True)
    temp_X = pd.concat([temp_df[temp_numerical_cols], temp_encoded], axis=1)
    temp_y = temp_df[target_col]

    # Baseline model
    model_baseline = LogisticRegression(max_iter=1000, class_weight='balanced')
    model_baseline.fit(temp_X, temp_y)
    models_per_course_baseline[course_name] = (model_baseline, temp_X.columns.tolist())

    # Weighted model: fitted exactly like the baseline.
    # NOTE(review): presumably the weighting is applied to the predicted
    # probabilities at recommendation time - confirm.
    model_weighted = LogisticRegression(max_iter=1000, class_weight='balanced')
    model_weighted.fit(temp_X, temp_y)
    models_per_course_weighted[course_name] = (model_weighted, temp_X.columns.tolist())

In [None]:
def recommend_courses_baseline(student_careertrack, student_learningstyle=None):
    """Recommend courses using the unadjusted per-course baseline models.

    Only courses whose dominant CareerTrack (see `course_dominant_career`)
    equals `student_careertrack` are considered. Every other feature of the
    student is filled with the data-set mode (categorical) or mean (numerical).

    Parameters
    ----------
    student_careertrack : str
        The student's CareerTrack.
    student_learningstyle : str, optional
        The student's LearningStyle; defaults to the most common one in `df`.

    Returns
    -------
    list of (course, probability)
        Sorted by descending suitability probability; empty when no course
        matches the career track.
    """
    matching_courses = [course for course, dominant_career in course_dominant_career.items()
                       if dominant_career == student_careertrack]

    if not matching_courses:
        print(f"No courses found matching CareerTrack '{student_careertrack}'.")
        return []

    student_data = {
        'Semester': df['Semester'].mode()[0],
        'LearningStyle': student_learningstyle if student_learningstyle else df['LearningStyle'].mode()[0],
        'CareerTrack': student_careertrack,
        'OverallCourseRating': df['OverallCourseRating'].mean(),
        'StudyHours': df['StudyHours'].mean()
    }

    for course_info in course_columns:
        reason_col = course_info['reason']
        interest_col = course_info['interest']
        student_data[reason_col] = df[reason_col].mode()[0] if reason_col in df.columns else 'Unknown'
        student_data[interest_col] = df[interest_col].mean() if interest_col in df.columns else 0.0

    student_df = pd.DataFrame([student_data])
    course_info_by_name = {info['course']: info for info in course_columns}

    recommendations = []
    for course in matching_courses:
        if course not in models_per_course_baseline:
            # No baseline model was stored for this course.
            print(f"Skipping {course}: no baseline model available.")
            continue
        model, feature_cols = models_per_course_baseline[course]
        course_info = course_info_by_name[course]
        temp_categorical_cols = ['Semester', 'LearningStyle', 'CareerTrack', course_info['reason']]
        temp_numerical_cols = ['OverallCourseRating', course_info['interest'], 'StudyHours']

        # Encode without drop_first: on a one-row frame drop_first removes every
        # dummy column, which would erase the student's categorical answers.
        # Aligning to the training columns restores the reference-level encoding.
        dummy_cols = [col for col in feature_cols if col not in temp_numerical_cols]
        temp_encoded = pd.get_dummies(student_df[temp_categorical_cols]).reindex(
            columns=dummy_cols, fill_value=0
        )
        temp_X = pd.concat([student_df[temp_numerical_cols], temp_encoded], axis=1)
        temp_X = temp_X[feature_cols].fillna(0.0)
        prob = model.predict_proba(temp_X)[0][1]
        recommendations.append((course, prob))

    recommendations.sort(key=lambda x: x[1], reverse=True)
    return recommendations

def recommend_courses_weighted(student_careertrack, student_learningstyle=None):
    """Rank career-matching courses for one student using the weighted models.

    Candidate courses are those whose entry in ``course_dominant_career``
    equals ``student_careertrack``. A one-row student profile is built from
    the supplied career track / learning style plus dataset-wide defaults
    (mode for categorical columns, mean for numerical ones), encoded to each
    model's feature columns, and scored with ``predict_proba``.

    No extra multiplication of probabilities happens here: the only
    difference from the baseline recommender is that the models are read
    from ``models_per_course_weighted``.
    NOTE(review): any weighting is presumably baked into those models at
    training time -- confirm against the training cell.
    NOTE(review): the baseline recommender earlier in this notebook still
    encodes its single row with ``drop_first=True`` and needs the same
    correction for the two strategies to be compared fairly.

    Parameters
    ----------
    student_careertrack : str
        Career track used both to filter courses and as the student's
        ``CareerTrack`` feature.
    student_learningstyle : str, optional
        Learning style of the student; when falsy, the most frequent value
        of ``df['LearningStyle']`` is used instead.

    Returns
    -------
    list of (course, probability) tuples
        Sorted by descending probability, where the probability is column
        index 1 of the model's ``predict_proba`` output. Empty when no
        course matches the career track.
    """
    matching_courses = [
        course
        for course, dominant_career in course_dominant_career.items()
        if dominant_career == student_careertrack
    ]

    if not matching_courses:
        print(f"No courses found matching CareerTrack '{student_careertrack}'.")
        return []

    student_data = {
        'Semester': df['Semester'].mode()[0],
        'LearningStyle': student_learningstyle if student_learningstyle else df['LearningStyle'].mode()[0],
        'CareerTrack': student_careertrack,
        'OverallCourseRating': df['OverallCourseRating'].mean(),
        'StudyHours': df['StudyHours'].mean()
    }

    # Per-course defaults: most common reason and mean interest level.
    for course_number, _course_info in enumerate(course_columns, start=1):
        reason_col = f'Reason_{course_number}'
        interest_col = f'InterestLevel_{course_number}'
        student_data[reason_col] = df[reason_col].mode()[0] if reason_col in df.columns else 'Unknown'
        student_data[interest_col] = df[interest_col].mean() if interest_col in df.columns else 0.0

    student_df = pd.DataFrame([student_data])

    # Loop-invariant lookup list, built once instead of once per course.
    course_names = [c['course'] for c in course_columns]

    recommendations = []
    for course in matching_courses:
        model, feature_cols = models_per_course_weighted[course]
        course_idx = course_names.index(course)
        temp_categorical_cols = ['Semester', 'LearningStyle', 'CareerTrack', f'Reason_{course_idx + 1}']
        temp_numerical_cols = ['OverallCourseRating', f'InterestLevel_{course_idx + 1}', 'StudyHours']

        for col in temp_categorical_cols + temp_numerical_cols:
            if col not in student_df.columns:
                print(f"Warning: Column '{col}' not found in student_df. Adding with default value.")
                if col in temp_numerical_cols:
                    student_df[col] = 0.0
                else:
                    student_df[col] = 'Unknown'

        # Encode WITHOUT drop_first: on a one-row frame every categorical
        # column has exactly one level, so drop_first=True would discard every
        # dummy and the student's actual categories would never reach the
        # model. Reindexing to the model's dummy columns drops levels the
        # model does not know (including the reference level dropped during
        # training) and zero-fills the levels this student does not have.
        dummy_cols = [col for col in feature_cols if col not in temp_numerical_cols]
        temp_encoded = pd.get_dummies(student_df[temp_categorical_cols], dtype=float)
        temp_encoded = temp_encoded.reindex(columns=dummy_cols, fill_value=0.0)

        temp_X = pd.concat([student_df[temp_numerical_cols], temp_encoded], axis=1)
        # NOTE(review): with a single row the column mean equals the row's own
        # value, so this cannot actually fill a NaN; kept for parity.
        temp_X = temp_X.fillna(temp_X.mean())
        temp_X = temp_X[feature_cols]
        prob = model.predict_proba(temp_X)[0][1]
        recommendations.append((course, prob))

    recommendations.sort(key=lambda x: x[1], reverse=True)
    return recommendations

In [None]:
from collections import Counter

# Walk every student in the held-out test split and record three parallel
# lists: the student's true career track and the top-ranked course under
# each recommendation strategy (None when a strategy returns nothing).
test_indices = X_test.index
true_careers, baseline_recs, weighted_recs = [], [], []

for idx in test_indices:
    student = df.loc[idx]
    career, learning = student['CareerTrack'], student['LearningStyle']
    true_careers.append(career)

    # Baseline strategy: keep only the first (highest-probability) course.
    recs_base = recommend_courses_baseline(career, learning)
    top_base = None
    if recs_base:
        top_base = recs_base[0][0]
    baseline_recs.append(top_base)

    # Weighted strategy: same treatment.
    recs_weight = recommend_courses_weighted(career, learning)
    top_weight = None
    if recs_weight:
        top_weight = recs_weight[0][0]
    weighted_recs.append(top_weight)

In [None]:
from scipy.stats import chi2_contingency

# Contingency tables: rows = true career, columns = recommended course
def build_contingency(recs, true_careers, course_list):
    """Cross-tabulate true career tracks against recommended courses.

    Parameters
    ----------
    recs : sequence
        Recommended course per student; entries that are not in
        ``course_list`` (e.g. ``None``) are not counted in any column.
    true_careers : sequence
        True career track per student, aligned with ``recs``. Pairs are
        formed with ``zip``, so extra items in the longer input are ignored.
    course_list : sequence
        Course names defining the column order.

    Returns
    -------
    list of list of int
        One row per distinct career, in order of first appearance in
        ``true_careers`` (deterministic across runs, unlike iterating a
        ``set``), one column per entry of ``course_list``.
    """
    # Single pass over the pairs instead of rescanning them for every cell.
    pair_counts = Counter(zip(true_careers, recs))
    # dict.fromkeys de-duplicates while preserving first-seen order.
    careers = list(dict.fromkeys(true_careers))
    return [
        [pair_counts[(career, course)] for course in course_list]
        for career in careers
    ]

course_list = [c['course'] for c in course_columns]
contingency_baseline = build_contingency(baseline_recs, true_careers, course_list)
contingency_weighted = build_contingency(weighted_recs, true_careers, course_list)


def _chi2_without_empty_margins(table):
    """Run ``chi2_contingency`` on ``table`` after dropping all-zero rows/columns.

    ``chi2_contingency`` raises ``ValueError`` when any expected frequency is
    zero, which happens for every course that is never recommended (all-zero
    column) and every career with no counted recommendation (all-zero row).
    Those margins carry no information for the test, so they are removed.

    Returns ``(chi2, p)``; ``(nan, nan)`` when no data remains to test.
    """
    observed = np.asarray(table)
    if observed.ndim != 2 or observed.size == 0:
        return float('nan'), float('nan')
    keep_rows = observed.sum(axis=1) > 0
    keep_cols = observed.sum(axis=0) > 0
    observed = observed[np.ix_(keep_rows, keep_cols)]
    if observed.size == 0:
        return float('nan'), float('nan')
    chi2, p_value, _dof, _expected = chi2_contingency(observed)
    return chi2, p_value


# Chi-square test for independence
chi2_base, p_base = _chi2_without_empty_margins(contingency_baseline)
chi2_weight, p_weight = _chi2_without_empty_margins(contingency_weighted)

print(f"Baseline strategy: Chi2 = {chi2_base:.2f}, p = {p_base:.4f}")
print(f"Weighted strategy: Chi2 = {chi2_weight:.2f}, p = {p_weight:.4f}")

# NOTE(review): comparing the two p-values is a heuristic; it is not itself a
# statistical test of the difference between the strategies.
# NaN p-values (nothing to test) compare False and fall through to the else branch.
if p_weight < 0.05 and p_weight < p_base:
    print("Weighted strategy shows significantly improved alignment with career aspirations (p < 0.05).")
else:
    print("No significant improvement in alignment with career aspirations.")