In [3]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pickle
import logging
from typing import List, Dict, Tuple
import os

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DeepLearningCareerRecommender:
    def __init__(self, embedding_dim=128, hidden_layers=[256, 128, 64]):
        """
        Initialize the Deep Learning Career Recommender

        Args:
            embedding_dim: Dimension of embedding vectors
            hidden_layers: List of hidden layer sizes
        """
        self.embedding_dim = embedding_dim
        self.hidden_layers = hidden_layers
        self.model = None
        self.student_encoder = LabelEncoder()
        self.course_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        self.num_students = 0
        self.num_courses = 0
        self.feature_dim = 0

    def prepare_features(self, students_df: pd.DataFrame, courses_df: pd.DataFrame) -> np.ndarray:
        """
        Extract and engineer features from student and course data

        Args:
            students_df: DataFrame with student information
            courses_df: DataFrame with course information

        Returns:
            Engineered features array
        """
        features = []

        # Subject performance features
        subjects = ['Mathematics', 'Physics', 'Chemistry', 'Biology', 'English', 'Geography', 'History']

        for _, student in students_df.iterrows():
            student_features = []

            # Parse subjects and create subject performance vector
            student_subjects = student['subjects'].split(', ')
            subject_vector = [1 if subj in student_subjects else 0 for subj in subjects]
            student_features.extend(subject_vector)

            # Parse interests and create interest categories
            interests = student['interests'].split(', ')
            interest_categories = {
                'technology': ['programming', 'computers', 'innovation', 'AI', 'software', 'data', 'machine learning'],
                'healthcare': ['medicine', 'nursing', 'biology', 'helping', 'health', 'medical'],
                'business': ['entrepreneurship', 'marketing', 'finance', 'management', 'economics'],
                'creative': ['art', 'design', 'music', 'writing', 'creative', 'media'],
                'social': ['teaching', 'counseling', 'social work', 'psychology', 'education']
            }

            for category, keywords in interest_categories.items():
                score = sum(1 for interest in interests
                           if any(keyword.lower() in interest.lower() for keyword in keywords))
                student_features.append(score)

            # Academic performance indicators
            stem_subjects = ['Mathematics', 'Physics', 'Chemistry', 'Biology']
            stem_count = sum(1 for subj in student_subjects if subj in stem_subjects)
            student_features.append(stem_count / len(stem_subjects))

            humanities_subjects = ['English', 'Geography', 'History']
            humanities_count = sum(1 for subj in student_subjects if subj in humanities_subjects)
            student_features.append(humanities_count / len(humanities_subjects))

            # Interest diversity
            student_features.append(len(interests))

            features.append(student_features)

        return np.array(features)

    def create_training_data(self, students_df: pd.DataFrame, courses_df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """
        Create training data for the neural network

        Args:
            students_df: DataFrame with student information
            courses_df: DataFrame with course information

        Returns:
            Tuple of (student_ids, course_ids, features, ratings)
        """
        # Encode student and course IDs
        student_ids = self.student_encoder.fit_transform(students_df['student_id'])
        course_ids = self.course_encoder.fit_transform(courses_df['course_name'])

        self.num_students = len(student_ids)
        self.num_courses = len(course_ids)

        # Extract features
        features = self.prepare_features(students_df, courses_df)
        features = self.scaler.fit_transform(features)
        self.feature_dim = features.shape[1]

        # Create synthetic ratings based on subject-interest alignment
        training_data = []

        for i, student in students_df.iterrows():
            student_subjects = set(student['subjects'].split(', '))
            student_interests = set(student['interests'].split(', '))

            for j, course in courses_df.iterrows():
                course_subjects = set(course['subjects'].split(', '))
                course_skills = set(course['skills'].split(', '))

                # Calculate alignment score
                subject_overlap = len(student_subjects.intersection(course_subjects))
                interest_overlap = len(student_interests.intersection(course_skills))

                # Create rating based on alignment (0-1 scale)
                rating = (subject_overlap + interest_overlap) / (len(student_subjects) + len(student_interests))
                rating = min(1.0, rating)  # Cap at 1.0

                training_data.append({
                    'student_id': i,
                    'course_id': j,
                    'features': features[i],
                    'rating': rating
                })

        # Convert to arrays
        training_df = pd.DataFrame(training_data)

        return (
            training_df['student_id'].values,
            training_df['course_id'].values,
            np.array(training_df['features'].tolist()),
            training_df['rating'].values
        )

    def build_model(self):
        """
        Build the neural network architecture
        """
        # Input layers
        student_input = layers.Input(shape=(), name='student_id')
        course_input = layers.Input(shape=(), name='course_id')
        features_input = layers.Input(shape=(self.feature_dim,), name='features')

        # Embedding layers
        student_embedding = layers.Embedding(
            self.num_students,
            self.embedding_dim,
            embeddings_regularizer=tf.keras.regularizers.l2(0.01)
        )(student_input)

        course_embedding = layers.Embedding(
            self.num_courses,
            self.embedding_dim,
            embeddings_regularizer=tf.keras.regularizers.l2(0.01)
        )(course_input)

        # Flatten embeddings
        student_vec = layers.Flatten()(student_embedding)
        course_vec = layers.Flatten()(course_embedding)

        # Concatenate all features
        combined = layers.concatenate([student_vec, course_vec, features_input])

        # Deep layers with dropout and batch normalization
        x = combined
        for hidden_size in self.hidden_layers:
            x = layers.Dense(hidden_size, activation='relu')(x)
            x = layers.BatchNormalization()(x)
            x = layers.Dropout(0.3)(x)

        # Attention mechanism
        attention = layers.Dense(self.hidden_layers[-1], activation='tanh')(x)
        attention = layers.Dense(1, activation='sigmoid')(attention)
        x = layers.multiply([x, attention])

        # Output layer
        output = layers.Dense(1, activation='sigmoid', name='rating')(x)

        # Create model
        model = Model(inputs=[student_input, course_input, features_input], outputs=output)

        # Compile model
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
            loss='binary_crossentropy',
            metrics=['accuracy', 'mae']
        )

        return model

    def train(self, students_df: pd.DataFrame, courses_df: pd.DataFrame,
              epochs=100, batch_size=256, validation_split=0.2):
        """
        Train the deep learning model

        Args:
            students_df: DataFrame with student information
            courses_df: DataFrame with course information
            epochs: Number of training epochs
            batch_size: Batch size for training
            validation_split: Fraction of data for validation
        """
        logger.info("Preparing training data...")

        # Create training data
        student_ids, course_ids, features, ratings = self.create_training_data(students_df, courses_df)

        # Build model
        logger.info("Building neural network model...")
        self.model = self.build_model()

        # Print model summary
        self.model.summary()

        # Callbacks
        callbacks = [
            EarlyStopping(
                monitor='val_loss',
                patience=15,
                restore_best_weights=True,
                verbose=1
            ),
            ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=7,
                min_lr=0.00001,
                verbose=1
            )
        ]

        # Train model
        logger.info("Training model...")
        history = self.model.fit(
            [student_ids, course_ids, features],
            ratings,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=validation_split,
            callbacks=callbacks,
            verbose=1
        )

        logger.info("Training completed!")
        return history

    def predict_for_student(self, student_data: Dict, courses_df: pd.DataFrame, top_k=5) -> List[Tuple[str, float]]:
        """
        Predict career recommendations for a single student

        Args:
            student_data: Dictionary with 'subjects' and 'interests' keys
            courses_df: DataFrame with course information
            top_k: Number of top recommendations to return

        Returns:
            List of (course_name, confidence_score) tuples
        """
        if self.model is None:
            raise ValueError("Model not trained yet!")

        # Create a temporary DataFrame for the student
        temp_student_df = pd.DataFrame([{
            'student_id': 'temp_student',
            'subjects': ', '.join(student_data['subjects']),
            'interests': ', '.join(student_data['interests'])
        }])

        # Extract features for the student
        student_features = self.prepare_features(temp_student_df, courses_df)
        student_features = self.scaler.transform(student_features)

        # Predict for all courses
        predictions = []

        for idx, course in courses_df.iterrows():
            # Use a dummy student ID (encoded as 0)
            student_id = np.array([0])

            # Encode course ID
            if course['course_name'] in self.course_encoder.classes_:
                course_id = self.course_encoder.transform([course['course_name']])[0]
            else:
                # Handle unseen courses
                course_id = 0

            course_id = np.array([course_id])

            # Predict rating
            rating = self.model.predict([student_id, course_id, student_features], verbose=0)[0][0]

            predictions.append((course['course_name'], float(rating)))

        # Sort by rating and return top K
        predictions.sort(key=lambda x: x[1], reverse=True)

        return predictions[:top_k]

    def save_model(self, filepath: str):
        """Save the trained model and preprocessors"""
        if self.model is None:
            raise ValueError("No model to save!")

        # Save model
        self.model.save(f"{filepath}_model.h5")

        # Save preprocessors
        with open(f"{filepath}_preprocessors.pkl", 'wb') as f:
            pickle.dump({
                'student_encoder': self.student_encoder,
                'course_encoder': self.course_encoder,
                'scaler': self.scaler,
                'num_students': self.num_students,
                'num_courses': self.num_courses,
                'feature_dim': self.feature_dim,
                'embedding_dim': self.embedding_dim,
                'hidden_layers': self.hidden_layers
            }, f)

        logger.info(f"Model saved to {filepath}")

    def load_model(self, filepath: str):
        """Load a trained model and preprocessors"""
        # Load model
        self.model = tf.keras.models.load_model(f"{filepath}_model.h5")

        # Load preprocessors
        with open(f"{filepath}_preprocessors.pkl", 'rb') as f:
            data = pickle.load(f)
            self.student_encoder = data['student_encoder']
            self.course_encoder = data['course_encoder']
            self.scaler = data['scaler']
            self.num_students = data['num_students']
            self.num_courses = data['num_courses']
            self.feature_dim = data['feature_dim']
            self.embedding_dim = data['embedding_dim']
            self.hidden_layers = data['hidden_layers']

        logger.info(f"Model loaded from {filepath}")

# Example usage and training script
def main():
    # Load data
    students_df = pd.read_csv('./sample_data/student_data.csv', on_bad_lines='skip')
    courses_df = pd.read_csv('./sample_data/Courses.csv')

    # Initialize recommender
    recommender = DeepLearningCareerRecommender(
        embedding_dim=128,
        hidden_layers=[256, 128, 64]
    )

    # Train model
    history = recommender.train(
        students_df,
        courses_df,
        epochs=50,
        batch_size=128
    )

    # Save model
    recommender.save_model('deep_learning_career_model')

    # Test prediction
    test_student = {
        'subjects': ['Mathematics', 'Physics'],
        'interests': ['Artificial Intelligence', 'Machine Learning']
    }

    recommendations = recommender.predict_for_student(test_student, courses_df)

    print("Recommendations for test student:")
    for course, confidence in recommendations:
        print(f"  {course}: {confidence:.3f}")

if __name__ == "__main__":
    main()

Epoch 1/50
[1m1966/1966[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 13ms/step - accuracy: 0.4392 - loss: 0.4900 - mae: 0.1434 - val_accuracy: 0.0550 - val_loss: 0.6637 - val_mae: 0.1499 - learning_rate: 0.0010
Epoch 2/50
[1m1966/1966[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 13ms/step - accuracy: 0.4488 - loss: 0.3824 - mae: 0.0866 - val_accuracy: 0.0550 - val_loss: 0.6696 - val_mae: 0.1494 - learning_rate: 0.0010
Epoch 3/50
[1m1966/1966[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 13ms/step - accuracy: 0.4486 - loss: 0.3709 - mae: 0.0678 - val_accuracy: 0.0550 - val_loss: 0.6601 - val_mae: 0.1368 - learning_rate: 0.0010
Epoch 4/50
[1m1966/1966[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 13ms/step - accuracy: 0.4471 - loss: 0.3643 - mae: 0.0559 - val_accuracy: 0.0550 - val_loss: 0.6481 - val_mae: 0.1118 - learning_rate: 0.0010
Epoch 5/50
[1m1966/1966[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 13ms/step - accuracy: 0.4487 - l



Recommendations for test student:
  Bachelor of Education (French With IT): 0.597
  Bachelor of Education Arts: 0.594
  Bachelor of Science (Animal Health Production & Processing): 0.591
  Bachelor of Science (Applied Aquatic Science): 0.591
  Bachelor of Science (Analytical Chemistry): 0.590
