In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from transformers import TFBertModel, BertTokenizer
import re

# Data loading and preprocessing
def _parse_label_ids(raw_value):
    """Convert one ``MisconceptionId`` cell into a list of integer IDs.

    String cells are treated as whitespace-separated IDs. pandas yields a
    number instead of a string when every cell in the column holds a single
    ID, and NaN for an empty cell, so both cases are handled here rather
    than failing on ``.split()``.
    """
    if isinstance(raw_value, str):
        return [int(token) for token in raw_value.split()]
    if pd.isna(raw_value):
        return []
    return [int(raw_value)]


def load_data(questions_path, misconceptions_path, train_labels_path):
    """Load the CSV files and build aligned training features and labels.

    The questions file is read for the columns ``QuestionId``,
    ``QuestionText`` and ``AnswerAText`` .. ``AnswerDText``; the labels file
    for ``QuestionId_Answer`` (``"<question id>_<answer letter>"``) and
    ``MisconceptionId``. The misconceptions file is only used for its row
    count.

    Args:
        questions_path: Path to the questions CSV.
        misconceptions_path: Path to the misconceptions CSV.
        train_labels_path: Path to the training labels CSV.

    Returns:
        A tuple ``(features, train_labels, num_misconceptions)`` where
        ``features[i]`` is ``"<question text> [SEP] <answer text>"`` and
        ``train_labels[i]`` is the list of misconception IDs for the same
        row of the labels file. Label rows whose question ID does not occur
        in the questions file are left out of BOTH lists so they stay
        aligned.

    Raises:
        KeyError: If an answer letter in ``QuestionId_Answer`` is not one of
            A, B, C or D.
    """
    questions_df = pd.read_csv(questions_path)
    misconceptions_df = pd.read_csv(misconceptions_path)
    train_labels_df = pd.read_csv(train_labels_path)

    # Index the questions once so each label row is resolved with a dict
    # lookup instead of rescanning every label for every question.
    latex_pattern = re.compile(r'\$.*?\$')
    questions_by_id = {}
    for _, row in questions_df.iterrows():
        question_id = row['QuestionId']
        if pd.isna(question_id):
            continue
        question_text = latex_pattern.sub(' ', row['QuestionText'])  # Remove LaTeX
        answer_texts = {
            'A': row['AnswerAText'],
            'B': row['AnswerBText'],
            'C': row['AnswerCText'],
            'D': row['AnswerDText'],
        }
        # If a question ID is repeated, the first occurrence is kept.
        questions_by_id.setdefault(int(question_id), (question_text, answer_texts))

    # Walk the label rows in file order and emit the feature and its labels
    # together; this is what guarantees features[i] matches train_labels[i].
    features = []
    train_labels = []
    label_rows = zip(
        train_labels_df['QuestionId_Answer'],
        train_labels_df['MisconceptionId'],
    )
    for qa_id, raw_labels in label_rows:
        q_id, ans = qa_id.split('_')
        question = questions_by_id.get(int(q_id))
        if question is None:
            continue
        question_text, answer_texts = question
        features.append(f"{question_text} [SEP] {answer_texts[ans]}")
        train_labels.append(_parse_label_ids(raw_labels))

    return features, train_labels, len(misconceptions_df)

# Model architecture
def create_model(bert_model, num_labels, max_length=512):
    """Build a multi-label classification head on top of a BERT encoder.

    Args:
        bert_model: Encoder called as ``bert_model([input_ids,
            attention_mask])``; its first output is used as the per-token
            sequence output. The training pipeline in this file passes a
            ``TFBertModel``.
        num_labels: Number of independent sigmoid outputs.
        max_length: Fixed token length of both inputs.

    Returns:
        An uncompiled ``tf.keras.Model`` with inputs named ``input_ids`` and
        ``attention_mask`` and one output with ``num_labels`` units.
    """
    # Input layers
    input_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
    attention_mask = Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

    # BERT layer
    bert_outputs = bert_model([input_ids, attention_mask])
    sequence_output = bert_outputs[0]

    # Average only over real tokens. Inputs are padded to max_length, so an
    # unmasked mean would mostly consist of padding positions.
    token_mask = tf.cast(attention_mask, tf.bool)
    pooled_output = GlobalAveragePooling1D()(sequence_output, mask=token_mask)

    # Dense head
    dropout = Dropout(0.3)(pooled_output)
    dense1 = Dense(512, activation='relu')(dropout)
    dropout2 = Dropout(0.2)(dense1)
    # Sigmoid rather than softmax: each label is predicted independently.
    output = Dense(num_labels, activation='sigmoid')(dropout2)

    # Create model
    model = tf.keras.Model(
        inputs=[input_ids, attention_mask],
        outputs=output
    )

    return model

# Tokenization
def tokenize_data(texts, tokenizer, max_length=512):
    """Tokenize ``texts`` to fixed-length model inputs.

    Args:
        texts: Texts handed to ``tokenizer`` as its first argument.
        tokenizer: Callable invoked with ``padding='max_length'``,
            ``truncation=True``, ``max_length`` and ``return_tensors='tf'``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        A dict holding only the ``input_ids`` and ``attention_mask`` entries
        of the tokenizer output.
    """
    encoding = tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
    )
    # Any other entries the tokenizer returns are dropped.
    wanted_keys = ('input_ids', 'attention_mask')
    return {key: encoding[key] for key in wanted_keys}

# Training pipeline
def train_misconception_model(features, labels, num_misconceptions):
    """Fine-tune ``bert-base-uncased`` as a multi-label misconception classifier.

    Args:
        features: List of input texts, one per training example.
        labels: List parallel to ``features``; each entry is a list of
            misconception IDs used directly as column indices of the
            multi-hot target, so IDs must lie in
            ``[0, num_misconceptions)``.
        num_misconceptions: Width of the multi-hot target and of the model
            output.

    Returns:
        A tuple ``(model, tokenizer, history)``: the trained Keras model, the
        tokenizer used to encode ``features``, and the ``History`` object
        returned by ``model.fit``.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """
    # Fail before downloading or building anything if the inputs cannot be
    # paired up one-to-one.
    if len(features) != len(labels):
        raise ValueError(
            f"features and labels must have the same length, "
            f"got {len(features)} and {len(labels)}"
        )

    # Initialize BERT
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = TFBertModel.from_pretrained('bert-base-uncased')

    # Tokenize data. The tensors are converted to arrays because
    # train_test_split needs sample-indexable inputs; the tokenizer output
    # dict itself is not one.
    tokenized_data = tokenize_data(features, tokenizer)
    input_ids = np.asarray(tokenized_data['input_ids'])
    attention_mask = np.asarray(tokenized_data['attention_mask'])

    # Convert labels to multi-hot encoding
    label_matrix = np.zeros((len(labels), num_misconceptions))
    for i, label_list in enumerate(labels):
        label_matrix[i, label_list] = 1

    # Split all three arrays in one call so each example's ids, mask and
    # labels land in the same partition.
    (
        train_ids, val_ids,
        train_mask, val_mask,
        train_labels, val_labels,
    ) = train_test_split(
        input_ids,
        attention_mask,
        label_matrix,
        test_size=0.2,
        random_state=42
    )

    # Keys match the Input layer names declared in create_model.
    train_inputs = {'input_ids': train_ids, 'attention_mask': train_mask}
    val_inputs = {'input_ids': val_ids, 'attention_mask': val_mask}

    # Create and compile model
    model = create_model(bert_model, num_misconceptions)
    model.compile(
        optimizer=Adam(learning_rate=2e-5),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    # Training callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    )

    # Train model
    history = model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=10,
        batch_size=16,
        callbacks=[early_stopping]
    )

    return model, tokenizer, history

# Prediction function
def predict_misconceptions(model, tokenizer, texts, threshold=0.5):
    """Return 0/1 label predictions for ``texts``.

    Args:
        model: Object whose ``predict`` method is called on the tokenized
            inputs.
        tokenizer: Tokenizer forwarded to ``tokenize_data``.
        texts: Input texts to score.
        threshold: Scores strictly greater than this value become 1.

    Returns:
        The thresholded ``model.predict`` output converted to ``int``.
    """
    model_inputs = tokenize_data(texts, tokenizer)
    scores = model.predict(model_inputs)
    above_threshold = scores > threshold
    return above_threshold.astype(int)

# Generate submission file
def create_submission(model, tokenizer, test_questions_df, output_path):
    """Predict misconceptions for every answer option and write them to CSV.

    For each row of ``test_questions_df`` and each answer A-D, a
    ``"<question text> [SEP] <answer text>"`` input is built (LaTeX spans
    between ``$`` signs are replaced by a space). All inputs are scored in a
    single ``predict_misconceptions`` call.

    Args:
        model: Trained model forwarded to ``predict_misconceptions``.
        tokenizer: Tokenizer forwarded to ``predict_misconceptions``.
        test_questions_df: DataFrame with the columns ``QuestionId``,
            ``QuestionText`` and ``AnswerAText`` .. ``AnswerDText``.
        output_path: Destination of the CSV, written without the index and
            with the columns ``QuestionId_Answer`` and ``MisconceptionId``
            (space-separated predicted IDs; empty when nothing is
            predicted).
    """
    answer_columns = {
        'A': 'AnswerAText',
        'B': 'AnswerBText',
        'C': 'AnswerCText',
        'D': 'AnswerDText',
    }

    qa_ids = []
    test_features = []
    for _, question in test_questions_df.iterrows():
        question_text = re.sub(r'\$.*?\$', ' ', question['QuestionText'])
        for answer_key, column in answer_columns.items():
            test_features.append(f"{question_text} [SEP] {question[column]}")
            qa_ids.append(f"{question['QuestionId']}_{answer_key}")

    # One batched prediction instead of one model.predict call per answer.
    # Skipped entirely for an empty test set, which has nothing to tokenize.
    if test_features:
        predictions = predict_misconceptions(model, tokenizer, test_features)
    else:
        predictions = []

    # The training code uses the misconception ID itself as the column index
    # of the multi-hot target, so column i maps back to ID i (not i + 1).
    submission_rows = [
        {
            'QuestionId_Answer': qa_id,
            'MisconceptionId': ' '.join(
                str(i) for i in np.flatnonzero(prediction)
            ),
        }
        for qa_id, prediction in zip(qa_ids, predictions)
    ]

    # Explicit columns keep the header present even when there are no rows.
    submission_df = pd.DataFrame(
        submission_rows,
        columns=['QuestionId_Answer', 'MisconceptionId'],
    )
    submission_df.to_csv(output_path, index=False)

# Main execution
if __name__ == "__main__":
    # Step 1: read the CSVs and build the training texts and labels.
    loaded = load_data(
        questions_path='questions.csv',
        misconceptions_path='misconceptions.csv',
        train_labels_path='train_labels.csv',
    )
    features, labels, num_misconceptions = loaded

    # Step 2: fine-tune the classifier on the loaded examples.
    trained = train_misconception_model(
        features=features,
        labels=labels,
        num_misconceptions=num_misconceptions,
    )
    model, tokenizer, history = trained

    # Step 3: score the test questions and write the submission file.
    test_questions = pd.read_csv('test_questions.csv')
    create_submission(
        model=model,
        tokenizer=tokenizer,
        test_questions_df=test_questions,
        output_path='submission.csv',
    )

: 