In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import string


In [21]:
# Load datasets
train_df = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv')
test_df = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv')
misconception_mapping = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv')

# Preview data
display(train_df.head())
display(test_df.head())
display(misconception_mapping.head())


Unnamed: 0,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText,MisconceptionAId,MisconceptionBId,MisconceptionCId,MisconceptionDId
0,0,856,Use the order of operations to carry out calcu...,33,BIDMAS,A,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),\( 3 \times 2+(4-5) \),\( 3 \times(2+4-5) \),Does not need brackets,,,,1672.0
1,1,1612,Simplify an algebraic fraction by factorising ...,1077,Simplifying Algebraic Fractions,D,"Simplify the following, if possible: \( \frac{...",\( m+1 \),\( m+2 \),\( m-1 \),Does not simplify,2142.0,143.0,2142.0,
2,2,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,B,Tom and Katie are discussing the \( 5 \) plant...,Only\nTom,Only\nKatie,Both Tom and Katie,Neither is correct,1287.0,,1287.0,1073.0
3,3,2377,Recall and use the intersecting diagonals prop...,88,Properties of Quadrilaterals,C,The angles highlighted on this rectangle with ...,acute,obtuse,\( 90^{\circ} \),Not enough information,1180.0,1180.0,,1180.0
4,4,3387,Substitute positive integer values into formul...,67,Substitution into Formula,A,The equation \( f=3 r^{2}+3 \) is used to find...,\( 30 \),\( 27 \),\( 51 \),\( 24 \),,,,1818.0


Unnamed: 0,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText
0,1869,856,Use the order of operations to carry out calcu...,33,BIDMAS,A,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),\( 3 \times 2+(4-5) \),\( 3 \times(2+4-5) \),Does not need brackets
1,1870,1612,Simplify an algebraic fraction by factorising ...,1077,Simplifying Algebraic Fractions,D,"Simplify the following, if possible: \( \frac{...",\( m+1 \),\( m+2 \),\( m-1 \),Does not simplify
2,1871,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,B,Tom and Katie are discussing the \( 5 \) plant...,Only\nTom,Only\nKatie,Both Tom and Katie,Neither is correct


Unnamed: 0,MisconceptionId,MisconceptionName
0,0,Does not know that angles in a triangle sum to...
1,1,Uses dividing fractions method for multiplying...
2,2,Believes there are 100 degrees in a full turn
3,3,Thinks a quadratic without a non variable term...
4,4,Believes addition of terms and powers of terms...


In [22]:
# Text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.strip()
    return text

# Apply cleaning
train_df['QuestionText'] = train_df['QuestionText'].apply(clean_text)
test_df['QuestionText'] = test_df['QuestionText'].apply(clean_text)

# Combine answer texts
for col in ['AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText']:
    train_df[col] = train_df[col].apply(clean_text)
    test_df[col] = test_df[col].apply(clean_text)

# Merge all text columns into a single feature
train_df['combined_text'] = (
    train_df['QuestionText'] + " " +
    train_df['AnswerAText'] + " " +
    train_df['AnswerBText'] + " " +
    train_df['AnswerCText'] + " " +
    train_df['AnswerDText']
)

test_df['combined_text'] = (
    test_df['QuestionText'] + " " +
    test_df['AnswerAText'] + " " +
    test_df['AnswerBText'] + " " +
    test_df['AnswerCText'] + " " +
    test_df['AnswerDText']
)


In [23]:
# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X_train = vectorizer.fit_transform(train_df['combined_text'])
X_test = vectorizer.transform(test_df['combined_text'])

# Target variable
y_train = train_df[['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId']]


In [24]:
# Check for missing values in y_train
print("Missing values in y_train:")
print(y_train.isnull().sum())

# If there are missing values, inspect the problematic rows
missing_rows = train_df[train_df[['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId']].isnull().any(axis=1)]
print("Rows with missing values in y_train:")
display(missing_rows)


Missing values in y_train:
MisconceptionAId    734
MisconceptionBId    751
MisconceptionCId    789
dtype: int64
Rows with missing values in y_train:


Unnamed: 0,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText,MisconceptionAId,MisconceptionBId,MisconceptionCId,MisconceptionDId,combined_text
0,0,856,Use the order of operations to carry out calcu...,33,BIDMAS,A,times \n\nwhere do the brackets need to go to ...,times,times,times,does not need brackets,,,,1672.0,times \n\nwhere do the brackets need to go to ...
2,2,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,B,tom and katie are discussing the plants wit...,only\ntom,only\nkatie,both tom and katie,neither is correct,1287.0,,1287.0,1073.0,tom and katie are discussing the plants wit...
3,3,2377,Recall and use the intersecting diagonals prop...,88,Properties of Quadrilaterals,C,the angles highlighted on this rectangle with ...,acute,obtuse,circ,not enough information,1180.0,1180.0,,1180.0,the angles highlighted on this rectangle with ...
4,4,3387,Substitute positive integer values into formul...,67,Substitution into Formula,A,the equation f r is used to find values in t...,,,,,,,,1818.0,the equation f r is used to find values in t...
6,6,376,Convert two digit integer percentages to fract...,238,Converting between Fractions and Percentages,B,convert this percentage to a fraction,frac,frac,frac,none of these,329.0,,847.0,329.0,convert this percentage to a fraction frac fra...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1864,1864,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,C,what is the range of the following numbers,,,,,2456.0,691.0,,1349.0,what is the range of the following numbers
1865,1865,2695,"Describe an enlargement, with no centre of enl...",90,Length Scale Factors in Similar Shapes,B,shape q is an enlargement of shape p \nwhat...,div,div,times,,1500.0,,2442.0,1258.0,shape q is an enlargement of shape p \nwhat...
1866,1866,854,Use the order of operations to carry out calcu...,33,BIDMAS,B,what does the following equal\n\n times,,,,,,,2306.0,1507.0,what does the following equal\n\n times
1867,1867,2634,Distinguish between congruency and similarity,274,Congruency in Other Shapes,B,tom and katie are discussing congruence and si...,only\ntom,only katie,both tom and katie,neither is correct,2312.0,,2312.0,2312.0,tom and katie are discussing congruence and si...


In [25]:
# Calculate the percentage of missing values in each target column
missing_percentage = train_df[['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId']].isnull().mean() * 100

# Display the percentage of missing values
print("Percentage of missing values:")
print(missing_percentage)

# Calculate overall missing values as a percentage of total rows
overall_missing_percentage = train_df[['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId']].isnull().any(axis=1).mean() * 100
print(f"\nOverall percentage of rows with missing values: {overall_missing_percentage:.2f}%")


Percentage of missing values:
MisconceptionAId    39.272338
MisconceptionBId    40.181915
MisconceptionCId    42.215088
dtype: float64

Overall percentage of rows with missing values: 88.39%


In [26]:
# Separate rows with complete and incomplete target data
train_non_missing = train_df.dropna(subset=['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId'])
train_missing = train_df[train_df[['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId']].isnull().any(axis=1)]

# Extract targets for the non-missing rows
y_train_non_missing = train_non_missing[['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId']]

# Extract features for both subsets
X_train_non_missing = vectorizer.fit_transform(train_non_missing['combined_text'])
X_train_missing = vectorizer.transform(train_missing['combined_text'])


In [27]:
# Initialize and train the model
model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
model.fit(X_train_non_missing, y_train_non_missing)

print("Model trained successfully on non-missing rows.")


Model trained successfully on non-missing rows.


In [28]:
# Predict missing values
y_missing_predictions = model.predict(X_train_missing)

# Ensure train_missing is a proper copy to avoid the warning
train_missing = train_missing.copy()

# Assign predicted values back to the missing rows using .loc
train_missing.loc[:, ['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId']] = y_missing_predictions


In [29]:
# Combine the datasets
train_completed = pd.concat([train_non_missing, train_missing]).sort_index()

# Verify that there are no missing values left
print("Missing values after combining:")
print(train_completed[['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId']].isnull().sum())


Missing values after combining:
MisconceptionAId    0
MisconceptionBId    0
MisconceptionCId    0
dtype: int64


In [30]:
# Prepare the final training data
X_train_final = vectorizer.transform(train_completed['combined_text'])
y_train_final = train_completed[['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId']]

# Train the final model
model_final = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
model_final.fit(X_train_final, y_train_final)

print("Final model trained on the completed dataset.")


Final model trained on the completed dataset.


In [31]:
# Transform the test set
X_test = vectorizer.transform(test_df['combined_text'])

# Make predictions on the test set
y_test_predictions = model_final.predict(X_test)


In [32]:
question_id_answer_length = len([f"{qid}_{opt}" for qid in test_df['QuestionId'] for opt in ['B', 'C', 'D']])
prediction_length = len(y_test_predictions)

print(f"Length of QuestionId_Answer: {question_id_answer_length}")
print(f"Length of y_test_predictions: {prediction_length}")


Length of QuestionId_Answer: 9
Length of y_test_predictions: 3


In [33]:
# Generate predictions for each QuestionId's distractors (B, C, D)
y_test_predictions_expanded = []
for preds in y_test_predictions:
    y_test_predictions_expanded.extend([preds] * 3)  # Repeat predictions for B, C, D


In [34]:
# Generate QuestionId_Answer list
question_id_answer = [f"{qid}_{opt}" for qid in test_df['QuestionId'] for opt in ['B', 'C', 'D']]
print(f"Generated QuestionId_Answer list with length: {len(question_id_answer)}")


Generated QuestionId_Answer list with length: 9


In [35]:
# Generate predictions for each QuestionId's distractors (B, C, D)
y_test_predictions_expanded = []
for preds in y_test_predictions:
    y_test_predictions_expanded.extend([preds] * 3)  # Repeat predictions for B, C, D

# Verify the expanded predictions length
print(f"Expanded predictions length: {len(y_test_predictions_expanded)}")
assert len(y_test_predictions_expanded) == len(question_id_answer), "Mismatched lengths after expansion!"


Expanded predictions length: 9


In [36]:
# Construct the submission DataFrame
submission = pd.DataFrame({
    "QuestionId_Answer": question_id_answer,
    "MisconceptionId": [" ".join(map(str, preds)) for preds in y_test_predictions_expanded]
})

# Preview the submission DataFrame
print("Submission preview:")
print(submission.head())


Submission preview:
  QuestionId_Answer       MisconceptionId
0            1869_B  1507.0 1708.0 1507.0
1            1869_C  1507.0 1708.0 1507.0
2            1869_D  1507.0 1708.0 1507.0
3            1870_B   2142.0 143.0 2142.0
4            1870_C   2142.0 143.0 2142.0


In [37]:
# Validate submission format
assert submission.shape[0] == len(question_id_answer), "Row count mismatch!"
assert list(submission.columns) == ['QuestionId_Answer', 'MisconceptionId'], "Column names mismatch!"

print("Submission DataFrame is valid!")


Submission DataFrame is valid!


In [38]:
import pandas as pd

# Construct the submission DataFrame
submission = pd.DataFrame({
    "QuestionId_Answer": question_id_answer,
    "MisconceptionId": [" ".join(map(str, map(int, map(float, preds)))) for preds in y_test_predictions_expanded]
})

# Validate the submission format
assert list(submission.columns) == ['QuestionId_Answer', 'MisconceptionId'], "Column names mismatch!"
assert len(submission) == len(question_id_answer), "Row count mismatch!"

# Save the submission file
submission.to_csv('submission.csv', index=False)
print("Submission file saved successfully as 'submission.csv'!")

# Preview the first few rows of the submission
print("Preview of the submission:")
print(submission.head())




Submission file saved successfully as 'submission.csv'!
Preview of the submission:
  QuestionId_Answer MisconceptionId
0            1869_B  1507 1708 1507
1            1869_C  1507 1708 1507
2            1869_D  1507 1708 1507
3            1870_B   2142 143 2142
4            1870_C   2142 143 2142


In [41]:
# Load the sample submission file
sample_submission_path = "/kaggle/input/eedi-mining-misconceptions-in-mathematics/sample_submission.csv"  # Adjust the path as needed
sample_submission = pd.read_csv(sample_submission_path)

# Validate row count
print(f"Number of rows in submission: {len(submission)}")
print(f"Expected rows: {len(sample_submission)}")
assert len(submission) == len(sample_submission), "Row count mismatch!"



Number of rows in submission: 9
Expected rows: 9


In [42]:
# Define the expected number of rows
expected_rows = 9  # Replace with the correct number if needed

# Validate row count
print(f"Number of rows in submission: {len(submission)}")
print(f"Expected rows: {expected_rows}")
assert len(submission) == expected_rows, "Row count mismatch!"


Number of rows in submission: 9
Expected rows: 9


In [43]:
import pandas as pd

# Construct the submission DataFrame
submission = pd.DataFrame({
    "QuestionId_Answer": question_id_answer,
    "MisconceptionId": [" ".join(map(str, map(int, map(float, preds)))) for preds in y_test_predictions_expanded]
})

# Validate the submission format
assert list(submission.columns) == ['QuestionId_Answer', 'MisconceptionId'], "Column names mismatch!"

# Define the expected number of rows
expected_rows = 9  # Replace with the actual number based on the competition
print(f"Number of rows in submission: {len(submission)}")
print(f"Expected rows: {expected_rows}")
assert len(submission) == expected_rows, "Row count mismatch!"

# Save the submission file
submission.to_csv('submission.csv', index=False)
print("Submission file saved successfully as 'submission.csv'!")

# Preview the first few rows of the submission
print("Preview of the submission:")
print(submission.head())


Number of rows in submission: 9
Expected rows: 9
Submission file saved successfully as 'submission.csv'!
Preview of the submission:
  QuestionId_Answer MisconceptionId
0            1869_B  1507 1708 1507
1            1869_C  1507 1708 1507
2            1869_D  1507 1708 1507
3            1870_B   2142 143 2142
4            1870_C   2142 143 2142
