In [1]:
import pandas as pd

# Load the dataset from the JSON file
df = pd.read_json('data.json')

# Display the first few rows to verify it loaded correctly
print(df.head())


                                       Misconception Misconception ID  \
0  when students don't understand how to represen...            MaE01   
1  when students don't understand how to represen...            MaE01   
2  when students don't understand how to represen...            MaE01   
3  when students don't understand how to represen...            MaE01   
4  Students misunderstand proportional relationsh...            MaE02   

          Topic  Example Number  \
0  Number sense               1   
1  Number sense               2   
2  Number sense               3   
3  Number sense               4   
4  Number sense               1   

                                            Question Incorrect Answer  \
0  What part is shaded?\nWrite a Fraction\n(Exerc...              1/3   
1  What part is shaded?\nWrite a Fraction\n(Exerc...              2/1   
2  What part is shaded?\nWrite a Fraction\n(Exerc...              2/2   
3  What part is shaded?\nWrite a Fraction\n(Exerc...      

In [2]:
# Filter for rows where 'Question image' is empty, meaning no image is required
text_only_df = df[df['Question image'] == ''].copy()

# Reset the index of the new DataFrame
text_only_df.reset_index(drop=True, inplace=True)

# Display the shape of the original and the new text-only DataFrame
print(f"Original dataset shape: {df.shape}")
print(f"Text-only dataset shape: {text_only_df.shape}")



Original dataset shape: (220, 12)
Text-only dataset shape: (199, 12)


In [3]:
# Save the cleaned data to a new CSV file
text_only_df.to_csv('prepared_dataset.csv', index=False)

print("Prepared dataset saved to prepared_dataset.csv")

Prepared dataset saved to prepared_dataset.csv


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the cleaned, text-only dataset
try:
    prepared_df = pd.read_csv('prepared_dataset.csv')
except FileNotFoundError:
    print("Error: 'prepared_dataset.csv' not found.")
    print("Please run the previous data preparation steps first.")
    # As a fallback for demonstration, create a dummy dataframe
    data = [{'Misconception ID': f'M{i}', 'Question': f'Q{i}', 'Incorrect Answer': f'IA{i}', 'Correct Answer': f'CA{i}'} for i in range(100)]
    prepared_df = pd.DataFrame(data)


# Split the data into training (80%) and validation (20%) sets
# Using random_state=42 ensures that the split is the same every time we run the code
train_df, validation_df = train_test_split(
    prepared_df,
    test_size=0.2,
    random_state=42
)

# Save the datasets to their respective CSV files
train_df.to_csv('train_data.csv', index=False)
validation_df.to_csv('validation_data.csv', index=False)

print(f"Successfully created training and validation files.")
print(f"Training set dimensions: {train_df.shape}")
print(f"Validation set dimensions: {validation_df.shape}")



Successfully created training and validation files.
Training set dimensions: (159, 12)
Validation set dimensions: (40, 12)


In [2]:
import pandas as pd

# Define the file path for your dataset
file_path = 'skill_builder_data.csv'

try:
    # Load the dataset using pandas, specifying the correct encoding
    # 'latin-1' is a common encoding for this dataset that avoids errors
    df = pd.read_csv(file_path, encoding='latin-1')

    # Extract the 'skill name' column, drop any missing values, and get unique entries
    unique_skills = df['skill_name'].dropna().unique()

    # Sort the skills alphabetically for easier reading
    unique_skills.sort()

    # Print each unique skill name
    print("--- Unique Skill Names in the ASSISTments Dataset ---")
    for skill in unique_skills:
        print(skill)

    # Print the total count of unique skills
    print(f"\nTotal number of unique skills: {len(unique_skills)}")

except FileNotFoundError:
    print(f"Error: The file was not found at '{file_path}'")
    print("Please make sure the CSV file is in the same directory as your script, or provide the full path.")
except Exception as e:
    print(f"An error occurred: {e}")



  df = pd.read_csv(file_path, encoding='latin-1')


--- Unique Skill Names in the ASSISTments Dataset ---
Absolute Value
Addition Whole Numbers
Addition and Subtraction Fractions
Addition and Subtraction Integers
Addition and Subtraction Positive Decimals
Algebraic Simplification
Algebraic Solving
Angles - Obtuse, Acute, and Right
Angles on Parallel Lines Cut by a Transversal
Area Circle
Area Irregular Figure
Area Parallelogram
Area Rectangle
Area Trapezoid
Area Triangle
Box and Whisker
Calculations with Similar Figures
Choose an Equation from Given Information
Circle Graph
Circumference 
Complementary and Supplementary Angles
Computation with Real Numbers
Congruence
Conversion of Fraction Decimals Percents
Counting Methods
D.4.8-understanding-concept-of-probabilities
Distributive Property
Divisibility Rules
Division Fractions
Effect of Changing Dimensions of a Shape Prportionally
Equation Solving More Than Two Steps
Equation Solving Two or Fewer Steps
Equivalent Fractions
Estimation
Exponents
Finding Percents
Finding Slope From Equatio

In [11]:
skill_counts = df['skill_name'].value_counts()
print(skill_counts)


skill_name
Equation Solving Two or Fewer Steps         24253
Percent Of                                  22931
Addition and Subtraction Integers           22895
Conversion of Fraction Decimals Percents    20992
Volume Rectangular Prism                    19489
                                            ...  
Midpoint                                       32
Distributive Property                          18
Finding Slope From Situation                    9
Reading a Ruler or Scale                        5
Finding Slope from Ordered Pairs                5
Name: count, Length: 110, dtype: int64


In [10]:
template_skills = df.groupby('template_id')['skill_name'].agg(lambda x: x.value_counts().index[0] if not x.empty else None)
print(template_skills.head(10))

IndexError: index 0 is out of bounds for axis 0 with size 0

In [9]:
# Compare attempt counts for correct vs. incorrect answers for a specific skill
skill_data = df[df['skill_name'] == 'Pythagorean Theorem']
behavior_comparison = skill_data.groupby('correct')['attempt_count'].mean()
print(behavior_comparison)


correct
0    1.960179
1    1.003133
Name: attempt_count, dtype: float64
