### Import the data set and build the X and y matrices for training and testing

In [5]:
import bz2
import zipfile
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Load the data set.
# The path is assembled with os.path.join so the same cell runs on Windows, macOS and
# Linux; it is resolved relative to the notebook's working directory.
df = pd.read_csv(os.path.join("..", "..", "data", "all_worker_sub10.csv"))
# df = pd.read_csv("00-easy_hard_question_cutoff/data/all_worker_sub10.csv") #mac
# Keep only the modelling columns: the two features plus the target label.
columns_to_keep = ['Median', 'direction_pct', 'question_type']
#columns_to_keep = ['Mean', 'Median', 'Mode', 'SD', 'direction_pct','complete_time_median', 'complete_time_sd', 'question_type']
df2 = df.loc[:, columns_to_keep]

# Separate features and target variable
X = df2.drop('question_type', axis=1)  # Features (all columns except "question_type")
y = df2['question_type']               # Target variable ("question_type")

# Class balance of the target.
# NOTE: value_counts(normalize=True) yields fractions in [0, 1], not values in [0, 100],
# even though the column is labelled 'Percentage'.
y_counts = pd.DataFrame(y.value_counts(normalize=True))
y_counts.columns = ['Percentage']

# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [6]:
# this is a test

In [7]:
# Inspect the training feature matrix (long frames are shown truncated in the rendered output)
X_train

Unnamed: 0,Median,direction_pct
236,-3.0,100
159,-1.5,80
146,1.0,60
69,1.0,60
2,-2.5,90
...,...,...
20,3.0,100
188,0.0,40
71,1.5,50
106,0.0,40


### Fit a decision tree

In [8]:

# Fit a shallow (depth-3) decision tree on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
clf = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)

# Score the held-out rows and show per-class precision / recall / F1.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

        easy       0.74      0.84      0.78        37
        hard       0.84      0.74      0.79        43

    accuracy                           0.79        80
   macro avg       0.79      0.79      0.79        80
weighted avg       0.79      0.79      0.79        80



In [9]:
from sklearn.tree import export_text

# Render the fitted tree as indented text rules, labelling each split with its
# feature column name, then print the result.
feature_labels = X.columns.tolist()
tree_rules = export_text(clf, feature_names=feature_labels)
print(tree_rules)

|--- direction_pct <= 75.00
|   |--- Median <= -0.75
|   |   |--- Median <= -1.25
|   |   |   |--- class: easy
|   |   |--- Median >  -1.25
|   |   |   |--- class: hard
|   |--- Median >  -0.75
|   |   |--- Median <= 1.25
|   |   |   |--- class: hard
|   |   |--- Median >  1.25
|   |   |   |--- class: easy
|--- direction_pct >  75.00
|   |--- Median <= -1.75
|   |   |--- class: easy
|   |--- Median >  -1.75
|   |   |--- Median <= 2.50
|   |   |   |--- class: easy
|   |   |--- Median >  2.50
|   |   |   |--- class: easy



### Feature selection

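Forward feature selection using a decision tree: features are added greedily, one at a time, each time keeping the candidate that gives the highest weighted F1-score.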

In [10]:
# Greedy forward feature selection: on each pass, try adding every not-yet-selected
# feature to the current set, fit an unrestricted decision tree on that candidate set,
# and keep the feature whose candidate set achieves the highest weighted F1-score.
#
# NOTE(review): candidates are scored on X_test / y_test, so the test set is also used
# to choose the features and the reported F1-scores are optimistically biased. Consider
# scoring on a separate validation split or with cross-validation on the training data.
# NOTE(review): this loop rebinds the notebook-level names `clf` and `y_pred`; any cell
# that reads them afterwards sees the last tree fitted here.
selected_features = []

# One pass per feature, so every feature ends up ranked in order of selection.
for i in range(len(X.columns)):
    best_feature = None
    # Start below any attainable F1-score so a feature is always chosen, even when every
    # candidate scores exactly 0. Starting at 0 with a strict '>' comparison would leave
    # best_feature as None and append None to selected_features.
    best_score = float("-inf")

    for feature in X.columns:
        # Skip the features that have already been selected
        if feature in selected_features:
            continue

        # Candidate set = features chosen so far plus the one under evaluation
        candidate_features = selected_features + [feature]

        # Train a decision tree model on the candidate features
        clf = DecisionTreeClassifier(random_state=42)
        clf.fit(X_train[candidate_features], y_train)

        # Predict the class labels for the test set
        y_pred = clf.predict(X_test[candidate_features])

        # Weighted F1 averages the per-class F1-scores by class support
        score = f1_score(y_test, y_pred, average='weighted')

        # Strict '>' keeps the earliest column on ties
        if score > best_score:
            best_feature = feature
            best_score = score

    # Add the best feature to the list of selected features
    selected_features.append(best_feature)
    print(f'Selected feature #{i+1}: {best_feature} (F1-score: {best_score:.3f})')

Selected feature #1: direction_pct (F1-score: 0.759)
Selected feature #2: Median (F1-score: 0.763)
