In [301]:
import pandas as pd
import sklearn
import numpy as np
from sklearn import preprocessing
import sklearn.metrics as metrics
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [302]:
# Load the pre-computed feature table from the project's processed-data folder.
# NOTE(review): unpickling executes code embedded in the file, so this path must
# only ever point at a file produced by this project's own pipeline.
features = pd.read_pickle("../data/processed/features.pkl")
# Preview the first two rows to check the column layout.
features.head(2)

Unnamed: 0,installation_id,12 Monkeys,Air Show,All Star Sorting,Balancing Act,Bird Measurer (Assessment),Bottle Filler (Activity),Bubble Bath,Bug Measurer (Activity),Cart Balancer (Assessment),...,Tree Top City - Level 2,Tree Top City - Level 3,Watering Hole (Activity),Welcome to Lost Lagoon!,game_session,title,num_correct,num_incorrect,accuracy,accuracy_group
0,0006a69f,True,True,True,False,True,True,True,True,False,...,True,True,True,True,6bdf9623adc94d89,Mushroom Sorter (Assessment),1,0,1.0,3
1,0006a69f,True,True,True,False,True,True,True,True,False,...,True,True,True,True,77b8ee947eb84b4e,Bird Measurer (Assessment),0,11,0.0,0


In [303]:
# Target vector: the accuracy group stored for every row of the table.
# NOTE: this cell rebinds `features`, so it cannot be re-run without reloading the data first.
labels = features['accuracy_group'].to_numpy(copy=True)
# Drop the target together with the outcome columns (presumably derived from the
# same assessment attempt - TODO confirm) and the two identifier columns.
features = features.drop(
    columns=[
        'accuracy_group',
        'accuracy',
        'num_correct',
        'num_incorrect',
        'installation_id',
        'game_session',
    ]
)
# Remember the names of the remaining columns for later use.
feature_list = features.columns.tolist()

In [304]:
# One-hot encode the assessment title: the single `title` column is replaced by
# one `title_<name>` indicator column per distinct title.
features = pd.get_dummies(features, prefix=['title'], columns=['title'])
# Refresh the saved column names. The list captured before encoding still
# contains `title` and none of the new `title_*` columns, so it would not line
# up with the columns of the matrix the model is trained on.
feature_list = list(features.columns)

In [305]:
# Preview the encoded table: the `title` column is now a set of `title_*` indicator columns.
features.head(2)

Unnamed: 0,12 Monkeys,Air Show,All Star Sorting,Balancing Act,Bird Measurer (Assessment),Bottle Filler (Activity),Bubble Bath,Bug Measurer (Activity),Cart Balancer (Assessment),Cauldron Filler (Assessment),...,Tree Top City - Level 1,Tree Top City - Level 2,Tree Top City - Level 3,Watering Hole (Activity),Welcome to Lost Lagoon!,title_Bird Measurer (Assessment),title_Cart Balancer (Assessment),title_Cauldron Filler (Assessment),title_Chest Sorter (Assessment),title_Mushroom Sorter (Assessment)
0,True,True,True,False,True,True,True,True,False,False,...,True,True,True,True,True,0,0,0,0,1
1,True,True,True,False,True,True,True,True,False,False,...,True,True,True,True,True,1,0,0,0,0


In [306]:
# Convert to a numeric numpy array.
# The frame mixes boolean activity flags with 0/1 indicator columns; without an
# explicit dtype pandas typically interleaves such columns into an `object`
# array, which every downstream consumer then has to re-convert. Requesting
# float64 yields a plain numeric matrix (True -> 1.0, False -> 0.0) and fails
# early if a non-numeric column slipped through.
# np.asarray also accepts an existing array, so re-running this cell is harmless.
features = np.asarray(features, dtype=np.float64)

In [309]:
# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training (75%) and testing (25%) sets; random_state pins the shuffle so the split is reproducible.
# NOTE(review): rows are split individually. The raw table appears to contain several
# assessment rows per installation_id with identical activity flags, so the same player
# may end up in both sets - consider a group-aware split; TODO confirm.
# NOTE(review): the split is not stratified, although the classes look imbalanced
# in the recorded reports - verify whether stratification is wanted.
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [310]:
# Report the shape of every split, one line per array, to sanity-check the 75/25 partition.
print(
    *(
        '{} {}'.format(caption, array.shape)
        for caption, array in (
            ('Training Features Shape:', train_features),
            ('Training Labels Shape:', train_labels),
            ('Testing Features Shape:', test_features),
            ('Testing Labels Shape:', test_labels),
        )
    ),
    sep='\n',
)


Training Features Shape: (13267, 49)
Training Labels Shape: (13267,)
Testing Features Shape: (4423, 49)
Testing Labels Shape: (4423,)


In [312]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees.
# random_state pins the per-tree seeds; n_jobs=-1 builds the trees on all
# available cores instead of one, which only affects run time.
# NOTE(review): the target is a discrete group (0-3) but is modelled as a
# continuous value and rounded afterwards - confirm this is intended.
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42, n_jobs = -1)
# Train the model on training data (trailing `;` suppresses the estimator repr)
rf.fit(train_features, train_labels);

In [313]:
# Use the forest's predict method on the test data.
# The model is a regressor, so these are continuous values that are rounded
# later, when they are compared with the labels.
predictions = rf.predict(test_features)

In [314]:
def prepare_for_comparison(predictions, labels, verbose=True):
    """Build a two-column frame of rounded predictions next to rounded labels.

    Parameters
    ----------
    predictions : array-like
        Continuous model outputs. Each value is rounded to the nearest integer
        (numpy rounds exact halves to the nearest even value) and stored as int64.
    labels : array-like
        True targets, rounded the same way; their dtype is otherwise kept.
    verbose : bool, default True
        When true, print the last rows of the label column, which the function
        has always done. Pass False to silence it.

    Returns
    -------
    pandas.DataFrame
        Columns ``predictions`` and ``labels`` on a fresh 0..n-1 index, paired
        by position.

    Raises
    ------
    ValueError
        If the two inputs differ in length, or a prediction is NaN or infinite.
    """
    # np.asarray accepts lists and pandas objects as well as ndarrays, and it
    # discards any pandas index so the rows are always paired by position.
    predictions_formatted = np.asarray(predictions).ravel().round()
    labels_formatted = np.asarray(labels).ravel().round()

    # Concatenating frames of different length would pad with NaN and either
    # crash later or silently return NaN labels, so reject it up front.
    if len(predictions_formatted) != len(labels_formatted):
        raise ValueError(
            'predictions and labels must have the same length, got {} and {}'.format(
                len(predictions_formatted), len(labels_formatted)
            )
        )
    # An integer cast of NaN/inf is undefined in numpy; fail loudly instead.
    if not np.isfinite(predictions_formatted).all():
        raise ValueError('predictions contain NaN or infinite values')

    combined_df = pd.DataFrame(
        {
            'predictions': predictions_formatted.astype('int64'),
            'labels': labels_formatted,
        },
        columns=['predictions', 'labels'],
    )

    if verbose:
        print(combined_df[['labels']].tail())

    return combined_df

In [315]:
# Rounded test-set predictions next to the true test labels (also prints the last label rows).
combined = prepare_for_comparison(predictions,test_labels)

     labels
4418      3
4419      3
4420      3
4421      3
4422      3


In [316]:
# Predict on the data the forest was trained on, to compare training and test performance.
predictions_train = rf.predict(train_features)

In [317]:
# Rounded training-set predictions next to the true training labels (also prints the last label rows).
combined_train = prepare_for_comparison(predictions_train,train_labels)

      labels
13262      2
13263      3
13264      3
13265      3
13266      3


In [318]:
# Display the test-set comparison frame (pandas truncates the middle rows).
combined

Unnamed: 0,predictions,labels
0,3,3
1,0,3
2,2,2
3,0,0
4,3,3
...,...,...
4418,2,3
4419,2,3
4420,1,3
4421,3,3


In [319]:
# Check row count, dtypes and non-null counts of the test-set comparison frame.
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4423 entries, 0 to 4422
Data columns (total 2 columns):
predictions    4423 non-null int64
labels         4423 non-null int64
dtypes: int64(2)
memory usage: 69.2 KB


In [320]:
# Display the training-set comparison frame (pandas truncates the middle rows).
combined_train

Unnamed: 0,predictions,labels
0,1,1
1,3,3
2,1,0
3,3,3
4,3,3
...,...,...
13262,2,2
13263,3,3
13264,3,3
13265,3,3


In [321]:
# Check row count, dtypes and non-null counts of the training-set comparison frame.
combined_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13267 entries, 0 to 13266
Data columns (total 2 columns):
predictions    13267 non-null int64
labels         13267 non-null int64
dtypes: int64(2)
memory usage: 207.4 KB


In [322]:
# Exact-match accuracy on the test split: the share of rows whose rounded
# prediction equals the true label, rounded to two decimals.
comparison = combined['predictions'].eq(combined['labels'])
round(comparison.sum() / len(comparison), 2)

0.37

In [323]:
# Exact-match accuracy on the training split, computed the same way as for the
# test split; note that this overwrites `comparison`.
comparison = combined_train['predictions'].eq(combined_train['labels'])
round(comparison.sum() / len(comparison), 2)

0.59

In [324]:
# Per-class precision / recall / F1 on the test split.
print(
    metrics.classification_report(
        y_true=combined['labels'],
        y_pred=combined['predictions'],
    )
)

              precision    recall  f1-score   support

           0       0.79      0.24      0.36      1050
           1       0.23      0.43      0.30       611
           2       0.16      0.53      0.25       564
           3       0.76      0.38      0.51      2198

    accuracy                           0.37      4423
   macro avg       0.48      0.39      0.35      4423
weighted avg       0.62      0.37      0.41      4423



In [325]:
# Per-class precision / recall / F1 on the training split.
print(
    metrics.classification_report(
        y_true=combined_train['labels'],
        y_pred=combined_train['predictions'],
    )
)

              precision    recall  f1-score   support

           0       0.98      0.50      0.66      3179
           1       0.38      0.61      0.47      1800
           2       0.26      0.72      0.38      1641
           3       0.94      0.60      0.73      6647

    accuracy                           0.59     13267
   macro avg       0.64      0.61      0.56     13267
weighted avg       0.79      0.59      0.64     13267



In [326]:
# Share of each accuracy group (0 to 3) among the training labels, one value per line.
print(
    *(
        round(len(combined_train[combined_train['labels'] == group]) / len(combined_train), 2)
        for group in (0, 1, 2, 3)
    ),
    sep='\n',
)

0.24
0.14
0.12
0.5
