In [162]:
import pandas as pd
import numpy as np
import sklearn
import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

In [163]:
# Import data for training and prediction
train = pd.read_csv('data_for_training.csv')
predictdf = pd.read_csv('data_for_prediction.csv')

In [164]:
# Remove unnecessary columns
y = train['performance_task']
final_id = pd.DataFrame(predictdf['user_exercise_id'])

train = train.drop(['performance_task', 'exercise_duration', 'created_at', 'user_exercise_id', 'exercise_time_s'], axis = 1)
predictdf = predictdf.drop(['created_at', 'user_exercise_id'], axis = 1)

print(train.head(1))
print('------------------------------------------------')
print(predictdf.head(1))
print('------------------------------------------------')
print("Shape of train:", train.shape)
print("Shape of predictdf:", predictdf.shape)

   user_id  nr_tasks    time_obs  item_id       field            area_name  \
0        1         1  2021-04-12       23  vocabulary  Speaking Strategies   

   area_order          section_name  section_order  
0          20  Asking and Answering            130  
------------------------------------------------
   user_id  nr_tasks    time_obs  item_id    field area_name  area_order  \
0        3        15  2021-02-28      126  grammar      Verb          90   

  section_name  section_order  
0    Separable            550  
------------------------------------------------
Shape of train: (1932, 9)
Shape of predictdf: (1288, 9)


In [165]:
# Label encoded
#train_encoded = pd.get_dummies(train, drop_first = True)
#predictdf_encoded = pd.get_dummies(predictdf, drop_first = True)
train = train.apply(preprocessing.LabelEncoder().fit_transform)
predictdf = predictdf.apply(preprocessing.LabelEncoder().fit_transform)
print("Shape of train:", train.shape)
print("Shape of predictdf:", predictdf.shape)

Shape of train: (1932, 9)
Shape of predictdf: (1288, 9)


In [166]:
# Make sure that date is in datetime format
train['time_obs'] = pd.to_datetime(train['time_obs'])

# Transform date into string - train
train['time_string'] = train['time_obs'].dt.strftime('%Y%m%d')
train = train.drop('time_obs', axis = 1)
train.head()


# Make sure that date is in datetime format
predictdf['time_obs'] = pd.to_datetime(predictdf['time_obs'])

# Transform date into string - predict
predictdf['time_string'] = predictdf['time_obs'].dt.strftime('%Y%m%d')
predictdf = predictdf.drop('time_obs', axis = 1)
predictdf.head()

print("Shape of train:", train.shape)
print("Shape of predictdf:", predictdf.shape)

Shape of train: (1932, 9)
Shape of predictdf: (1288, 9)


In [167]:
# Split the dataset into the training dataset and the test dataset.
def split_data(X, y, split_coeff):
    N, _ = X.shape 
    train_size = int(split_coeff * N) 
    X_train = X[:train_size] 
    y_train = y[:train_size]
    X_test = X[train_size:] 
    y_test = y[train_size:]
    
    return X_train, y_train, X_test, y_test

# Define X and y and call the split fct
X = train
X_train, y_train, X_test, y_test = split_data(X, y, 0.8) 

# check the size of the splitted dataset
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)
print("Shape of predictdf:", predictdf.shape)

Shape of X_train: (1545, 9)
Shape of y_train: (1545,)
Shape of X_test: (387, 9)
Shape of y_test: (387,)
Shape of predictdf: (1288, 9)


In [168]:
############
# MODEL 2: #
############
# Random Forest without accounting for different agents

# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 100, 
                           random_state = 42, 
                           max_depth = 5, 
                           min_samples_split =2)

# Train the model on training data
rf.fit(X_train, y_train)

# Use the forest's predict method on the test data
predict_rf = rf.predict(X_test)

# Calculate the absolute errors
errors_rf = abs(predict_rf - y_test)

# Print out the mean absolute error (mae)
print('Random Forest Mean Absolute Error on test set:', round(np.mean(errors_rf), 2))

# Output prediction
predict_output = pd.DataFrame(rf.predict(predictdf))
final_output = pd.concat([final_id,predict_output],axis = 1)

print("Shape of final_output:", final_output.shape)
print(final_output.head(10))

Random Forest Mean Absolute Error on test set: 39.5
Shape of final_output: (1288, 2)
   user_exercise_id           0
0                 3   12.332651
1                 6  212.172633
2                12   15.583501
3                14   15.583501
4                15   15.583501
5                19   16.168810
6                21   13.016364
7                22   13.016364
8                26   16.168810
9                28   15.583501


In [169]:
rfc=RandomForestRegressor(random_state=42)

param_grid = { 
    'n_estimators': [50, 100, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'min_samples_split' : [2,4,8],
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid={'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_split': [2, 4, 8],
                         'n_estimators': [50, 100, 200, 500]})

In [170]:
# Print the result of the Grid search - the optimal parameters
CV_rfc.best_params_

{'max_depth': 6,
 'max_features': 'sqrt',
 'min_samples_split': 8,
 'n_estimators': 100}

In [171]:
# use the best hyperparameters from the gridsearch
rfc1 = RandomForestRegressor(random_state=42,
                             max_features='auto',
                             n_estimators= 100,
                             max_depth=6,
                             min_samples_split=8
                            )

# train the model
rfc1.fit(X_train, y_train)

# Use the forest's predict method on the test data
pred = rfc1.predict(X_test)

# Calculate the absolute errors
errors_rf2 = abs(pred - y_test)

# Print out the mean absolute error (mae)
print('Random Forest Grid Search Mean Absolute Error on test set:', round(np.mean(errors_rf2), 2))

# Output prediction
predict_output2 = pd.DataFrame(rfc1.predict(predictdf))
final_output2 = pd.concat([final_id,predict_output2],axis = 1)

print("Shape of final_output:", final_output2.shape)
print(final_output.head(10))



Random Forest Grid Search Mean Absolute Error on test set: 38.28
Shape of final_output: (1288, 2)
   user_exercise_id           0
0                 3   12.332651
1                 6  212.172633
2                12   15.583501
3                14   15.583501
4                15   15.583501
5                19   16.168810
6                21   13.016364
7                22   13.016364
8                26   16.168810
9                28   15.583501
