## Load Libraries

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the tab-separated ratings file; column names are supplied explicitly
# through `names`, so pandas does not take them from the file.
header = ['userid', 'movieid', 'rating', 'timestamp']
data = pd.read_csv(
    'Data/HugeRating.data',
    sep='\t',
    names=header,
)
data.head()  # preview the first five rows (head() defaults to n=5)

In [None]:
# Discard the timestamp column; every other column is carried over unchanged
features = data.drop(columns='timestamp')
features.head()  # preview the first five rows

In [None]:
# Import 'train_test_split'
from sklearn.model_selection import train_test_split

# The target is the rating column. It must be removed from the predictors,
# otherwise the model is trained on the very value it is asked to predict.
ratings = features['rating']
predictors = features.drop(columns='rating')

# Shuffle and split the data into training (80%) and testing (20%) subsets;
# the fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    predictors, ratings, test_size=0.2, random_state=42
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Number of trees in the forest
n_estimators = [int(x) for x in np.linspace(start=100, stop=120, num=10)]
# Number of features to consider at every split.
# 1.0 means "use all features", which is what 'auto' meant for regressors;
# 'auto' itself was deprecated in scikit-learn 1.1 and removed in 1.3.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 30, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the parameter grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

# Base estimator whose hyperparameters are searched; a fixed seed keeps the
# search reproducible.
rf = RandomForestRegressor(random_state=42)

# NOTE(review): the grid holds 10 * 2 * 12 * 3 * 3 * 2 = 4320 combinations,
# each fitted once per CV fold. If that is too slow on this dataset,
# RandomizedSearchCV over the same dictionary is the usual alternative.
clf_rf = GridSearchCV(rf, random_grid)
clf_rf.fit(X_train, y_train);

In [None]:
# Report the hyperparameters of the refitted best model, i.e. the estimator
# that clf_rf.predict() delegates to, rather than the search wrapper's own
# configuration.
print('Parameters currently in use:\n')
print(clf_rf.best_estimator_.get_params())

In [None]:
# Predict the target for every row of the held-out test features using the
# fitted search object.
predictions = clf_rf.predict(X_test) # Validate on the test set

In [None]:
# Element-wise absolute difference between predicted and true values
errors = np.abs(predictions - y_test)

In [None]:
# Mean absolute error is expressed in the same unit as the target (ratings)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'rating points.')

In [None]:
# Root Mean Squared Error calculation 

from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    """Return the root mean squared error between two sets of values.

    Both arguments are forwarded unchanged to ``mean_squared_error``; the
    square root of its result is returned as a Python float.
    """
    mse = mean_squared_error(prediction, ground_truth)
    return sqrt(mse)

In [None]:
# Report the random-forest RMSE on the test set
rf_rmse = rmse(predictions, y_test)
print(' RF RMSE: ' + str(rf_rmse))


In [None]:
# Absolute error of each sample as a percentage of its true value
mape = (errors / y_test) * 100
# "Accuracy" here is 100 minus the mean of those percentages
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')


In [None]:
# Performance Visualization

from sklearn.model_selection import cross_val_predict
from matplotlib import pyplot as plt#predicted = cross_val_predict(clf_rf, X_train, y_train, cv=10)

# Scatter of true vs. predicted values with a dashed y = x reference line;
# points on the line are perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, predictions, edgecolors=(0, 0, 0))
# The 'k--' format string already selects a black dashed line. Passing
# linestyle="None" as well would override it and hide the reference line.
value_range = [y_test.min(), y_test.max()]
ax.plot(value_range, value_range, 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()