In [62]:
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import datetime as dt
import csv
from math import sqrt


def rfr_model(X, y, max_depth_gsc):
    # Perform Grid-Search
    gsc = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid={
            'max_depth': range(3, max_depth_gsc),
            'n_estimators': (10, 50, 100, 200, 500),
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

    grid_result = gsc.fit(X, y)
    best_params = grid_result.best_params_

    rfr = RandomForestRegressor(max_depth=best_params["max_depth"], n_estimators=best_params["n_estimators"],
                                random_state=False, verbose=False, max_features="sqrt")

    rfr.fit(X, y)

    return rfr

path = "/Users/Talha/Documents/Master (Big Data)/Semester 3/5001 - Foundations of Data Analytics/Individual Project/"
max_number_of_examples = 10000

columns = ['is_free', 'price', 'genres', 'categories', 'tags', 'purchase_date', 'release_date', 'total_positive_reviews', 'total_negative_reviews']
labels = ['playtime_forever']

columns_train = pd.read_csv(path + "train.csv", usecols = columns, nrows=max_number_of_examples).fillna(0)
columns_train['is_free'].replace(False, 0, regex=True, inplace=True)
columns_train['is_free'].replace(['True'], 1, regex=True, inplace=True)
split_columns_train = columns_train[['is_free', 'price', 'total_positive_reviews', 'total_negative_reviews']]
split_columns_train = split_columns_train.join(columns_train['genres'].str.get_dummies(sep=',')).join(columns_train['categories'].str.get_dummies(sep=','), lsuffix='_genre', rsuffix='_category').join(columns_train['tags'].str.get_dummies(sep=','), lsuffix='_notTag', rsuffix='_tag')
columns_train['release_year'] = pd.DatetimeIndex(columns_train['release_date']).year
columns_train['release_month'] = pd.DatetimeIndex(columns_train['release_date']).month
columns_train['release_day'] = pd.DatetimeIndex(columns_train['release_date']).day
columns_train['release_weekday'] = pd.DatetimeIndex(columns_train['release_date']).dayofweek
columns_train['purchase_year'] = pd.DatetimeIndex(columns_train['purchase_date']).year
columns_train['purchase_month'] = pd.DatetimeIndex(columns_train['purchase_date']).month
columns_train['purchase_day'] = pd.DatetimeIndex(columns_train['purchase_date']).day
columns_train['purchase_weekday'] = pd.DatetimeIndex(columns_train['purchase_date']).dayofweek
columns_train['purchase_date'] = pd.to_datetime(columns_train['purchase_date'])
train_features_data = columns_train[['release_year','release_month','release_day','release_weekday','purchase_year','purchase_month','purchase_day','purchase_weekday']].join(pd.get_dummies(split_columns_train))
train_features = train_features_data.columns
train_labels_data = pd.read_csv(path + "train.csv", usecols = labels, nrows=max_number_of_examples)

columns_test = pd.read_csv(path + "test.csv", usecols = columns, nrows=max_number_of_examples).fillna(0)
columns_test['is_free'].replace(False, 0, regex=True, inplace=True)
columns_test['is_free'].replace(True, 1, regex=True, inplace=True)
split_columns_test = columns_test[['is_free', 'price', 'total_positive_reviews', 'total_negative_reviews']]
split_columns_test = split_columns_test.join(columns_test['genres'].str.get_dummies(sep=',')).join(columns_test['categories'].str.get_dummies(sep=','), lsuffix='_genre', rsuffix='_category').join(columns_test['tags'].str.get_dummies(sep=','), lsuffix='_notTag', rsuffix='_tag')
columns_test['release_year'] = pd.DatetimeIndex(columns_test['release_date']).year
columns_test['release_month'] = pd.DatetimeIndex(columns_test['release_date']).month
columns_test['release_day'] = pd.DatetimeIndex(columns_test['release_date']).day
columns_test['release_weekday'] = pd.DatetimeIndex(columns_test['release_date']).dayofweek
columns_test['purchase_year'] = pd.DatetimeIndex(columns_test['purchase_date']).year
columns_test['purchase_month'] = pd.DatetimeIndex(columns_test['purchase_date']).month
columns_test['purchase_day'] = pd.DatetimeIndex(columns_test['purchase_date']).day
columns_test['purchase_weekday'] = pd.DatetimeIndex(columns_test['purchase_date']).dayofweek
columns_test['purchase_date'] = pd.to_datetime(columns_test['purchase_date'])
test_features_data = columns_test[['release_year','release_month','release_day','release_weekday','purchase_year','purchase_month','purchase_day','purchase_weekday']].join(pd.get_dummies(split_columns_test))
test_features = test_features_data.columns

for train_feature in train_features:
    if train_feature not in test_features_data:
        test_features_data[train_feature] = 0

for test_feature in test_features:
    if test_feature not in train_features:
        test_features_data.drop(columns=[test_feature], inplace=True)
        print("dropped " + test_feature)

dropped Cold War
dropped GameMaker
dropped Lore-Rich
dropped Modern
dropped On-Rails Shooter
dropped Sexual Content


In [65]:
max_depth_gsc = 20
rfr = rfr_model(train_features_data, train_labels_data, max_depth_gsc)

predictions = rfr.predict(test_features_data)

#predictions = rfr.predict(train_features_data)

predictions

  self.best_estimator_.fit(X, y, **fit_params)


array([0.95840057, 1.50646992, 0.9970157 , 2.55960494, 4.30132996,
       3.02010886, 2.65226881, 1.24594845, 3.69448328, 5.70826811,
       3.4378167 , 3.04602883, 6.35336976, 1.75018227, 1.46442918,
       1.22653316, 2.96124632, 4.46965885, 1.93025926, 0.83444975,
       1.70260218, 2.30073009, 1.14376013, 0.78101868, 1.67193317,
       1.65668611, 1.11489371, 4.06897879, 4.19361015, 2.25903517,
       2.70314864, 5.73553467, 2.33756552, 2.58880621, 2.9592602 ,
       2.8883327 , 3.09539511, 2.70830465, 2.89356724, 2.40968845,
       2.57414258, 3.60147526, 4.69455285, 1.47542334, 4.19896847,
       7.71966294, 1.28392911, 3.81706765, 3.17987422, 2.57392591,
       6.46220241, 1.71197653, 1.71227182, 2.22808617, 2.23633221,
       4.50971963, 1.63996223, 3.42096195, 3.09589073, 4.02164424,
       3.41318064, 1.32482159, 2.82585071, 6.80345105, 0.76214837,
       1.45838976, 2.06953407, 2.85343272, 4.17504228, 0.80064618,
       2.41912171, 2.87124117, 1.98663483, 6.98328544, 2.32055

In [64]:
sqrt(mean_squared_error(predictions, train_labels_data))

4.119664432543444

In [59]:
with open('sample_submission.csv', 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['id','playtime_forever'])
    id = 0
    for prediction in predictions:
        spamwriter.writerow([id, prediction])
        id+=1