In [85]:
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import datetime as dt
import csv
from math import sqrt


def rfr_model(X, y, max_depth_gsc):
    # Perform Grid-Search
    gsc = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid={
            'max_depth': range(3, max_depth_gsc),
            'n_estimators': (10, 50, 100, 200, 500),
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

    grid_result = gsc.fit(X, y)
    best_params = grid_result.best_params_

    rfr = RandomForestRegressor(max_depth=best_params["max_depth"], n_estimators=best_params["n_estimators"],
                                random_state=False, verbose=False, max_features="sqrt")

    rfr.fit(X, y)

    return rfr

path = "/Users/Talha/Documents/Master (Big Data)/Semester 3/5001 - Foundations of Data Analytics/Individual Project/"
max_number_of_examples = 10000

columns = ['is_free', 'price', 'genres', 'categories', 'tags', 'purchase_date', 'release_date', 'total_positive_reviews', 'total_negative_reviews']
labels = ['playtime_forever']

columns_train = pd.read_csv(path + "train.csv", usecols = columns, nrows=max_number_of_examples).fillna(0)
columns_train['is_free'].replace(False, 0, regex=True, inplace=True)
columns_train['is_free'].replace(['True'], 1, regex=True, inplace=True)
split_columns_train = columns_train[['is_free', 'price', 'total_positive_reviews', 'total_negative_reviews']]
split_columns_train = split_columns_train.join(columns_train['genres'].str.get_dummies(sep=',')).join(columns_train['categories'].str.get_dummies(sep=','), lsuffix='_genre', rsuffix='_category').join(columns_train['tags'].str.get_dummies(sep=','), lsuffix='_notTag', rsuffix='_tag')
columns_train['release_year'] = pd.DatetimeIndex(columns_train['release_date']).year
columns_train['release_month'] = pd.DatetimeIndex(columns_train['release_date']).month
columns_train['release_day'] = pd.DatetimeIndex(columns_train['release_date']).day
columns_train['release_weekday'] = pd.DatetimeIndex(columns_train['release_date']).dayofweek
columns_train['purchase_year'] = pd.DatetimeIndex(columns_train['purchase_date']).year
columns_train['purchase_month'] = pd.DatetimeIndex(columns_train['purchase_date']).month
columns_train['purchase_day'] = pd.DatetimeIndex(columns_train['purchase_date']).day
columns_train['purchase_weekday'] = pd.DatetimeIndex(columns_train['purchase_date']).dayofweek
columns_train['purchase_date'] = pd.to_datetime(columns_train['purchase_date'])
train_features_data = columns_train[['release_year','release_month','release_day','release_weekday','purchase_year','purchase_month','purchase_day','purchase_weekday']].join(pd.get_dummies(split_columns_train))
train_features = train_features_data.columns
train_labels_data = pd.read_csv(path + "train.csv", usecols = labels, nrows=max_number_of_examples)

columns_test = pd.read_csv(path + "test.csv", usecols = columns, nrows=max_number_of_examples).fillna(0)
columns_test['is_free'].replace(False, 0, regex=True, inplace=True)
columns_test['is_free'].replace(True, 1, regex=True, inplace=True)
split_columns_test = columns_test[['is_free', 'price', 'total_positive_reviews', 'total_negative_reviews']]
split_columns_test = split_columns_test.join(columns_test['genres'].str.get_dummies(sep=',')).join(columns_test['categories'].str.get_dummies(sep=','), lsuffix='_genre', rsuffix='_category').join(columns_test['tags'].str.get_dummies(sep=','), lsuffix='_notTag', rsuffix='_tag')
columns_test['release_year'] = pd.DatetimeIndex(columns_test['release_date']).year
columns_test['release_month'] = pd.DatetimeIndex(columns_test['release_date']).month
columns_test['release_day'] = pd.DatetimeIndex(columns_test['release_date']).day
columns_test['release_weekday'] = pd.DatetimeIndex(columns_test['release_date']).dayofweek
columns_test['purchase_year'] = pd.DatetimeIndex(columns_test['purchase_date']).year
columns_test['purchase_month'] = pd.DatetimeIndex(columns_test['purchase_date']).month
columns_test['purchase_day'] = pd.DatetimeIndex(columns_test['purchase_date']).day
columns_test['purchase_weekday'] = pd.DatetimeIndex(columns_test['purchase_date']).dayofweek
columns_test['purchase_date'] = pd.to_datetime(columns_test['purchase_date'])
test_features_data = columns_test[['release_year','release_month','release_day','release_weekday','purchase_year','purchase_month','purchase_day','purchase_weekday']].join(pd.get_dummies(split_columns_test))
test_features = test_features_data.columns

for train_feature in train_features:
    if train_feature not in test_features_data:
        test_features_data[train_feature] = 0

for test_feature in test_features:
    if test_feature not in train_features:
        test_features_data.drop(columns=[test_feature], inplace=True)
        print("dropped " + test_feature)

dropped Cold War
dropped GameMaker
dropped Lore-Rich
dropped Modern
dropped On-Rails Shooter
dropped Sexual Content


In [117]:
import xgboost as xgb
import numpy as np

data_dmatrix = xgb.DMatrix(data=train_features_data.as_matrix(),label=train_labels_data.as_matrix())

xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.3,
                max_depth = 15, alpha = 10, n_estimators = 30)

xg_reg.fit(train_features_data.as_matrix(),train_labels_data.as_matrix())

preds = xg_reg.predict(train_features_data.as_matrix())

rmse = np.sqrt(mean_squared_error(train_labels_data, preds))
print("RMSE: %f" % (rmse))

predictions = xg_reg.predict(test_features_data.as_matrix())

predictions

  after removing the cwd from sys.path.
  if __name__ == '__main__':


RMSE: 0.050846


  # This is added back by InteractiveShellApp.init_path()
  app.launch_new_instance()


array([ 0.40978628,  7.289542  ,  0.565995  ,  3.854947  ,  8.704012  ,
        1.7966653 ,  2.0399616 ,  0.07647288,  7.282763  , 13.214488  ,
        1.204928  ,  5.6831737 ,  1.4641352 ,  0.6994761 ,  1.9721576 ,
        0.1347959 ,  0.70147103,  2.0801146 ,  1.9808253 ,  6.5152626 ,
        1.0223874 ,  0.78542435,  1.0969248 ,  1.7140564 ,  0.7539133 ,
        1.1415104 ,  1.8233892 ,  5.1672606 ,  3.7001486 ,  1.397624  ,
        8.5148735 , 25.411613  ,  3.2255855 ,  1.8240457 ,  5.770246  ,
        0.93418527,  3.7800803 ,  6.7130213 ,  2.750111  ,  0.26453367,
        7.7229524 ,  2.635842  ,  1.7413142 ,  0.11974877,  2.7805154 ,
        1.1580615 ,  6.5359073 ,  2.701671  ,  0.71244115, 10.6803875 ,
        1.4903682 ,  2.066809  ,  1.0111904 ,  1.5797563 ,  4.128828  ,
        1.8006543 ,  1.4864199 ,  8.097404  ,  0.790539  ,  0.40630394,
        4.815832  ,  1.4414582 ,  1.2577515 ,  5.5719476 ,  0.22056511,
        0.65224123,  1.0242052 ,  0.3891423 , 18.555786  ,  2.22

In [118]:
with open('sample_submission.csv', 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['id','playtime_forever'])
    id = 0
    for prediction in predictions:
        spamwriter.writerow([id, prediction])
        id+=1