In [503]:
import csv

#Standard data-sci libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [504]:
#SKLearn
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score, GridSearchCV
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesClassifier, BaggingRegressor, GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, HuberRegressor, Lars, LassoLars,  ElasticNet, PassiveAggressiveRegressor, RANSACRegressor, SGDRegressor, TheilSenRegressor
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor 
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn import metrics
from sklearn.svm import SVR

#XGBoost
import xgboost as xgb

In [509]:
#Dropping rows with outliers
def clean_outliers(df, col):
    iqr = df[col].quantile(0.75) - df[col].quantile(0.25)
    tolerance_threshold = 2
    low  = df[col].quantile(0.25) - tolerance_threshold*iqr
    high = df[col].quantile(0.75) + tolerance_threshold*iqr
    df_cleaned = df.loc[(df[col] > low) & (df[col] < high)]
    return df_cleaned

def rmse(a,b):
    return metrics.mean_squared_error(a,b)**0.5

In [510]:
train_data = ("../train.csv")
test_data = ("../test.csv")


In [511]:
#Train data
df = pd.read_csv(train_data)

In [512]:
#cleaning and preprocessing train data
df = pd.read_csv(train_data)

#remove columns with more than a certain proprotion of missing values
missing_value_proportion = 0.75
df = df[df.columns[df.isnull().mean() < missing_value_proportion]]
df = df.loc[df.isnull().mean(axis=1) < missing_value_proportion]

#drop rows with NaNs
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

#replace booleans with ints
df.replace({False: 0, True: 1}, inplace=True)

df["purchase_date"] = pd.to_datetime(df["purchase_date"])
df["release_date"] = pd.to_datetime(df["release_date"]) 

df['release_year'] = pd.DatetimeIndex(df['release_date']).year
df['release_month'] = pd.DatetimeIndex(df['release_date']).month
df['release_day'] = pd.DatetimeIndex(df['release_date']).day
df['release_weekday'] = pd.DatetimeIndex(df['release_date']).dayofweek
df['purchase_year'] = pd.DatetimeIndex(df['purchase_date']).year
df['purchase_month'] = pd.DatetimeIndex(df['purchase_date']).month
df['purchase_day'] = pd.DatetimeIndex(df['purchase_date']).day
df['purchase_weekday'] = pd.DatetimeIndex(df['purchase_date']).dayofweek

df["purchase_release_diff"] = df.apply(lambda a: (a["purchase_date"] - a["release_date"]).days, axis=1)

df["purchase_release_diff_year"] = df["purchase_date"].dt.year - df["release_date"].dt.year

df.drop(columns=["purchase_date", "release_date"], inplace=True)

df["positive_ratio"] = df.apply(lambda a: a["total_positive_reviews"]\
                                               /(a["total_negative_reviews"] + a["total_positive_reviews"])\
                                               if a["total_negative_reviews"] + a["total_positive_reviews"] != 0\
                                               else 0.5, axis=1)

df["negative_ratio"] = df.apply(lambda a: a["total_negative_reviews"]\
                                               /(a["total_negative_reviews"] + a["total_positive_reviews"])\
                                               if a["total_negative_reviews"] + a["total_positive_reviews"] != 0\
                                               else 0.5, axis=1)

df["total_reviews"] = df.apply(lambda a: (a["total_negative_reviews"] + a["total_positive_reviews"]),\
                                               axis=1)

df.drop(columns=["total_positive_reviews", "total_negative_reviews"], inplace=True)

genres = df["genres"].str.get_dummies(",")
genres.columns = ['genre_' + str(col) for col in genres.columns]
categories = df["categories"].str.get_dummies(",")
categories.columns = ['category_' + str(col) for col in categories.columns]
tags = df["tags"].str.get_dummies(",")
tags.columns = ['tag_' + str(col) for col in tags.columns]
preprocessed_df = pd.concat([df, genres, categories, tags], axis=1)

preprocessed_df.drop(columns=["id", "is_free", "genres", "categories", "tags"], inplace=True)

preprocessed_df[["total_reviews", "price"]] = preprocessed_df[["total_reviews", "price"]].astype("int")

preprocessed_df = clean_outliers(preprocessed_df,'price')
preprocessed_df = preprocessed_df[preprocessed_df['playtime_forever'] < 60]
preprocessed_df.reset_index(inplace=True, drop=True)

In [513]:
#Test data
test_df = pd.read_csv(test_data, parse_dates = ['purchase_date', 'release_date'])

In [514]:
#cleaning and preprocessing test data
test_df["purchase_date"] = pd.to_datetime(test_df["purchase_date"])
test_df["release_date"] = pd.to_datetime(test_df["release_date"])

test_df['release_year'] = pd.DatetimeIndex(test_df['release_date']).year
test_df['release_month'] = pd.DatetimeIndex(test_df['release_date']).month
test_df['release_day'] = pd.DatetimeIndex(test_df['release_date']).day
test_df['release_weekday'] = pd.DatetimeIndex(test_df['release_date']).dayofweek
test_df['purchase_year'] = pd.DatetimeIndex(test_df['purchase_date']).year
test_df['purchase_month'] = pd.DatetimeIndex(test_df['purchase_date']).month
test_df['purchase_day'] = pd.DatetimeIndex(test_df['purchase_date']).day
test_df['purchase_weekday'] = pd.DatetimeIndex(test_df['purchase_date']).dayofweek

test_df["purchase_release_diff"] = test_df.apply(lambda a: (a["purchase_date"] - a["release_date"]).days, axis=1)

test_df["purchase_release_diff_years"] = test_df["purchase_date"].dt.year - test_df["release_date"].dt.year

test_df.drop(columns=["purchase_date", "release_date"], inplace=True)

test_df["purchase_release_diff_years"].fillna(test_df["purchase_release_diff_years"].median(), inplace=True)
test_df["purchase_release_diff"].fillna(test_df["purchase_release_diff"].median(), inplace=True)
test_df["total_positive_reviews"].fillna(test_df["total_positive_reviews"].median(), inplace=True)
test_df["total_negative_reviews"].fillna(test_df["total_negative_reviews"].median(), inplace=True)

test_df.reset_index(drop=True, inplace=True)

test_df.replace({False: 0, True: 1}, inplace=True)

test_df["positive_ratio"] = test_df.apply(lambda a: a["total_positive_reviews"]\
                                               /(a["total_negative_reviews"] + a["total_positive_reviews"])\
                                               if a["total_negative_reviews"] + a["total_positive_reviews"] != 0\
                                               else 0.5, axis=1)


test_df["negative_ratio"] = test_df.apply(lambda a: a["total_negative_reviews"]\
                                               /(a["total_negative_reviews"] + a["total_positive_reviews"])\
                                               if a["total_negative_reviews"] + a["total_positive_reviews"] != 0\
                                               else 0.5, axis=1)

test_df["total_reviews"] = test_df.apply(lambda a: (a["total_negative_reviews"] + a["total_positive_reviews"]),\
                                               axis=1)

test_df.drop(columns=["total_positive_reviews", "total_negative_reviews"], inplace=True)

genres_test = test_df["genres"].str.get_dummies(",")
genres_test.columns = ['genre_' + str(col) for col in genres_test.columns]
categories_test = test_df["categories"].str.get_dummies(",")
categories_test.columns = ['category_' + str(col) for col in categories_test.columns]
tags_test = test_df["tags"].str.get_dummies(",")
tags_test.columns = ['tag_' + str(col) for col in tags_test.columns]
preprocessed_test_df = pd.concat([test_df, genres_test, categories_test, tags_test], axis=1)

preprocessed_test_df.drop(columns=["id","is_free", "genres", "categories", "tags"], inplace=True)

preprocessed_test_df[["total_reviews", "price"]] = preprocessed_test_df[["total_reviews", "price"]].astype("int")

In [515]:
train_x = preprocessed_df.drop(['playtime_forever'], axis=1)
train_y = preprocessed_df['playtime_forever']
test_x = preprocessed_test_df

In [516]:
test_features = test_x.columns.tolist()
for train_feature in train_x.columns.tolist():
    if train_feature not in test_features:
        test_x[train_feature] = 0

train_features = train_x.columns.tolist()
for test_feature in test_x.columns.tolist():
    if test_feature not in train_features:
        test_x.drop(columns=[test_feature],inplace=True)

In [517]:
test_x.fillna(test_x.median(), inplace=True)


In [518]:
print(train_x.shape,test_x.shape,train_y.shape)

(339, 375) (90, 375) (339,)


In [519]:
# Splitting the dataset

X_train, X_test, Y_train, Y_test = train_test_split(train_x,train_y, test_size=0.2,random_state=0)

In [520]:
print(X_train.shape,X_test.shape,Y_test.shape,Y_train.shape)


(271, 375) (68, 375) (68,) (271,)


In [521]:
mm_scaler = MinMaxScaler()
X_train = mm_scaler.fit_transform(X_train)
X_test = mm_scaler.fit_transform(X_test)

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [522]:
def allmodels():
    classifiers = [
    AdaBoostRegressor(),
    BaggingRegressor(),
    ExtraTreesRegressor(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    PassiveAggressiveRegressor(),
    SGDRegressor(),
    TheilSenRegressor(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(),
    ExtraTreeRegressor()
]
    names = [
    "AdaBoostRegressor",
    "BaggingRegressor",
    "ExtraTreesRegressor",
    "GradientBoostingRegressor",
    "RandomForestRegressor",
    "PassiveAggressiveRegressor",
    "SGDRegressor",
    "TheilSenRegressor",
    "KNeighborsRegressor",
    "DecisionTreeRegressor",
    "ExtraTreeRegressor"
]
    return classifiers,names
classifiers,names=allmodels()

In [523]:
epochs=10
for i in range(epochs):  
    result=[]
    for classifier,name in zip(classifiers,names):
        classifier.fit(X_train, Y_train)
        a=classifier.predict(X_test)
        a[a<0]=0
        result.append(rmse(a,Y_test))
    model_result=pd.DataFrame(data=result,index=names,columns=['rmse']).sort_values(by="rmse" , ascending=True)
    print(model_result)



                                rmse
SGDRegressor                2.688517
KNeighborsRegressor         2.964260
ExtraTreesRegressor         3.359611
AdaBoostRegressor           3.440684
ExtraTreeRegressor          4.369051
BaggingRegressor            4.541529
RandomForestRegressor       4.975691
PassiveAggressiveRegressor  4.981368
GradientBoostingRegressor   5.176601
TheilSenRegressor           7.713475
DecisionTreeRegressor       9.670079




                                rmse
SGDRegressor                2.661362
KNeighborsRegressor         2.964260
ExtraTreesRegressor         3.335569
PassiveAggressiveRegressor  3.578486
AdaBoostRegressor           3.883143
ExtraTreeRegressor          4.573147
BaggingRegressor            4.761577
GradientBoostingRegressor   4.870869
RandomForestRegressor       5.004533
DecisionTreeRegressor       5.030778
TheilSenRegressor           7.713475




                                 rmse
SGDRegressor                 2.732212
ExtraTreesRegressor          2.871664
KNeighborsRegressor          2.964260
AdaBoostRegressor            3.500330
ExtraTreeRegressor           3.947558
BaggingRegressor             4.072061
GradientBoostingRegressor    4.830671
PassiveAggressiveRegressor   5.001525
RandomForestRegressor        5.981867
TheilSenRegressor            7.713475
DecisionTreeRegressor       11.260459




                                rmse
SGDRegressor                2.490569
KNeighborsRegressor         2.964260
ExtraTreesRegressor         3.169299
AdaBoostRegressor           3.579392
ExtraTreeRegressor          4.000032
PassiveAggressiveRegressor  4.318926
BaggingRegressor            4.571840
RandomForestRegressor       4.617190
GradientBoostingRegressor   4.951320
TheilSenRegressor           7.713475
DecisionTreeRegressor       9.733892




                                rmse
SGDRegressor                2.514484
KNeighborsRegressor         2.964260
AdaBoostRegressor           3.751924
ExtraTreesRegressor         3.796040
PassiveAggressiveRegressor  4.211945
RandomForestRegressor       4.472590
GradientBoostingRegressor   4.755455
ExtraTreeRegressor          4.916343
BaggingRegressor            5.580810
TheilSenRegressor           7.713475
DecisionTreeRegressor       7.964250




                                rmse
SGDRegressor                2.379917
KNeighborsRegressor         2.964260
AdaBoostRegressor           3.467721
ExtraTreesRegressor         3.892705
GradientBoostingRegressor   4.390085
RandomForestRegressor       4.418415
ExtraTreeRegressor          5.139208
BaggingRegressor            5.181911
PassiveAggressiveRegressor  5.202461
TheilSenRegressor           7.713475
DecisionTreeRegressor       8.873322




                                rmse
SGDRegressor                2.515214
KNeighborsRegressor         2.964260
ExtraTreesRegressor         3.268196
AdaBoostRegressor           3.368789
ExtraTreeRegressor          3.785864
RandomForestRegressor       5.281030
GradientBoostingRegressor   5.444391
BaggingRegressor            5.691904
PassiveAggressiveRegressor  6.621171
TheilSenRegressor           7.713475
DecisionTreeRegressor       9.160586




                                rmse
SGDRegressor                2.802681
KNeighborsRegressor         2.964260
ExtraTreesRegressor         3.555549
AdaBoostRegressor           3.565468
ExtraTreeRegressor          3.616393
PassiveAggressiveRegressor  3.713284
BaggingRegressor            4.606772
RandomForestRegressor       4.808504
GradientBoostingRegressor   5.197594
DecisionTreeRegressor       5.903666
TheilSenRegressor           7.713475




                                rmse
SGDRegressor                2.841485
KNeighborsRegressor         2.964260
AdaBoostRegressor           3.444929
ExtraTreesRegressor         3.455291
BaggingRegressor            4.075843
GradientBoostingRegressor   4.388394
RandomForestRegressor       4.813466
DecisionTreeRegressor       4.834594
ExtraTreeRegressor          4.890253
TheilSenRegressor           7.713475
PassiveAggressiveRegressor  7.939656
                                rmse
SGDRegressor                2.643687
KNeighborsRegressor         2.964260
PassiveAggressiveRegressor  3.466936
ExtraTreesRegressor         3.641744
AdaBoostRegressor           3.694199
RandomForestRegressor       4.514321
ExtraTreeRegressor          4.559779
BaggingRegressor            4.669502
GradientBoostingRegressor   4.789215
DecisionTreeRegressor       4.903349
TheilSenRegressor           7.713475




In [525]:
xgboost_model = xgb.XGBRegressor(learning_rate=0.05
                                 , max_depth=12, n_estimators=10, alpha=10, objective ='reg:linear', colsample_bytree = 0.3
                                )
xgboost_model.fit(X_train,Y_train)
y_pred = xgboost_model.predict(X_test)
print('rmse',mean_squared_error(Y_test,y_pred)**0.5)

rmse 2.0664659783768795


  if getattr(data, 'base', None) is not None and \


In [526]:
xgb_predictions = xgboost_model.predict(test_x.as_matrix())
xgb_predictions[xgb_predictions<0]=0
xgb_predictions

  """Entry point for launching an IPython kernel.


array([0.52853477, 0.5889439 , 0.71982557, 0.52202225, 0.863444  ,
       1.0173299 , 1.0812002 , 0.63105184, 0.9593348 , 0.6659932 ,
       3.3110316 , 2.5654776 , 2.0610678 , 1.539451  , 2.6463323 ,
       1.8040519 , 0.87598026, 0.70934516, 0.9403275 , 0.63006604,
       0.90972054, 0.890044  , 0.58369195, 0.7236235 , 0.7236235 ,
       0.7385099 , 0.86111873, 0.74947464, 1.0979857 , 0.9506979 ,
       1.4629277 , 0.9411022 , 2.64835   , 0.7889987 , 0.96297383,
       0.7485833 , 0.67181635, 0.83631015, 1.0519717 , 2.8399775 ,
       0.79259014, 0.7398387 , 1.0070097 , 0.67181635, 1.1784291 ,
       2.9788709 , 0.6509014 , 1.0296044 , 2.561041  , 1.7969348 ,
       0.91622424, 0.70863783, 0.51809007, 1.037403  , 0.94044626,
       1.9613305 , 2.8308995 , 0.96630543, 1.4831593 , 0.74061334,
       1.0239747 , 1.478186  , 0.7554635 , 0.8022088 , 0.67181635,
       0.9347919 , 3.3079462 , 1.0233188 , 1.5981816 , 0.7754525 ,
       0.70863783, 1.3884393 , 1.0173299 , 2.8952239 , 0.75546

In [527]:
print(train_y.mean(), xgb_predictions.mean())

1.9119469026637157 1.1919639


In [528]:
classifier=KNeighborsRegressor()
classifier.fit(X_train,Y_train)
predictions=classifier.predict(X_test)
predictions[predictions<0]=0
result.append(rmse(predictions,Y_test))

In [529]:
#mm_scaler.fit_transform(test_x)
knn_predictions=classifier.predict(test_x)
knn_predictions[knn_predictions<0]=0
knn_predictions

array([2.55666667, 2.55666667, 2.45666667, 1.50333333, 0.17      ,
       2.55666667, 2.55666667, 0.        , 2.55666667, 0.        ,
       2.22      , 1.60333333, 2.55666667, 0.17      , 0.17      ,
       0.84333333, 2.55666667, 2.55666667, 2.55666667, 2.55666667,
       1.60333333, 2.55666667, 2.55666667, 2.55666667, 1.60333333,
       2.45666667, 2.22      , 2.55666667, 2.55666667, 2.55666667,
       2.55666667, 0.17      , 2.55666667, 2.55666667, 1.67333333,
       2.55666667, 0.17      , 0.17      , 2.55666667, 0.        ,
       0.17      , 0.17      , 2.41666667, 2.55666667, 2.55666667,
       2.55666667, 2.55666667, 1.60333333, 0.17      , 0.20333333,
       0.44      , 0.17      , 2.55666667, 2.55666667, 0.17      ,
       1.60333333, 0.17      , 2.45666667, 0.44      , 2.55666667,
       2.55666667, 2.55666667, 1.60333333, 1.77333333, 1.60333333,
       2.55666667, 0.44      , 0.95333333, 0.95333333, 1.60333333,
       0.95333333, 0.17      , 0.17      , 0.17      , 2.55666

In [530]:
print(train_y.mean(),knn_predictions.mean())

1.9119469026637157 1.5514444445155557


In [531]:
classifier=SGDRegressor()
classifier.fit(X_train,Y_train)
predictions=classifier.predict(X_test)
predictions[predictions<0]=0
result.append(rmse(predictions,Y_test))
print(rmse(predictions,Y_test))
print(predictions)

2.3734287894361112
[4.27530163 5.6433622  5.19072769 0.         2.89905499 1.76906698
 4.66190683 4.52551112 2.93863559 5.36440687 1.7899387  1.57734231
 0.         0.48691779 0.48761997 0.         0.         3.53199547
 0.         5.41290869 1.37809278 4.0977998  5.14179296 1.98220272
 0.16746216 0.63185649 1.4324926  0.         0.         0.8529277
 0.84558243 1.85093784 2.82895046 0.         0.         3.5886966
 0.         0.72923574 2.5515111  0.         0.81256729 4.01665507
 0.         4.69670655 1.43475714 0.52248278 0.         4.05910213
 0.36042722 0.92555129 0.         0.12252747 0.         3.17670482
 0.         0.16177152 2.42195845 1.21136212 2.55878774 0.
 0.         2.57662833 0.         0.         0.         0.
 3.12772154 0.        ]




In [532]:
mm_scaler.fit_transform(test_x)
sgd_predictions=classifier.predict(test_x.as_matrix())
#sgd_predictions[sgd_predictions<0]=0
sgd_predictions

  return self.partial_fit(X, y)
  


array([-2715.24199827, -3022.41982244, -2143.40307537, -2945.78823108,
       -4029.32886674, -2287.02233826, -2474.21511097, -2751.90050381,
       -3005.78495302, -2382.52944158, -2765.91846727, -3213.88652673,
       -2365.03070353, -2729.20081335, -3512.30595261, -1865.55673621,
       -2168.66534324, -2365.71581621, -2349.75885206, -2613.40045087,
       -2479.2340391 , -2906.85125775, -3145.13963465, -2475.52806218,
       -2077.49893002, -1967.72826352, -2723.00774885, -2453.25042539,
       -2981.31968391, -2655.81503028, -2780.83799735, -8851.81491048,
       -2486.07300318, -2430.37267482, -3140.67140508, -2794.99712692,
       -2621.80436391, -4095.82761283, -2337.53236028, -2735.96048473,
       -3783.93224287, -3120.38623765, -3953.5015825 , -2151.45545208,
       -2454.93077459, -2524.08757007, -2758.55685931, -2160.68593934,
       -3642.28232022, -2473.59866386, -1778.74187958, -3229.44468693,
       -3409.74014751, -2618.20971823, -2826.18531407, -2157.74592835,
      

In [533]:
def rfr_model(X, y, max_depth_gsc):
    # Perform Grid-Search
    gsc = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid={
            'max_depth': range(3, max_depth_gsc),
            'n_estimators': (10, 50, 100, 200, 500),
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

    grid_result = gsc.fit(X, y)
    best_params = grid_result.best_params_

    rfr = RandomForestRegressor(max_depth=best_params["max_depth"], n_estimators=best_params["n_estimators"],
                                random_state=False, verbose=False, max_features="sqrt")

    rfr.fit(X, y)

    return rfr

In [534]:
max_depth_gsc = 15

rfr = rfr_model(train_x, train_y, max_depth_gsc)

rfr_predictions = rfr.predict(test_x)
rfr_predictions[rfr_predictions<0]=0
rfr_predictions



array([1.46255235, 1.56689362, 1.32813173, 1.41136458, 1.84347705,
       2.14151231, 2.22229841, 1.35620486, 3.42834337, 2.12095056,
       3.62730313, 4.8261431 , 2.03698355, 1.63641687, 2.40290283,
       1.59215923, 2.91935136, 2.79419979, 2.82282914, 2.11528737,
       2.27773331, 1.62281789, 1.80087146, 1.48966172, 1.44023693,
       1.88797679, 1.39939021, 1.91970438, 2.60255404, 1.46375733,
       1.84524866, 1.8046242 , 1.58237135, 1.85412552, 1.69640837,
       1.51737939, 1.84013821, 2.39962972, 2.82282914, 3.65001915,
       1.55280201, 2.3993972 , 3.20018493, 1.46881103, 2.62263262,
       1.58321151, 1.66594897, 2.32079363, 3.23447833, 1.92341628,
       2.40411676, 1.31327391, 1.92688841, 2.75703048, 2.13799069,
       2.53719379, 2.34081226, 1.77890513, 1.43065523, 1.35677358,
       1.56938366, 1.4759314 , 1.42715525, 3.54614433, 1.35266981,
       1.84470466, 1.69743791, 2.06972191, 1.90419992, 1.32973118,
       2.66807116, 1.54692108, 1.80275574, 2.8775408 , 1.86691

In [535]:
print(train_y.mean(),rfr_predictions.mean())

1.9119469026637157 2.0643455595724225


In [536]:
predictions=xgb_predictions

with open('sample_submission.csv', 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['id','playtime_forever'])
    id = 0
    for prediction in predictions:
        spamwriter.writerow([id, prediction])
        id+=1