In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.
import warnings

# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Render at most 12 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 12)

# use this path to run on kaggle
# train_path = "../input/msbd5001-fall2019/train.csv"
# test_path = "../input/msbd5001-fall2019/test.csv"

# use this path to run on local
train_path, test_path = "../data/train.csv", "../data/test.csv"

In [None]:
# Read the training CSV and discard its 'id' column in one step, then display the result.
train = pd.read_csv(train_path).drop(columns=['id'])
train

In [None]:
# Keep the frame exactly as read (its 'id' column is used when the submission is written)
# and work on a separate frame without 'id'.
test_with_id = pd.read_csv(test_path)
test = test_with_id.drop(columns='id')

In [None]:
# Replace missing review counts with the mean of the same column within the same frame.
for frame in (train, test):
    for review_col in ('total_positive_reviews', 'total_negative_reviews'):
        frame[review_col] = frame[review_col].fillna(frame[review_col].mean())

# fill with the mode
train['purchase_date'] = train['purchase_date'].fillna("Jun 27, 2019")
test['purchase_date'] = test['purchase_date'].fillna("Oct 25, 2017")


In [None]:
# feature engineering: calculate waiting days and own days
#   waiting_days = purchase_date - release_date, in whole days
#   own_days     = reference date - purchase_date, in whole days
reference_date = pd.to_datetime('Dec 25, 2019')

for frame in (train, test):
    purchase = pd.to_datetime(frame['purchase_date'])
    release = pd.to_datetime(frame['release_date'])

    # because of the mode to fillna, some days maybe illegal, set them to 0.
    # Each feature is clamped on its own values: a negative own_days must zero own_days,
    # not waiting_days. clip() is used instead of `frame.col[mask] = 0`, which is chained
    # assignment and is not guaranteed to write back into the frame.
    frame['waiting_days'] = (purchase - release).dt.days.clip(lower=0)
    frame['own_days'] = (reference_date - purchase).dt.days.clip(lower=0)

In [None]:
# Display the training frame as it stands at this point in the notebook.
train

In [None]:
# for bool
# for bool: store the is_free flag as an integer column in both frames
for frame in (train, test):
    frame['is_free'] = frame['is_free'].astype(int)

In [None]:
# Drop the two raw date columns from both frames.
drop_columns = ['purchase_date', 'release_date']

train_rough, test_rough = (frame.drop(columns=drop_columns) for frame in (train, test))

In [None]:
# Remove the two training rows with the largest price.
top_price_rows = train_rough['price'].nlargest(2).index
train_rough = train_rough.drop(index=top_price_rows)

In [None]:
from sklearn.model_selection import train_test_split

# Target column. The frame handed to the splitter still contains it, so both feature
# splits carry 'playtime_forever' alongside the other columns.
y = train_rough.playtime_forever

# 80/20 split with a fixed seed.
train_X, val_X, train_y, val_y = train_test_split(
    train_rough, y, test_size=0.2, random_state=0
)

In [None]:
# Display the training split.
train_X

In [None]:
# split the words 
generes_list = train_X['genres'].str.split(',',expand=False).to_numpy()
categories_list = train_X['categories'].str.split(',',expand=False).to_numpy()
tags_list = train_X['tags'].str.split(',',expand=False).to_numpy()

generes_set = {item for i in generes_list for item in i}
categories_set = {item for i in categories_list for item in i}
tags_set = {item for i in tags_list for item in i}

generes_time_dict = {}
categories_time_dict = {}
tags_time_dict = {}

# generate the mean of time corresponding to different tags
for genere in generes_set:
    count = np.sum(train_X['genres'].str.contains(genere))
    generes_time_dict[genere] = np.sum(train_X[train_X['genres'].str.contains(genere)].playtime_forever) / count

for category in categories_set:
    count = np.sum(train_X['categories'].str.contains(category))
    categories_time_dict[category] = np.sum(train_X[train_X['categories'].str.contains(category)].playtime_forever) / count

for tag in tags_set:
    count = np.sum(train_X['tags'].str.contains(tag))
    tags_time_dict[tag] = np.sum(train_X[train_X['tags'].str.contains(tag)].playtime_forever) / count


In [None]:
def find_mean(current_dict, current_str):
    """Average the looked-up values of the comma-separated tokens in ``current_str``.

    Parameters
    ----------
    current_dict : dict
        Maps a token to a number.
    current_str : str
        Comma-separated tokens; tokens absent from ``current_dict`` are ignored.

    Returns
    -------
    float
        Mean of the values of the tokens found in ``current_dict``, or 0.0 when none
        of the tokens is known.
    """
    known_values = [
        current_dict[item] for item in current_str.split(',') if item in current_dict
    ]
    # Only fall back to 0 when nothing matched. Seeding the list with 0 would divide
    # the sum by (n + 1) and shrink every result, more so for rows with few tokens.
    if not known_values:
        return 0.0
    return np.mean(known_values)

In [None]:
# Display the genre -> mean playtime lookup.
generes_time_dict

In [None]:
# test_X is the same object as test_rough, so the new columns appear on both names.
test_X = test_rough

# (source column, lookup built from the training split), in the order the new
# '<column>_time_mean' columns are appended to every frame.
time_mean_sources = (
    ('genres', generes_time_dict),
    ('categories', categories_time_dict),
    ('tags', tags_time_dict),
)

for frame in (train_X, val_X, test_X):
    for source_col, lookup in time_mean_sources:
        # Bind `lookup` as a default so each lambda keeps its own dictionary.
        frame[source_col + '_time_mean'] = frame[source_col].apply(
            lambda raw, lookup=lookup: find_mean(lookup, raw)
        )

In [None]:
# Number of comma-separated entries per row; '<column>_count' columns are appended in
# the order categories, tags, genres.
for source_col in ('categories', 'tags', 'genres'):
    for frame in (train_X, val_X, test_X):
        # One more entry than there are commas.
        frame[source_col + '_count'] = frame[source_col].apply(lambda raw: raw.count(',') + 1)

In [None]:
# Drop every object-dtype column from the three frames.
train_X, val_X, test_X = (
    frame.select_dtypes(exclude=['object']) for frame in (train_X, val_X, test_X)
)

In [None]:
# Summary statistics of the training split.
train_X.describe()

In [None]:
# Summary statistics of the test features, for comparison with the training split.
test_X.describe()

In [None]:
# The target must not remain among the model inputs.
target_column = ['playtime_forever']
train_X, val_X = (frame.drop(columns=target_column) for frame in (train_X, val_X))

In [None]:
# norm it
train_x_norm = (train_X - train_X.min()) / (train_X.max() - train_X.min())
val_x_norm = (val_X - val_X.min()) / (val_X.max() - val_X.min())
test_x_norm = (test_X - test_X.min()) / (test_X.max() - test_X.min())

In [None]:
# Summary statistics of test_X (the unscaled frame; the scaled values live in test_x_norm).
test_X.describe()

In [None]:
# Display the scaled test features.
test_x_norm

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

# Fit a 20-neighbour regressor on the scaled training features.
knn_model = KNeighborsRegressor(n_neighbors=20, leaf_size=15, p=4)
knn_model.fit(train_x_norm, train_y)

# Print the RMSE on the training split first, then on the validation split.
# knn_rough_preds ends up holding the validation predictions.
for features, target in ((train_x_norm, train_y), (val_x_norm, val_y)):
    knn_rough_preds = knn_model.predict(features)
    print(sqrt(mean_squared_error(target, knn_rough_preds)))

In [None]:
# Predict on the scaled test features and pair each prediction with the id read from
# the original test file.
test_rough_predict = knn_model.predict(test_x_norm)
output = pd.DataFrame({
    "id": test_with_id.id,
    "playtime_forever": test_rough_predict,
})

# Write the submission file without the index column.
output.to_csv("knn_submission.csv", index=False)