In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings as wn
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
wn.filterwarnings('ignore')

# read the data
train = pd.read_csv(r'train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357 entries, 0 to 356
Data columns (total 11 columns):
id                        357 non-null int64
playtime_forever          357 non-null float64
is_free                   357 non-null bool
price                     357 non-null float64
genres                    357 non-null object
categories                357 non-null object
tags                      357 non-null object
purchase_date             355 non-null object
release_date              357 non-null object
total_positive_reviews    355 non-null float64
total_negative_reviews    355 non-null float64
dtypes: bool(1), float64(4), int64(1), object(5)
memory usage: 28.3+ KB


In [2]:
genres_map = {}
genres_list = []
for row in train['genres']:
    arr = row.split(',')
    for i in arr:
        if i not in genres_map:
            genres_map[i] = 1
            genres_list.append(i)
        else:
            genres_map[i] += 1

genres_map = sorted(genres_map.items(), key = lambda x: x[1], reverse = True)
print(genres_list)
# genres_map
print(len(genres_list))
for item in genres_list:
    train[item] = train['genres'].apply(lambda x: 1 if item in x else 0)
# train.info()

['Adventure', 'Casual', 'Indie', 'RPG', 'Action', 'Strategy', 'Simulation', 'Racing', 'Sports', 'Massively Multiplayer', 'Sexual Content', 'Violent', 'Free to Play', 'Early Access', 'Audio Production', 'Gore', 'Design & Illustration', 'Nudity', 'Animation & Modeling', 'Utilities']
20


In [3]:
# 是否有色情，暴力，血腥等成人元素
def is_adult(x):
    if 'Nudity' in x \
    or 'Sexual Content' in x \
    or 'Gore' in x\
    or 'Violent' in x:
        return 1
    else:
        return 0
    
train['adult_con'] = train['genres'].apply(is_adult)
train['adult_con'].value_counts()

0    350
1      7
Name: adult_con, dtype: int64

In [4]:
import math
def price_clean(x):
    if x == 0:
        return 0
    else:
        return math.log(x)

train['price'] = train['price'].apply(price_clean)
# print(train['price'])
# print(np.max(train['price']))

In [5]:
categories_map = {}
categories_list = []
for row in train['categories']:
    arr = row.split(',')
    for i in arr:
        if i not in categories_map:
            categories_map[i] = 1
            categories_list.append(i)
        else:
            categories_map[i] += 1

categories_map = sorted(categories_map.items(), key = lambda x: x[1], reverse = True)
print(categories_list)
# genres_map
# print(len(categories_list))
for item in categories_list:
    train[item] = train['categories'].apply(lambda x: 1 if item in x else 0)
    
# train.info()

['Single-player', 'Steam Trading Cards', 'Steam Cloud', 'Partial Controller Support', 'Full controller support', 'Multi-player', 'Steam Achievements', 'Steam Workshop', 'Co-op', 'Steam Leaderboards', 'Online Co-op', 'Local Co-op', 'Shared/Split Screen', 'Stats', 'Online Multi-Player', 'Cross-Platform Multiplayer', 'SteamVR Collectibles', 'Local Multi-Player', 'Remote Play on Phone', 'Remote Play on Tablet', 'Remote Play on TV', 'Valve Anti-Cheat enabled', 'Commentary available', 'Captions available', 'Includes level editor', 'In-App Purchases', 'VR Support', 'MMO', 'Includes Source SDK']


In [21]:
tags_map = {}
tags_list = []
for row in train['tags']:
    arr = row.split(',')
    for i in arr:
        if i not in tags_map:
            tags_map[i] = 1
            tags_list.append(i)
        else:
            tags_map[i] += 1

tags_map = sorted(tags_map.items(), key = lambda x: x[1], reverse = True)
# print(genres_list)
# genres_map
print(len(tags_list))
for item in tags_list:
    if item not in train:
        train[item] = train['tags'].apply(lambda x: 1 if item in x else 0)
    else:
        for index, row in train.iterrows():
            if row[item] != 1:
                if item in row['tags']:
                    row[item] = 1
                else:
                    row[item] = 0
            else:
                row[item] = 1
    
# train.info()

Story Rich
Atmospheric
Exploration
First-Person
Stealth
Choices Matter
Singleplayer
Walking Simulator
Short
Drama
Mod
Game Development
Point & Click
Comedy
Funny
Kickstarter
Great Soundtrack
Puzzle
Female Protagonist
2D
Masterpiece
Classic
Retro
Cult Classic
Crowdfunded
Episodic
Sci-fi
Medieval
Open World
Sandbox
Multiplayer
Moddable
Military
Horses
Realistic
Historical
Third Person
Hack and Slash
Fantasy
Tower Defense
Online Co-Op
Third-Person Shooter
Cartoony
Shooter
Controller
Survival
Local Co-Op
Split Screen
Souls-like
Difficult
Resource Management
Top-Down
1980s
Fast-Paced
Pixel Graphics
Psychedelic
Surreal
Addictive
Music
Top-Down Shooter
VR
Turn-Based
Anime
JRPG
Epic
Turn-Based Combat
Parody
Cats
Cute
FPS
Post-apocalyptic
Action RPG
Action-Adventure
Beat 'em up
Metroidvania
Spectacle fighter
3D Platformer
Character Action Game
Platformer
Old School
CRPG
Rogue-like
Dungeon Crawler
Rogue-lite
Replay Value
Side Scroller
Loot
Local Multiplayer
Family Friendly
4 Player Local
Co-op C

In [7]:
train['is_free'] = train['is_free'].apply(lambda x: 1 if x == True else 0)
one_hot_col = categories_list + tags_list + genres_list

In [8]:
def clean_pos(x):
    if x != x:
        return 0
    else:
        return x

def clean_neg(x):
    if x != x or x == 0:
        return 1
    else:
        return x
    
train['total_positive_reviews'] = train['total_positive_reviews'].apply(clean_pos)
train['total_negative_reviews'] = train['total_negative_reviews'].apply(clean_neg)
train['reviews_level'] = train['total_positive_reviews'] - train['total_negative_reviews']
print(train[train['playtime_forever'] > 0]['total_positive_reviews'].mean())
print(train[train['playtime_forever'] > 0]['total_negative_reviews'].mean())
print(train[train['playtime_forever'] == 0]['total_positive_reviews'].mean())
print(train[train['playtime_forever'] == 0]['total_negative_reviews'].mean())

23309.79292929293
5314.247474747474
5258.377358490566
859.6352201257862


In [9]:
# 根据评论反映游戏热度
def is_hot(x):
    if x >= 28000:
        return 2
    elif x >= 10000:
        return 1
    else:
        return 0

train['comment_total'] = train['total_positive_reviews'] + train['total_negative_reviews']
train['hot_level'] = train['comment_total'].apply(is_hot)

In [10]:
# 日期提取
import datetime
import time
date_map = {'Jul':'7', 'Jan':'1', 'Feb':'2', 'Mar':'3', 'Apr':'4', 'May':'5', \
            'Jun':'6', 'Aug':'8', 'Sep':'9', 'Oct':'10', 'Nov':'11', 'Dec':'12'}

def clean_buy_date(x):
    if x != x:
        return -1
    arr = x.split(' ')
    if len(arr) != 3:
        return -1
    return arr[2] + '-' + date_map[arr[0]]
    
def clean_realease_date(x):
    if x != x:
        return -1
    try:
        arr = x.split(',')
        if len(arr) != 2:
            return -1
        mon = -1
        for month in date_map:
            if month in arr[0]:
                mon = date_map[month]
        year = arr[1]
        return year + '-' + str(mon)
    except AttributeError:
        return -1

train['purchase_date_clean'] = train['purchase_date'].apply(clean_buy_date)
train['release_date_clean'] = train['release_date'].apply(clean_realease_date)
train['purchase_year'] = train['purchase_date_clean'].apply(lambda x: -1 if x == -1 else int(x.split('-')[0]))
train['purchase_month'] = train['purchase_date_clean'].apply(lambda x: -1 if x == -1 else int(x.split('-')[1]))
train['release_year'] = train['release_date_clean'].apply(lambda x: -1 if x == -1 else int(x.split('-')[0]))
train['release_month'] = train['release_date_clean'].apply(lambda x: -1 if x == -1 else int(x.split('-')[1]))

# train['purchase_month'].unique()
# plt.figure(figsize=(15, 8))
# month_pivot = train.pivot_table(index="purchase_month", values="playtime_forever", aggfunc=np.mean)
# sns.barplot(x=month_pivot.index, y=month_pivot.playtime_forever)
# plt.show()
def purchase_month_bouns(x):
    if x == 8 or x == 12:
        return 5
    elif x == 1:
        return 4
    elif x == 3 or x == 5 or x == 9 or x == 10:
        return 3
    elif x == -1 or x == 2 or x == 7 or x == 6 or x == 11:
        return 2
    return 1;
train['purchase_m_bonus'] = train['purchase_month'].apply(purchase_month_bouns)
train['purchase_m_bonus'].value_counts()

2    145
3    112
5     49
1     31
4     20
Name: purchase_m_bonus, dtype: int64

In [11]:
def purchase_year_bouns(x):
    if x == 2015 or x == 2017:
        return 4
    elif x == 2016:
        return 3
    elif x == -1 or x == 2018:
        return 2
    return 1;
train['purchase_y_bonus'] = train['purchase_year'].apply(purchase_year_bouns)
train['purchase_y_bonus'].value_counts()


# plt.figure(figsize=(15, 8))
# month_pivot = train.pivot_table(index="purchase_year", values="playtime_forever", aggfunc=np.mean)
# sns.barplot(x=month_pivot.index, y=month_pivot.playtime_forever)
# plt.show()

2    150
4    114
1     67
3     26
Name: purchase_y_bonus, dtype: int64

In [12]:
total_play_time_map = {}
total_play_cnt_map = {}

sum_time = 0
sum_cnt = 0
for buy_time in train['purchase_date_clean'].unique():
    total_play_time_map[buy_time] = train[train['purchase_date_clean'] == buy_time]['playtime_forever'].sum()
    total_play_cnt_map[buy_time] = len(train[train['purchase_date_clean'] == buy_time])
    sum_time += train[train['purchase_date_clean'] == buy_time]['playtime_forever'].sum()
    sum_cnt += len(train[train['purchase_date_clean'] == buy_time])

time_mean = sum_time/len(train['purchase_date_clean'].unique())
cnt_mean = sum_cnt/len(train['purchase_date_clean'].unique())
print(time_mean, cnt_mean)
def month_total_play_time(x):
    if x in total_play_time_map:
        return total_play_time_map[x]
    else:
        return time_mean
    
def month_total_play_time(x):
    if x in total_play_time_map:
        return total_play_cnt_map[x]
    else:
        return cnt_mean

train['month_total_play_time'] = train['purchase_date_clean'].apply(month_total_play_time)
train['month_total_play_cnt'] = train['purchase_date_clean'].apply(month_total_play_time)

28.552991452991453 9.153846153846153


In [13]:
# print(train['month_total_play_cnt'])
# # 游戏总时长统计
# for year in train['purchase_year'].unique():
#     tmp =  train[train['purchase_year'] == year]
#     for month in train['purchase_month'].unique():
#         print(year, '-', month, ': ', tmp[tmp['purchase_month'] == month]['playtime_forever'].value_counts())
#         print(year, '-', month, ': ', tmp[tmp['purchase_month'] == month]['playtime_forever'].mean())
#         print(year, '-', month, ': ', tmp[tmp['purchase_month'] == month]['playtime_forever'].sum())
#         print(' ')

In [14]:
def is_small(a, b):
    arr1 = str(a).split('-')
    arr2 = str(b).split('-')
    y1 = arr1[0]
    m1 = arr1[1]
    y2 = arr2[0]
    m2 = arr2[1]
    if y1 > y2:
        return 0
    elif y1 == y2:
        if m1 > m2:
            return 0
        else:
            return 1
    else:
        return 1
    
train['yugou'] = train.apply(lambda x: is_small(x.purchase_date_clean, x.release_date_clean), axis = 1)
train['yugou'].value_counts()

0    355
1      2
Name: yugou, dtype: int64

In [15]:
train['bonus'] = 0
train['bonus'].value_counts()
# 是否有加分项
def get_bonus(x):
    bou = 0
    if 'Adventure' in x:
        bou += 1
    if 'Indie' in x:
        bou += 1
    if 'RPG' in x:
        bou += 1
    if 'Strategy' in x:
        bou += 1
    if 'Action' in x:
        bou += 1
    if 'Massively Multiplayer' in x:
        bou += 1
    if 'Simulation' in x:
        bou += 1
    return bou
train['bonus'] = train['genres'].apply(get_bonus)
train['bonus'].value_counts()

2    140
3     93
1     81
4     26
5      6
0      6
6      5
Name: bonus, dtype: int64

In [16]:
train['categories_bonus'] = 0
# train['categories_bonus'].value_counts()
cate_bonus = {}
for cate in categories_list:
    tmp = round(train[train['categories'].str.contains(cate)]['playtime_forever'].mean())
    if tmp >= 15:
        cate_bonus[cate] = 3;
    elif tmp >= 5:
        cate_bonus[cate] = 2;
    elif tmp >= 4:
        cate_bonus[cate] = 1;
def categories_bonus(x):
    ca_bonus = 0
    for bn in cate_bonus:
        if bn in x:
            ca_bonus += cate_bonus[bn]
    return ca_bonus

train['categories_bonus'] = train['categories'].apply(categories_bonus)
train['categories_bonus'].value_counts()

3     73
1     53
0     42
2     34
5     28
4     21
7     20
9     15
11    15
10    11
8     10
13     8
6      8
12     6
14     2
16     2
17     2
19     2
23     1
15     1
18     1
21     1
28     1
Name: categories_bonus, dtype: int64

In [133]:
# Survival + Co-Op + 

In [17]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib

# final_train = pd.concat([\
#                          train[one_hot_col], \
#                          train[['total_positive_reviews', 'total_negative_reviews', 'reviews_level', 'is_free',\
#                                 'comment_total', 'hot_level', 'price', 'playtime_forever', 'adult_con']]\
#                         ], \
#                          axis=1, join='inner')

final_train = pd.concat([\
                         train[one_hot_col], \
                         train[['total_positive_reviews', 'total_negative_reviews', 'reviews_level', 'is_free',\
                                'comment_total', 'hot_level', 'price', 'playtime_forever', 'adult_con',\
                                'month_total_play_time', 'month_total_play_cnt', 'purchase_year', 'purchase_month',\
                                'release_year', 'release_month', 'bonus','categories_bonus','purchase_y_bonus',\
                                'purchase_m_bonus']]], \
                    axis=1, join='inner')



final_train.to_csv('final_train.csv', index=False)
train_x = final_train.drop(['playtime_forever'], axis=1).as_matrix()
train_y = final_train[['playtime_forever']].as_matrix()

rfr=RandomForestRegressor()
 
rfr.fit(train_x,train_y)   #训练数据


model_save_path = "./Model/train_model.m"
print("HR Model save...")

joblib.dump(rfr, model_save_path)
print("Model has been saved.")

HR Model save...
Model has been saved.


In [18]:
test = pd.read_csv(r'test.csv')
test['total_positive_reviews'] = test['total_positive_reviews'].apply(clean_pos)
test['total_negative_reviews'] = test['total_negative_reviews'].apply(clean_neg)
test['reviews_level'] = test['total_positive_reviews']/test['total_negative_reviews']

for item in genres_list:
    test[item] = test['genres'].apply(lambda x: 1 if item in x else 0)
    
for item in categories_list:
    test[item] = test['categories'].apply(lambda x: 1 if item in x else 0)
    
for item in tags_list:
    if item not in test:
        test[item] = test['tags'].apply(lambda x: 1 if item in x else 0)
    else:
        for index, row in test.iterrows():
            if row[item] != 1:
                if item in row['tags']:
                    row[item] = 1
                else:
                    row[item] = 0
            else:
                row[item] = 1
# 是否有色情，暴力，血腥等成人元素
def is_adult(x):
    if 'Nudity' in x \
    or 'Sexual Content' in x \
    or 'Gore' in x\
    or 'Violent' in x:
        return 1
    else:
        return 0
    
test['adult_con'] = test['genres'].apply(is_adult)
test['adult_con'].value_counts()
                
# 根据评论反映游戏热度
def is_hot(x):
    if x >= 28000:
        return 2
    elif x >= 10000:
        return 1
    else:
        return 0

test['comment_total'] = test['total_positive_reviews'] + test['total_negative_reviews']
test['hot_level'] = test['comment_total'].apply(is_hot)

# final_test = pd.concat([\
#                          test[one_hot_col], \
#                          test[['total_positive_reviews', 'total_negative_reviews', 'reviews_level', 'is_free','price', \
#                               'comment_total', 'hot_level', 'adult_con']]\
#                         ], \
#                          axis=1, join='inner')

test['purchase_date_clean'] = test['purchase_date'].apply(clean_buy_date)
test['release_date_clean'] = test['release_date'].apply(clean_realease_date)
test['purchase_year'] = test['purchase_date_clean'].apply(lambda x: -1 if x == -1 else int(x.split('-')[0]))
test['purchase_month'] = test['purchase_date_clean'].apply(lambda x: -1 if x == -1 else int(x.split('-')[1]))
test['release_year'] = test['release_date_clean'].apply(lambda x: -1 if x == -1 else int(x.split('-')[0]))
test['release_month'] = test['release_date_clean'].apply(lambda x: -1 if x == -1 else int(x.split('-')[1]))
test['purchase_y_bonus'] = test['purchase_year'].apply(purchase_year_bouns)
test['purchase_m_bonus'] = test['purchase_month'].apply(purchase_month_bouns)

print(time_mean, cnt_mean)

def month_total_play_time(x):
    if x in total_play_time_map:
        return total_play_time_map[x]
    else:
        return time_mean
    
def month_total_play_time(x):
    if x in total_play_time_map:
        return total_play_cnt_map[x]
    else:
        return cnt_mean

test['month_total_play_time'] = test['purchase_date_clean'].apply(month_total_play_time)
test['month_total_play_cnt'] = test['purchase_date_clean'].apply(month_total_play_time)

# # 是否提前预购
# def is_small(a, b):
#     arr1 = str(a).split('-')
#     arr2 = str(b).split('-')
#     y1 = arr1[0]
#     m1 = arr1[1]
#     y2 = arr2[0]
#     m2 = arr2[1]
#     if y1 > y2:
#         return 0
#     elif y1 == y2:
#         if m1 > m2:
#             return 0
#         else:
#             return 1
#     else:
#         return 1
    
# test['yugou'] = test.apply(lambda x: is_small(x.purchase_date_clean, x.release_date_clean), axis = 1)
# test['yugou'].value_counts()

test['bonus'] = 0
test['bonus'].value_counts()
# 是否有加分项
def get_bonus(x):
    bou = 0
    if 'Adventure' in x:
        bou += 1
    if 'Indie' in x:
        bou += 1
    if 'RPG' in x:
        bou += 1
    if 'Strategy' in x:
        bou += 1
    if 'Action' in x:
        bou += 1
    if 'Massively Multiplayer' in x:
        bou += 1
    if 'Simulation' in x:
        bou += 1
    return bou
test['bonus'] = test['genres'].apply(get_bonus)
test['bonus'].value_counts()

test['categories_bonus'] = 0
# train['categories_bonus'].value_counts()

def categories_bonus(x):
    ca_bonus = 0
    for bn in cate_bonus:
        if bn in x:
            ca_bonus += cate_bonus[bn]
    return ca_bonus

test['categories_bonus'] = test['categories'].apply(categories_bonus)
test['categories_bonus'].value_counts()


# 加上one-hot编码
for item in genres_list:
    test[item] = test['genres'].apply(lambda x: 1 if item in x else 0)
    
for item in categories_list:
    test[item] = test['categories'].apply(lambda x: 1 if item in x else 0)
    
for item in tags_list:
    if item not in test:
        test[item] = test['tags'].apply(lambda x: 1 if item in x else 0)
    else:
        for index, row in test.iterrows():
            if row[item] != 1:
                if item in row['tags']:
                    row[item] = 1
                else:
                    row[item] = 0
            else:
                row[item] = 1

# final_test = test[['total_positive_reviews','total_negative_reviews', 'reviews_level', 'is_free',\
#                     'comment_total', 'hot_level', 'price', 'adult_con', \
#                     'month_total_play_time', 'month_total_play_cnt', 'purchase_year', 'purchase_month', \
#                     'release_year', 'release_month', 'bonus']]

final_test = pd.concat([\
                         test[one_hot_col], \
                         test[['total_positive_reviews','total_negative_reviews', 'reviews_level', 'is_free',\
                                'comment_total', 'hot_level', 'price', 'adult_con', 'month_total_play_time', \
                               'month_total_play_cnt', 'purchase_year', 'purchase_month', \
                                'release_year', 'release_month', 'bonus','categories_bonus']]], axis=1, join='inner')


test_x = final_test.as_matrix()
test['playtime_forever'] = rfr.predict(test_x)

samplesubmission = pd.read_csv(r'samplesubmission.csv')
samplesubmission['playtime_forever'] = test['playtime_forever']
samplesubmission.to_csv('samplesubmission.csv', index=False)

28.552991452991453 9.153846153846153


ValueError: Number of features of the model must match the input. Model n_features is 379 and input n_features is 377 