In [84]:
import pandas as pd
import numpy as np
from sklearn import tree, metrics, linear_model
from sklearn.model_selection import train_test_split
from sklearn.tree.export import export_text
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures

In [85]:
def classify(x):
    if x<0.05:
        return 0
    elif x<10:
        return 1
    else:
        return 2

In [86]:
def preprocess(filename):
    df = pd.read_csv(filename,parse_dates = ['purchase_date'])
    df['play'] = df.playtime_forever.apply(classify)
    df.eval('total_reviews = total_positive_reviews + total_negative_reviews ' , inplace=True)
    df.eval('positive_review_rate = total_positive_reviews / total_reviews ' , inplace=True)
    df['release_date'] = pd.to_datetime(df['release_date'])
    def extract_data(df, column):
        df[column+"year"] = df[column].apply(lambda x:x.year)
        df[column+'month'] = df[column].apply(lambda x: x.month)
    extract_data(df, 'purchase_date')
    extract_data(df, 'release_date')
    df['time_span'] = df.purchase_date-df.release_date
    missing_df = df.isnull().sum(axis=0).reset_index()
    missing_df.columns = ['variable', 'missing values']
    missing_df['filling factor (%)']=(df.shape[0]-missing_df['missing values'])/df.shape[0]*100
    missing_df.sort_values('filling factor (%)').reset_index(drop = True)
    df.fillna(1,inplace = True)
    df['time_span'] = df['time_span'].astype('timedelta64[D]').astype(int) 
    c = df["genres"].str.get_dummies(",") 
    df = pd.concat([df,c],axis=1,ignore_index=False)
    return df

In [87]:
df = preprocess('train.csv')
list_train = df.columns.values.tolist()
print(list_train)

['id', 'playtime_forever', 'is_free', 'price', 'genres', 'categories', 'tags', 'purchase_date', 'release_date', 'total_positive_reviews', 'total_negative_reviews', 'play', 'total_reviews', 'positive_review_rate', 'purchase_dateyear', 'purchase_datemonth', 'release_dateyear', 'release_datemonth', 'time_span', 'Action', 'Adventure', 'Animation & Modeling', 'Audio Production', 'Casual', 'Design & Illustration', 'Early Access', 'Free to Play', 'Gore', 'Indie', 'Massively Multiplayer', 'Nudity', 'RPG', 'Racing', 'Sexual Content', 'Simulation', 'Sports', 'Strategy', 'Utilities', 'Violent']




In [88]:
# list_train = df.columns.values.tolist()
del list_train[0:2],list_train[2:10]

In [89]:
X = np.array(df[list_train])
Y = np.array(df['play'])
Y = Y.reshape(len(Y), 1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

In [90]:
m = 0
for i in range(1,10):
    for j in range(1,10):
        clf = tree.DecisionTreeClassifier(max_depth=i,max_features = j,random_state=1)
        clf = clf.fit(X_train, Y_train)
        result = clf.predict(X_test)
#        print(result)
#        print(Y_test)
        s = 0
        for k in range(len(result)):
            if result[k] == Y_test[k]:
                s+=1
        r = s/len(result)
#        print(r)
        if r>m:
            m = r
            i_best = i
            j_best = j
print(m)
print(i_best)
print(j_best)

0.6296296296296297
6
7


In [91]:
clf = tree.DecisionTreeClassifier(max_depth=i_best,max_features = j_best,random_state=1)
clf = clf.fit(X_train, Y_train)
result = clf.predict(X_test)

In [92]:
r = export_text(clf, feature_names=list_train)
print(r)

|--- total_reviews <= 4486.00
|   |--- RPG <= 0.50
|   |   |--- purchase_dateyear <= 2017.50
|   |   |   |--- Adventure <= 0.50
|   |   |   |   |--- purchase_dateyear <= 2015.50
|   |   |   |   |   |--- release_datemonth <= 9.00
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- release_datemonth >  9.00
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- purchase_dateyear >  2015.50
|   |   |   |   |   |--- release_dateyear <= 2013.50
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- release_dateyear >  2013.50
|   |   |   |   |   |   |--- class: 1
|   |   |   |--- Adventure >  0.50
|   |   |   |   |--- time_span <= 1484.50
|   |   |   |   |   |--- positive_review_rate <= 0.76
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- positive_review_rate >  0.76
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- time_span >  1484.50
|   |   |   |   |   |--- class: 0
|   |   |--- purchase_dateyear >  2017.50
|   |   |   |--- Simulation <= 0.

In [93]:
s = 0
for i in range(len(result)):
    if result[i] == Y_test[i]:
        s+=1
print(s/len(result))

0.6296296296296297


In [94]:
df_new1 = df.loc[df["play"] == 1]
# c = df_new1["genres"].str.get_dummies(",") 
# df_new1 = pd.concat([df_new1,c],axis=1,ignore_index=False)
# list = df_new1.columns.values.tolist()
# del list[0:2],list[2:10]

In [95]:
X = np.array(df_new1[list_train])
Y = np.array(df_new1['playtime_forever'])
Y = Y.reshape(len(Y), 1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
score_min = 10000
for pol_order in range(1, 3):
    for alpha in range(0, 20, 2):
        ridgereg = Ridge(alpha = alpha/10, normalize=True)
        poly = PolynomialFeatures(degree = pol_order)
        regr = linear_model.LinearRegression()
        X_ = poly.fit_transform(X_train)
        ridgereg.fit(X_, Y_train)
        
        X_ = poly.fit_transform(X_test)
        result = ridgereg.predict(X_)
        score = metrics.mean_squared_error(result, Y_test)
        
        if score < score_min:
            score_min = score
            parameters = [alpha, pol_order]

        print("n={} alpha={} , MSE = {:<0.5}".format(pol_order, alpha/10, score))



n=1 alpha=0.0 , MSE = 2.7456e+05
n=1 alpha=0.2 , MSE = 7513.7
n=1 alpha=0.4 , MSE = 4834.4
n=1 alpha=0.6 , MSE = 3483.8
n=1 alpha=0.8 , MSE = 2678.0
n=1 alpha=1.0 , MSE = 2145.9
n=1 alpha=1.2 , MSE = 1770.2
n=1 alpha=1.4 , MSE = 1491.9
n=1 alpha=1.6 , MSE = 1278.6
n=1 alpha=1.8 , MSE = 1110.4
n=2 alpha=0.0 , MSE = 2.1712e+06
n=2 alpha=0.2 , MSE = 3591.1
n=2 alpha=0.4 , MSE = 2482.8
n=2 alpha=0.6 , MSE = 1927.8
n=2 alpha=0.8 , MSE = 1576.4
n=2 alpha=1.0 , MSE = 1328.8
n=2 alpha=1.2 , MSE = 1143.4
n=2 alpha=1.4 , MSE = 999.06
n=2 alpha=1.6 , MSE = 883.45
n=2 alpha=1.8 , MSE = 788.89


In [96]:
ridgereg1 = Ridge(alpha = parameters[0], normalize=True)
poly1 = PolynomialFeatures(degree = parameters[1])
X_ = poly1.fit_transform(X)
ridgereg1.fit(X_, Y)
result = ridgereg1.predict(X_)
score = metrics.mean_squared_error(result, Y,multioutput='raw_values')
print(score)

[3.36041666]


In [97]:
len(X_)

164

In [98]:
df_new2 = df.loc[df["play"] == 2]
# c = df_new2["genres"].str.get_dummies(",") 
# df_new2 = pd.concat([df_new2,c],axis=1,ignore_index=False)
# list = df_new2.columns.values.tolist()
# del list[0:2],list[2:12]
X = np.array(df_new2[list_train])
Y = np.array(df_new2['playtime_forever'])
Y = Y.reshape(len(Y), 1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
score_min = 10000
for pol_order in range(1, 3):
    for alpha in range(0, 20, 2):
        ridgereg = Ridge(alpha = alpha/10, normalize=True)
        poly = PolynomialFeatures(degree = pol_order)
        regr = linear_model.LinearRegression()
        X_ = poly.fit_transform(X_train)
        ridgereg.fit(X_, Y_train)
        
        X_ = poly.fit_transform(X_test)
        result = ridgereg.predict(X_)
        score = metrics.mean_squared_error(result, Y_test)
        
        if score < score_min:
            score_min = score
            parameters = [alpha, pol_order]

        print("n={} alpha={} , MSE = {:<0.5}".format(pol_order, alpha/10, score))

n=1 alpha=0.0 , MSE = 1.6323e+04
n=1 alpha=0.2 , MSE = 530.07
n=1 alpha=0.4 , MSE = 369.01
n=1 alpha=0.6 , MSE = 350.31
n=1 alpha=0.8 , MSE = 362.44
n=1 alpha=1.0 , MSE = 382.2
n=1 alpha=1.2 , MSE = 402.85
n=1 alpha=1.4 , MSE = 422.25
n=1 alpha=1.6 , MSE = 439.81
n=1 alpha=1.8 , MSE = 455.45
n=2 alpha=0.0 , MSE = 2381.0




n=2 alpha=0.2 , MSE = 2132.4
n=2 alpha=0.4 , MSE = 1928.0
n=2 alpha=0.6 , MSE = 1755.8
n=2 alpha=0.8 , MSE = 1608.5
n=2 alpha=1.0 , MSE = 1481.1
n=2 alpha=1.2 , MSE = 1369.9
n=2 alpha=1.4 , MSE = 1272.1
n=2 alpha=1.6 , MSE = 1185.7
n=2 alpha=1.8 , MSE = 1108.9


In [99]:
ridgereg2 = Ridge(alpha = parameters[0], normalize=True)
poly2 = PolynomialFeatures(degree = parameters[1])
X_ = poly2.fit_transform(X)
ridgereg2.fit(X_, Y)
result = ridgereg2.predict(X_)
score = metrics.mean_squared_error(result, Y)
print(score)

621.5641361799593


In [100]:
def preprocess_test(filename):
    df = pd.read_csv(filename,parse_dates = ['purchase_date'])
    df.eval('total_reviews = total_positive_reviews + total_negative_reviews ' , inplace=True)
    df.eval('positive_review_rate = total_positive_reviews / total_reviews ' , inplace=True)
    df['release_date'] = pd.to_datetime(df['release_date'])
    def extract_data(df, column):
        df[column+"year"] = df[column].apply(lambda x:x.year)
        df[column+'month'] = df[column].apply(lambda x: x.month)
    extract_data(df, 'purchase_date')
    extract_data(df, 'release_date')
    df['time_span'] = df.purchase_date-df.release_date
    missing_df = df.isnull().sum(axis=0).reset_index()
    missing_df.columns = ['variable', 'missing values']
    missing_df['filling factor (%)']=(df.shape[0]-missing_df['missing values'])/df.shape[0]*100
    missing_df.sort_values('filling factor (%)').reset_index(drop = True)
    df.fillna(1,inplace = True)
    df['time_span'] = df['time_span'].astype('timedelta64[D]').astype(int) 
    c = df["genres"].str.get_dummies(",") 
    df = pd.concat([df,c],axis=1,ignore_index=False)
    return df

In [101]:
df = preprocess_test('test.csv')
list = df.columns.values.tolist()
del list[0],list[2:9]
print(list)

['is_free', 'price', 'total_reviews', 'positive_review_rate', 'purchase_dateyear', 'purchase_datemonth', 'release_dateyear', 'release_datemonth', 'time_span', 'Action', 'Adventure', 'Casual', 'Early Access', 'Free to Play', 'Gore', 'Indie', 'Massively Multiplayer', 'Nudity', 'RPG', 'Simulation', 'Sports', 'Strategy', 'Violent']


  app.launch_new_instance()


In [102]:
list_to_add = [a for a in list_train if a not in list]
print(list_to_add)

['Animation & Modeling', 'Audio Production', 'Design & Illustration', 'Racing', 'Sexual Content', 'Utilities']


In [103]:
for i in list_to_add:
    df[i] = 0

In [104]:
X = np.array(df[list_train])

In [105]:
df['playtime_forever'] = clf.predict(X)

In [106]:
df_poly1 = df.loc[df['playtime_forever'] == 1]
# c = df_new1["genres"].str.get_dummies(",") 
# df_new1 = pd.concat([df_new1,c],axis=1,ignore_index=False)
# list = df_new1.columns.values.tolist()
# del list[0:2],list[2:10]


In [107]:
df_poly1

Unnamed: 0,id,is_free,price,genres,categories,tags,purchase_date,release_date,total_positive_reviews,total_negative_reviews,...,Sports,Strategy,Violent,Animation & Modeling,Audio Production,Design & Illustration,Racing,Sexual Content,Utilities,playtime_forever
4,4,False,3400,"Action,Adventure","Single-player,Co-op,Steam Achievements,Full co...","Open World,Action,Comedy,Co-op,Third-Person Sh...",2017-02-24 00:00:00,2013-08-22,40344.0,3708.0,...,0,0,0,0,0,0,0,0,0,1
5,5,False,6800,"Adventure,Indie,RPG","Single-player,Steam Achievements,Full controll...","Indie,Adventure,RPG,Interactive Fiction,Story ...",2018-06-03 00:00:00,2018-02-28,387.0,81.0,...,0,0,0,0,0,0,0,0,0,1
12,12,False,4800,"Action,Adventure,Indie","Single-player,Steam Achievements,Full controll...","Survival,Adventure,Great Soundtrack,Indie,Craf...",1,2016-02-24,1.0,1.0,...,0,0,0,0,0,0,0,0,0,1
13,13,False,4200,"Adventure,Casual,Indie","Single-player,Steam Achievements,Full controll...","Adventure,First-Person,Parkour,Singleplayer,In...",2017-06-28 00:00:00,2014-05-28,9348.0,739.0,...,0,0,0,0,0,0,0,0,0,1
14,14,False,6800,Action,"Single-player,Steam Achievements,Full controll...","Post-apocalyptic,Atmospheric,FPS,Action,Story ...",2017-06-12 00:00:00,2014-08-27,21249.0,4852.0,...,0,0,0,0,0,0,0,0,0,1
15,15,True,0,"Massively Multiplayer,Simulation,Early Access","Single-player,Online Multi-Player,MMO,Online C...","Early Access,Early Access,VR,Simulation,Massiv...",2018-09-01 00:00:00,2017-04-12,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
17,17,False,7800,"Action,Casual,Indie","Single-player,Steam Achievements,Full controll...","Action,Great Soundtrack,Indie,Casual,On-Rails ...",2017-12-01 00:00:00,2017-08-09,572.0,54.0,...,0,0,0,0,0,0,0,0,0,1
18,18,False,6800,"Action,Adventure,Indie,RPG","Single-player,Steam Achievements,Full controll...","Underwater,Action,Adventure,RPG,Indie,Steampun...",2017-12-11 00:00:00,2017-02-02,530.0,277.0,...,0,0,0,0,0,0,0,0,0,1
23,23,False,7500,"Adventure,Casual,Indie,Simulation",Single-player,"VR,Simulation,Adventure,Casual,Indie,Underwate...",2017-11-19 00:00:00,2016-04-05,869.0,156.0,...,0,0,0,0,0,0,0,0,0,1
27,27,False,6800,"Adventure,Casual,Indie","Single-player,Steam Achievements,Steam Trading...","Adventure,Puzzle,Point & Click,Retro,1990's,Cu...",2017-10-25 00:00:00,2016-03-24,1476.0,81.0,...,0,0,0,0,0,0,0,0,0,1


In [108]:
X = np.array(df_poly1[list_train])

In [109]:
X_ = poly1.fit_transform(X)
df_poly1['playtime_forever'] = ridgereg1.predict(X_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [110]:
df_poly1.head(10)

Unnamed: 0,id,is_free,price,genres,categories,tags,purchase_date,release_date,total_positive_reviews,total_negative_reviews,...,Sports,Strategy,Violent,Animation & Modeling,Audio Production,Design & Illustration,Racing,Sexual Content,Utilities,playtime_forever
4,4,False,3400,"Action,Adventure","Single-player,Co-op,Steam Achievements,Full co...","Open World,Action,Comedy,Co-op,Third-Person Sh...",2017-02-24 00:00:00,2013-08-22,40344.0,3708.0,...,0,0,0,0,0,0,0,0,0,2.122789
5,5,False,6800,"Adventure,Indie,RPG","Single-player,Steam Achievements,Full controll...","Indie,Adventure,RPG,Interactive Fiction,Story ...",2018-06-03 00:00:00,2018-02-28,387.0,81.0,...,0,0,0,0,0,0,0,0,0,2.130266
12,12,False,4800,"Action,Adventure,Indie","Single-player,Steam Achievements,Full controll...","Survival,Adventure,Great Soundtrack,Indie,Craf...",1,2016-02-24,1.0,1.0,...,0,0,0,0,0,0,0,0,0,2.212423
13,13,False,4200,"Adventure,Casual,Indie","Single-player,Steam Achievements,Full controll...","Adventure,First-Person,Parkour,Singleplayer,In...",2017-06-28 00:00:00,2014-05-28,9348.0,739.0,...,0,0,0,0,0,0,0,0,0,1.710433
14,14,False,6800,Action,"Single-player,Steam Achievements,Full controll...","Post-apocalyptic,Atmospheric,FPS,Action,Story ...",2017-06-12 00:00:00,2014-08-27,21249.0,4852.0,...,0,0,0,0,0,0,0,0,0,1.890349
15,15,True,0,"Massively Multiplayer,Simulation,Early Access","Single-player,Online Multi-Player,MMO,Online C...","Early Access,Early Access,VR,Simulation,Massiv...",2018-09-01 00:00:00,2017-04-12,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1.353669
17,17,False,7800,"Action,Casual,Indie","Single-player,Steam Achievements,Full controll...","Action,Great Soundtrack,Indie,Casual,On-Rails ...",2017-12-01 00:00:00,2017-08-09,572.0,54.0,...,0,0,0,0,0,0,0,0,0,1.57633
18,18,False,6800,"Action,Adventure,Indie,RPG","Single-player,Steam Achievements,Full controll...","Underwater,Action,Adventure,RPG,Indie,Steampun...",2017-12-11 00:00:00,2017-02-02,530.0,277.0,...,0,0,0,0,0,0,0,0,0,2.153276
23,23,False,7500,"Adventure,Casual,Indie,Simulation",Single-player,"VR,Simulation,Adventure,Casual,Indie,Underwate...",2017-11-19 00:00:00,2016-04-05,869.0,156.0,...,0,0,0,0,0,0,0,0,0,1.419652
27,27,False,6800,"Adventure,Casual,Indie","Single-player,Steam Achievements,Steam Trading...","Adventure,Puzzle,Point & Click,Retro,1990's,Cu...",2017-10-25 00:00:00,2016-03-24,1476.0,81.0,...,0,0,0,0,0,0,0,0,0,1.766025


In [111]:
df_poly2 = df.loc[df['playtime_forever'] == 2]
X = np.array(df_poly2[list_train])
X_ = poly2.fit_transform(X)
df_poly2['playtime_forever'] = ridgereg2.predict(X_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [112]:
df_poly2.head(10)

Unnamed: 0,id,is_free,price,genres,categories,tags,purchase_date,release_date,total_positive_reviews,total_negative_reviews,...,Sports,Strategy,Violent,Animation & Modeling,Audio Production,Design & Illustration,Racing,Sexual Content,Utilities,playtime_forever
1,1,False,11600,"Action,Adventure,Strategy","Single-player,Multi-player,Online Multi-Player...","Mechs,Strategy,Turn-Based,Turn-Based Tactics,S...",2019-07-20 00:00:00,2018-04-24,5762.0,2235.0,...,0,1,0,0,0,0,0,0,0,36.021651
31,31,False,6800,RPG,"Single-player,Steam Achievements,Steam Trading...","Open World,RPG,Fantasy,Adventure,Dragons,Modda...",2016-11-05 00:00:00,2011-11-10,149032.0,9466.0,...,0,0,0,0,0,0,0,0,0,36.073362
71,71,False,8800,"Simulation,Strategy","Single-player,Steam Achievements,Steam Trading...","City Builder,Simulation,Building,Management,St...",2017-12-07 00:00:00,2015-03-10,56571.0,4989.0,...,0,1,0,0,0,0,0,0,0,42.440891
76,76,False,6400,"Action,RPG","Single-player,Co-op,Steam Achievements,Full co...","FPS,Co-op,Loot,RPG,Action,Shooter,Open World,F...",2017-03-17 00:00:00,2012-09-20,121862.0,4810.0,...,0,0,0,0,0,0,0,0,0,31.429756
88,88,True,0,"Action,Adventure,RPG,Simulation,Sports,Strategy","Single-player,Multi-player,Co-op,Online Co-op,...","VR,RPG,Action,Sports,Simulation,Adventure,Stra...",2017-11-19 00:00:00,2017-06-19,5.0,0.0,...,1,1,0,0,0,0,0,0,0,38.230379


In [113]:
df.playtime_forever[df['playtime_forever']==1]=df_poly1['playtime_forever']
df.playtime_forever[df['playtime_forever']==2]=df_poly2['playtime_forever']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [114]:
final = df[['id', 'playtime_forever']]

In [115]:
final

Unnamed: 0,id,playtime_forever
0,0,0.000000
1,1,36.021651
2,2,0.000000
3,3,0.000000
4,4,2.122789
5,5,2.130266
6,6,0.000000
7,7,0.000000
8,8,0.000000
9,9,0.000000


In [119]:
final.to_csv('samplesubmission.csv',index=False)