In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn import tree

In [2]:
# read csv
# 直接把目前用不掉的幾個 column drop 掉
store = pd.read_csv('googleplaystore.csv').drop(['Size', 'Current Ver', 'Android Ver'], axis=1)
store.describe(include='all')

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Price,Content Rating,Genres,Last Updated
count,10841,10841,9367.0,10841.0,10841,10840,10841.0,10840,10841,10841
unique,9660,34,,6002.0,22,3,93.0,6,120,1378
top,ROBLOX,FAMILY,,0.0,"1,000,000+",Free,0.0,Everyone,Tools,"August 3, 2018"
freq,9,1972,,596.0,1579,10039,10040.0,8714,842,326
mean,,,4.193338,,,,,,,
std,,,0.537431,,,,,,,
min,,,1.0,,,,,,,
25%,,,4.0,,,,,,,
50%,,,4.3,,,,,,,
75%,,,4.5,,,,,,,


In [3]:
# find dirty 'Type'
print(store['Type'].unique())
store.loc[(store['Type']!='Free') & (store['Type']!='Paid')].assign()

['Free' 'Paid' nan '0']


Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Price,Content Rating,Genres,Last Updated
9148,Command & Conquer: Rivals,FAMILY,,0,0,,0,Everyone 10+,Strategy,"June 28, 2018"
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,Free,0.0,Everyone,,"February 11, 2018",1.0.19


In [4]:
# clean up dirty 'Type'
## 'Type' is free if 'Price' is 0
for x in store.loc[store['Type'].isna()].index:
    if(store.loc[x,'Price']==0):
        store.loc[x,'Type']='Free'
    #elif(store.iloc[x]['Price'].isna()):
    #    store = store.drop[x]
    else:
        store.loc[x,'Type']='Paid'
        
## 這筆資料有問題，直接丟掉
for x in store.loc[store['Type']=='0'].index:
    store = store.drop([x])
    
# check dirty 'Type'
print(store['Type'].unique())

['Free' 'Paid']


In [5]:
store = store.dropna()

In [6]:
store.describe(include='all')

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Price,Content Rating,Genres,Last Updated
count,9366,9366,9366.0,9366.0,9366,9366,9366.0,9366,9366,9366
unique,8196,33,,5992.0,19,2,73.0,6,115,1300
top,ROBLOX,FAMILY,,2.0,"1,000,000+",Free,0.0,Everyone,Tools,"August 3, 2018"
freq,9,1747,,83.0,1577,8719,8719.0,7420,733,319
mean,,,4.191757,,,,,,,
std,,,0.515219,,,,,,,
min,,,1.0,,,,,,,
25%,,,4.0,,,,,,,
50%,,,4.3,,,,,,,
75%,,,4.5,,,,,,,


In [7]:
print(store['Installs'].unique())

['10,000+' '500,000+' '5,000,000+' '50,000,000+' '100,000+' '50,000+'
 '1,000,000+' '10,000,000+' '5,000+' '100,000,000+' '1,000,000,000+'
 '1,000+' '500,000,000+' '100+' '500+' '10+' '5+' '50+' '1+']


In [8]:
install_mapping = {}
for x_num in store['Installs'].unique():
    number = x_num.strip('+').replace(',', '')
    number = int(number)
    install_mapping[x_num] = number

for x in install_mapping:
    install_mapping[x]
    
print(install_mapping)

{'5,000,000+': 5000000, '10+': 10, '5,000+': 5000, '50+': 50, '10,000,000+': 10000000, '1+': 1, '100+': 100, '1,000,000,000+': 1000000000, '1,000,000+': 1000000, '500,000+': 500000, '500+': 500, '100,000+': 100000, '100,000,000+': 100000000, '50,000,000+': 50000000, '10,000+': 10000, '500,000,000+': 500000000, '1,000+': 1000, '5+': 5, '50,000+': 50000}


In [9]:
from statistics import mode
from sklearn import tree
from sklearn.model_selection import KFold, train_test_split

In [10]:
store['Price'] = np.array(list(float(value.strip().strip('$')) for value in store['Price']))
         
#for index, value in enumerate(store.loc[:,'Price']):
#    store.loc[index, 'Price'] = float(value.strip().strip('$'))
#store = store.dropna()

In [11]:
data = store.iloc[:, 2:4]
data['Price'] = store.Price
print(data['Price'].unique())
targ = store['Installs']
train_d, test_d, train_t, test_t = train_test_split(data, targ, test_size=0.3)


[  0.     4.99   3.99   6.99   7.99   5.99   2.99   3.49   1.99   9.99
   7.49   0.99   9.     5.49  10.    24.99  11.99  79.99  16.99  14.99
  29.99  12.99   2.49  10.99   1.5   19.99  15.99  33.99  39.99   3.95
   4.49   1.7    8.99   1.49   3.88 399.99  17.99 400.     3.02   1.76
   4.84   4.77   1.61   2.5    1.59   6.49   1.29 299.99 379.99  37.99
  18.99 389.99   8.49   1.75  14.     2.     3.08   2.59  19.4    3.9
   4.59  15.46   3.04  13.99   4.29   3.28   4.6    1.     2.95   2.9
   1.97   2.56   1.2 ]


In [12]:
answer_list = list(sorted(store['Installs'].unique(), key=lambda x: install_mapping[x]))
class_id = list(x for x in range(len(answer_list)))
print(class_id)

from collections import OrderedDict
dic = OrderedDict(zip(answer_list, class_id))
print(dic)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
OrderedDict([('1+', 0), ('5+', 1), ('10+', 2), ('50+', 3), ('100+', 4), ('500+', 5), ('1,000+', 6), ('5,000+', 7), ('10,000+', 8), ('50,000+', 9), ('100,000+', 10), ('500,000+', 11), ('1,000,000+', 12), ('5,000,000+', 13), ('10,000,000+', 14), ('50,000,000+', 15), ('100,000,000+', 16), ('500,000,000+', 17), ('1,000,000,000+', 18)])


In [13]:
train_tid, test_tid = [], []
#print(train_t.values[0])
for i in range(len(train_t)):
    train_tid.append( dic[ train_t.values[i] ] )

for i in range(len(test_t)):
    test_tid.append( dic[ test_t.values[i] ] )
    
    
#print(train_tid)

In [14]:
# select two features per tree
ord = list( [y, x] for x in range(3) for y in reversed(range(x)))
#print(ord)

In [15]:
# Resubstitution
rf_rs = []
scr_rs = [0] * len(ord)

# training
for i in range(len(ord)):
    dtr = tree.DecisionTreeClassifier()
    dtr = dtr.fit(train_d.iloc[ : , ord[i] ], train_tid)
    # validation
    scr_rs[i] = dtr.score(train_d.iloc[ : , ord[i] ], train_tid)
    rf_rs.append(dtr)
    
# testing
pr_rs = []
for i in range(len(ord)):
    pred = rf_rs[i].predict( test_d.iloc[ : , ord[i] ] )
    for j in range(len(pred)):
        try: pr_rs[j].append( pred[j] )
        except: pr_rs.append( [ pred[j] ] )
            
final_pred = []
for i in range(len(pr_rs)):
    try: final_pred.append(mode(pr_rs[i]))
    except: final_pred.append('')
# print(final_pred)

# confusion matrix
confmx = pd.DataFrame( data=np.zeros((len(class_id), len(class_id)), dtype=int), columns=dic.keys(), index=dic.keys() )
for i in range(len(final_pred)):
    try: confmx.iloc[ test_tid[i], final_pred[i] ] += 1
    except: continue
print(confmx)

# precision/recall
prmx = pd.DataFrame( data=np.zeros((len(class_id), 2), dtype=int), columns=['precision', 'recall'], index=dic.keys() )
for i in range(len(prmx.index)):
    prmx.iloc[i, 0] = confmx.iloc[i, i] / sum(confmx.iloc[ : , i ]) # precision
    prmx.iloc[i, 1] = confmx.iloc[i, i] / sum(confmx.iloc[ i, : ]) # recall
print(prmx)
prmx.assign()

                1+  5+  10+  50+  100+  500+  1,000+  5,000+  10,000+  \
1+               0   0    0    0     0     0       0       0        0   
5+               0   0    0    0     3     0       0       0        0   
10+              0   0    4    0    12     0       1       0        0   
50+              0   0    3    1     8     0       1       0        0   
100+             0   0    8    0    40     1      25       1        0   
500+             0   0    1    0    15     1      23       2        2   
1,000+           0   0    1    0    15     0      81       7       16   
5,000+           0   0    0    0     1     0      30      13       25   
10,000+          0   0    0    0     0     0      25       9      100   
50,000+          0   0    0    0     0     0       2       1       39   
100,000+         0   0    0    0     0     0       1       0       25   
500,000+         0   0    0    0     0     0       0       0        6   
1,000,000+       0   0    0    0     0     0       



Unnamed: 0,precision,recall
1+,,
5+,,0.0
10+,0.235294,0.235294
50+,1.0,0.076923
100+,0.425532,0.533333
500+,0.5,0.022727
"1,000+",0.428571,0.675
"5,000+",0.393939,0.185714
"10,000+",0.465116,0.641026
"50,000+",0.318182,0.17284
