In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor


In [3]:
df_full = pd.read_csv('Archive/Google-Playstore.csv')
df_full.head()

Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,Price,...,Developer Id,Developer Website,Developer Email,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice
0,HTTrack Website Copier,com.httrack.android,Communication,3.6,2848.0,"100,000+",100000.0,351560,True,0.0,...,Xavier Roche,http://www.httrack.com/,roche+android@httrack.com,"Aug 12, 2013","May 20, 2017",Everyone,http://android.httrack.com/privacy-policy.html,False,False,False
1,World War 2: Offline Strategy,com.skizze.wwii,Strategy,4.3,17297.0,"1,000,000+",1000000.0,2161778,True,0.0,...,Skizze Games,http://stereo7.com/,Skizze.Games@gmail.com,"Jul 19, 2018","Nov 26, 2020",Everyone 10+,https://www.iubenda.com/privacy-policy/8032781,True,True,False
2,WPSApp,com.themausoft.wpsapp,Tools,4.2,488639.0,"50,000,000+",50000000.0,79304739,True,0.0,...,TheMauSoft,http://www.themausoft.com,wpsapp.app@gmail.com,"Mar 7, 2016","Oct 21, 2020",Everyone,https://sites.google.com/view/wpsapppolicy/main,True,False,False
3,"OfficeSuite - Office, PDF, Word, Excel, PowerP...",com.mobisystems.office,Business,4.2,1224420.0,"100,000,000+",100000000.0,163660067,True,0.0,...,MobiSystems,http://www.mobisystems.com,support-officesuite-android@mobisystems.com,"Dec 22, 2011","Nov 23, 2020",Everyone,http://www.mobisystems.com/mobile/privacy-poli...,True,True,False
4,Loud Player Free,com.arthelion.loudplayer,Music & Audio,4.2,665.0,"50,000+",50000.0,73463,True,0.0,...,Arthelion92,http://www.arthelion.com,arthelion92@gmail.com,"Sep 24, 2016","Nov 22, 2020",Everyone,http://www.arthelion.com/index.php/fr/android-...,False,False,False


In [4]:
df_full.columns

Index(['App Name', 'App Id', 'Category', 'Rating', 'Rating Count', 'Installs',
       'Minimum Installs', 'Maximum Installs', 'Free', 'Price', 'Currency',
       'Size', 'Minimum Android', 'Developer Id', 'Developer Website',
       'Developer Email', 'Released', 'Last Updated', 'Content Rating',
       'Privacy Policy', 'Ad Supported', 'In App Purchases', 'Editors Choice'],
      dtype='object')

In [5]:
df_full.shape

(1118136, 23)

In [6]:
df_full.describe()

Unnamed: 0,Rating,Rating Count,Minimum Installs,Maximum Installs,Price
count,1111286.0,1111286.0,1117975.0,1118136.0,1118136.0
mean,2.490334,5159.633,313643.2,544453.4,0.2050728
std,2.053973,272409.4,20439410.0,30310580.0,3.541011
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,100.0,160.0,0.0
50%,3.6,11.0,1000.0,1719.0,0.0
75%,4.3,100.0,10000.0,19116.0,0.0
max,5.0,125380800.0,10000000000.0,10772700000.0,400.0


In [7]:
print('Missing Values %')
print("-"*25)
print(round(df_full.isnull().sum()/df_full.shape[0]*100,2))

Missing Values %
-------------------------
App Name              0.00
App Id                0.00
Category              0.00
Rating                0.61
Rating Count          0.61
Installs              0.01
Minimum Installs      0.01
Maximum Installs      0.00
Free                  0.00
Price                 0.00
Currency              0.01
Size                  0.00
Minimum Android       0.18
Developer Id          0.00
Developer Website    37.06
Developer Email       0.00
Released              0.69
Last Updated          0.00
Content Rating        0.00
Privacy Policy       13.73
Ad Supported          0.00
In App Purchases      0.00
Editors Choice        0.00
dtype: float64


In [8]:
df_full.isnull().sum()

App Name                  1
App Id                    0
Category                  3
Rating                 6850
Rating Count           6850
Installs                161
Minimum Installs        161
Maximum Installs          0
Free                      0
Price                     0
Currency                161
Size                      0
Minimum Android        2013
Developer Id              2
Developer Website    414366
Developer Email          22
Released               7730
Last Updated              0
Content Rating            0
Privacy Policy       153524
Ad Supported              0
In App Purchases          0
Editors Choice            0
dtype: int64

###### We see above that, Except for Developer Website & Privacy Policy......we can just directly remove all rows having null values of remaining columns.

#### Transforming the Developer Website and Privacy Policy Columns

In [9]:
#So first replace its non-null values by 1.
df_full.loc[~df_full['Developer Website'].isnull(), 'Developer Website'] = 1
df_full.loc[~df_full['Privacy Policy'].isnull(), 'Privacy Policy'] = 1

#And also we will replace the Null values of Developer Website & Privacy Policy with 0.
df_full['Developer Website'] = df_full['Developer Website'].fillna(0)
df_full['Privacy Policy'] = df_full['Privacy Policy'].fillna(0)

print("After Replacing non-null values:")
df_full.head()

After Replacing non-null values:


Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,Price,...,Developer Id,Developer Website,Developer Email,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice
0,HTTrack Website Copier,com.httrack.android,Communication,3.6,2848.0,"100,000+",100000.0,351560,True,0.0,...,Xavier Roche,1,roche+android@httrack.com,"Aug 12, 2013","May 20, 2017",Everyone,1,False,False,False
1,World War 2: Offline Strategy,com.skizze.wwii,Strategy,4.3,17297.0,"1,000,000+",1000000.0,2161778,True,0.0,...,Skizze Games,1,Skizze.Games@gmail.com,"Jul 19, 2018","Nov 26, 2020",Everyone 10+,1,True,True,False
2,WPSApp,com.themausoft.wpsapp,Tools,4.2,488639.0,"50,000,000+",50000000.0,79304739,True,0.0,...,TheMauSoft,1,wpsapp.app@gmail.com,"Mar 7, 2016","Oct 21, 2020",Everyone,1,True,False,False
3,"OfficeSuite - Office, PDF, Word, Excel, PowerP...",com.mobisystems.office,Business,4.2,1224420.0,"100,000,000+",100000000.0,163660067,True,0.0,...,MobiSystems,1,support-officesuite-android@mobisystems.com,"Dec 22, 2011","Nov 23, 2020",Everyone,1,True,True,False
4,Loud Player Free,com.arthelion.loudplayer,Music & Audio,4.2,665.0,"50,000+",50000.0,73463,True,0.0,...,Arthelion92,1,arthelion92@gmail.com,"Sep 24, 2016","Nov 22, 2020",Everyone,1,False,False,False


In [10]:
print(df_full.isnull().sum())

App Name                1
App Id                  0
Category                3
Rating               6850
Rating Count         6850
Installs              161
Minimum Installs      161
Maximum Installs        0
Free                    0
Price                   0
Currency              161
Size                    0
Minimum Android      2013
Developer Id            2
Developer Website       0
Developer Email        22
Released             7730
Last Updated            0
Content Rating          0
Privacy Policy          0
Ad Supported            0
In App Purchases        0
Editors Choice          0
dtype: int64


#### Now, lets drop all rows having null values.

In [11]:
df_full = df_full.dropna()

#### So we see that almost 10000 rows are dropped.

In [12]:
df_full.nunique()

App Name             1037544
App Id               1108302
Category                  48
Rating                    42
Rating Count           34744
Installs                  22
Minimum Installs          22
Maximum Installs      206335
Free                       2
Price                    702
Currency                  54
Size                    1481
Minimum Android          137
Developer Id          164538
Developer Website          2
Developer Email       259091
Released                3963
Last Updated            3679
Content Rating             6
Privacy Policy             2
Ad Supported               2
In App Purchases           2
Editors Choice             2
dtype: int64

In [13]:
#Count of each unique value in a column
print(df_full['Content Rating'].value_counts())

Everyone           959868
Teen                98860
Mature 17+          31721
Everyone 10+        17735
Unrated                66
Adults only 18+        52
Name: Content Rating, dtype: int64


In [14]:
df_full[['Installs', 'Minimum Installs', 'Maximum Installs']].head()

Unnamed: 0,Installs,Minimum Installs,Maximum Installs
0,"100,000+",100000.0,351560
1,"1,000,000+",1000000.0,2161778
2,"50,000,000+",50000000.0,79304739
3,"100,000,000+",100000000.0,163660067
4,"50,000+",50000.0,73463


#### So, we see that both the Minimum Installs and Installs are useless columns and we can drop both of them.

#### App Id, Currency, Developer Id, Developer Email also are very specific. So we can remove them too.

In [15]:
df_full.drop(columns = ['Minimum Installs', 'Installs', 'App Id', 'Currency', 'Developer Id', 'Developer Email'], axis = 1, inplace = True)

In [16]:
df_full.nunique()

App Name             1037544
Category                  48
Rating                    42
Rating Count           34744
Maximum Installs      206335
Free                       2
Price                    702
Size                    1481
Minimum Android          137
Developer Website          2
Released                3963
Last Updated            3679
Content Rating             6
Privacy Policy             2
Ad Supported               2
In App Purchases           2
Editors Choice             2
dtype: int64

In [17]:
df_full.head()

Unnamed: 0,App Name,Category,Rating,Rating Count,Maximum Installs,Free,Price,Size,Minimum Android,Developer Website,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice
0,HTTrack Website Copier,Communication,3.6,2848.0,351560,True,0.0,2.7M,2.3 and up,1,"Aug 12, 2013","May 20, 2017",Everyone,1,False,False,False
1,World War 2: Offline Strategy,Strategy,4.3,17297.0,2161778,True,0.0,86M,5.1 and up,1,"Jul 19, 2018","Nov 26, 2020",Everyone 10+,1,True,True,False
2,WPSApp,Tools,4.2,488639.0,79304739,True,0.0,5.8M,4.1 and up,1,"Mar 7, 2016","Oct 21, 2020",Everyone,1,True,False,False
3,"OfficeSuite - Office, PDF, Word, Excel, PowerP...",Business,4.2,1224420.0,163660067,True,0.0,59M,4.4 and up,1,"Dec 22, 2011","Nov 23, 2020",Everyone,1,True,True,False
4,Loud Player Free,Music & Audio,4.2,665.0,73463,True,0.0,29M,5.0 and up,1,"Sep 24, 2016","Nov 22, 2020",Everyone,1,False,False,False


#### We should also convert True/False to 1/0 for better use in Modelling. Also, Rating Count can be converted to int.

In [18]:
#True/False mapping to 1/0
df_full["Free"] = df_full["Free"].astype(int)
df_full["Ad Supported"] = df_full["Ad Supported"].astype(int)
df_full["In App Purchases"] = df_full["In App Purchases"].astype(int)
df_full["Editors Choice"] = df_full["Editors Choice"].astype(int)

In [19]:
#Float Rating Count to Integer
df_full["Rating Count"] = df_full["Rating Count"].astype(int)

In [20]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1108302 entries, 0 to 1118135
Data columns (total 17 columns):
App Name             1108302 non-null object
Category             1108302 non-null object
Rating               1108302 non-null float64
Rating Count         1108302 non-null int32
Maximum Installs     1108302 non-null int64
Free                 1108302 non-null int32
Price                1108302 non-null float64
Size                 1108302 non-null object
Minimum Android      1108302 non-null object
Developer Website    1108302 non-null int64
Released             1108302 non-null object
Last Updated         1108302 non-null object
Content Rating       1108302 non-null object
Privacy Policy       1108302 non-null int64
Ad Supported         1108302 non-null int32
In App Purchases     1108302 non-null int32
Editors Choice       1108302 non-null int32
dtypes: float64(2), int32(5), int64(3), object(7)
memory usage: 131.1+ MB


#### Resetting the index after dropping the rows of the Dataframe.

In [21]:
df_full.reset_index(drop=True, inplace=True)

In [22]:
df_full.tail()

Unnamed: 0,App Name,Category,Rating,Rating Count,Maximum Installs,Free,Price,Size,Minimum Android,Developer Website,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice
1108297,Safer Job Sites,Productivity,0.0,0,5,1,0.0,4.3M,5.0 and up,1,"Aug 21, 2019","Dec 05, 2019",Everyone,1,0,0,0
1108298,AutoThink,Productivity,0.0,0,127,1,0.0,4.7M,4.1 and up,1,"Nov 26, 2019","Nov 13, 2020",Everyone,1,0,0,0
1108299,FieldEnable,Business,0.0,0,282,1,0.0,28M,5.0 and up,1,"Apr 30, 2018","Nov 30, 2020",Everyone,1,0,0,0
1108300,Live Concert,Events,0.0,0,986,1,0.0,8.1M,4.1 and up,1,"Dec 20, 2017","Dec 20, 2017",Everyone,1,0,0,0
1108301,Rajput India,Social,4.6,81,18892,1,0.0,8.8M,4.1 and up,1,"Jun 15, 2017","Sep 27, 2018",Teen,0,1,0,0


In [23]:
'''
unique = set()
for i in range(df_full.shape[0]):
    char = df_full.loc[i, 'Size'][-1]
    unique.add(char)
print(unique)
# {'e', 'k', 'G', 'M'}


n = 0
for i in range(df_full.shape[0]):
    char = df_full.loc[i, 'Size'][-1]
    if char == 'G':
        print(df_full.loc[i, 'Size'])
        n += 1
print(n)
'''

# e => 30600 apps    Varies with Devine
# k => 19000
# M => 105000
# G => 1

"\nunique = set()\nfor i in range(df_full.shape[0]):\n    char = df_full.loc[i, 'Size'][-1]\n    unique.add(char)\nprint(unique)\n# {'e', 'k', 'G', 'M'}\n\n\nn = 0\nfor i in range(df_full.shape[0]):\n    char = df_full.loc[i, 'Size'][-1]\n    if char == 'G':\n        print(df_full.loc[i, 'Size'])\n        n += 1\nprint(n)\n"

In [24]:
#Taking too much time to convert.
#So, Drop for now
df_full.drop(columns=['Size'], axis=1, inplace=True)

'''
for i in range(df_full.shape[0]):
    if i%1000 == 0:
        print(i)
    
    if isinstance(df_full.loc[i, 'Size'], float):
            continue
    
    char = df_full.loc[i, 'Size'][-1]
    
    if char == 'M':
        size_in_mb = round(float(df_full.loc[i, 'Size'][:-1].replace(',', '')), 2)
        df_full.loc[i, 'Size'] = size_in_mb
    
    if char == 'k':
        size_in_mb = round(float(df_full.loc[i, 'Size'][:-1].replace(',', '')) / 1024, 2)
        df_full.loc[i, 'Size'] = size_in_mb
    
    if char == 'G':
        size_in_mb = round(float(df_full.loc[i, 'Size'][:-1]) * 1024, 2)
        df_full.loc[i, 'Size'] = size_in_mb

'''

"\nfor i in range(df_full.shape[0]):\n    if i%1000 == 0:\n        print(i)\n    \n    if isinstance(df_full.loc[i, 'Size'], float):\n            continue\n    \n    char = df_full.loc[i, 'Size'][-1]\n    \n    if char == 'M':\n        size_in_mb = round(float(df_full.loc[i, 'Size'][:-1].replace(',', '')), 2)\n        df_full.loc[i, 'Size'] = size_in_mb\n    \n    if char == 'k':\n        size_in_mb = round(float(df_full.loc[i, 'Size'][:-1].replace(',', '')) / 1024, 2)\n        df_full.loc[i, 'Size'] = size_in_mb\n    \n    if char == 'G':\n        size_in_mb = round(float(df_full.loc[i, 'Size'][:-1]) * 1024, 2)\n        df_full.loc[i, 'Size'] = size_in_mb\n\n"

In [25]:
df_full.head()

Unnamed: 0,App Name,Category,Rating,Rating Count,Maximum Installs,Free,Price,Minimum Android,Developer Website,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice
0,HTTrack Website Copier,Communication,3.6,2848,351560,1,0.0,2.3 and up,1,"Aug 12, 2013","May 20, 2017",Everyone,1,0,0,0
1,World War 2: Offline Strategy,Strategy,4.3,17297,2161778,1,0.0,5.1 and up,1,"Jul 19, 2018","Nov 26, 2020",Everyone 10+,1,1,1,0
2,WPSApp,Tools,4.2,488639,79304739,1,0.0,4.1 and up,1,"Mar 7, 2016","Oct 21, 2020",Everyone,1,1,0,0
3,"OfficeSuite - Office, PDF, Word, Excel, PowerP...",Business,4.2,1224420,163660067,1,0.0,4.4 and up,1,"Dec 22, 2011","Nov 23, 2020",Everyone,1,1,1,0
4,Loud Player Free,Music & Audio,4.2,665,73463,1,0.0,5.0 and up,1,"Sep 24, 2016","Nov 22, 2020",Everyone,1,0,0,0


In [26]:
'''
for i in range(df_full.shape[0]):
    if i%1000 == 0:
        print(i)
    
    if isinstance(df_full.loc[i, 'Released'], int):
        continue
    df_full.loc[0, 'Released'] = int(df_full.loc[0, 'Released'][-4:])
    
    if isinstance(df_full.loc[i, 'Last Updated'], int):
        continue
    df_full.loc[0, 'Last Updated'] = int(df_full.loc[0, 'Last Updated'][-4:])
'''
#Not very much related, drop the column as it anyways has less value.
df_full.drop(columns=['Released', 'Last Updated'], axis = 1, inplace=True)

In [27]:
print(df_full['Minimum Android'].unique())
#Too varied, drop the column as unusable in any way.

df_full.drop(columns=['Minimum Android'], axis = 1, inplace=True)

['2.3 and up' '5.1 and up' '4.1 and up' '4.4 and up' '5.0 and up'
 '6.0 and up' '4.2 and up' '4.0 and up' 'Varies with device' '8.0 and up'
 '7.0 and up' '3.2 and up' '4.3 and up' '4.0.3 and up' '2.2 and up'
 '2.1 and up' '7.1 and up' '4.4W and up' '2.3.3 and up' '3.0 and up'
 '1.6 and up' '4.1 - 8.0' '2.0 and up' '2.0.1 and up' '3.1 and up'
 '1.5 and up' '4.1 - 7.0' '1.0 and up' '4.4 - 5.1' '2.3 - 4.4'
 '2.3 - 4.4W' '1.1 and up' '5.0 - 6.0' '2.2 - 4.3' '2.3 - 5.1'
 '4.0.3 - 8.0' '4.2 - 7.1.1' '4.3 - 4.4W' '4.0 - 4.4' '4.1 - 4.4'
 '4.1 - 6.0' '8.0' '2.1 - 5.0' '4.1 - 4.3' '1.5 - 2.1' '4.1 - 5.1' '7.0'
 '2.3 - 3.2' '4.4' '2.3.3 - 6.0' '4.1 - 7.1.1' '4.0 - 4.4W' '4.1 - 4.4W'
 '4.0.3 - 7.1.1' '2.2' '5.0 - 8.0' '3.0 - 5.0' '4.0 - 6.0' '4.4 - 7.1.1'
 '2.2 - 3.2' '2.3 - 4.0.2' '2.1 - 4.4' '2.2 - 4.0.4' '4.0 - 5.0'
 '6.0 - 7.1.1' '2.3.3 - 2.3.4' '2.3.3 - 4.3' '4.0 - 5.1' '4.4 - 7.0'
 '2.3 - 6.0' '2.3 - 7.1.1' '3.0 - 6.0' '2.3 - 5.0' '2.3 - 4.2.2'
 '4.0.3 - 4.4' '3.0 - 4.1.1' '4.2 - 8.0' '4.0 

In [28]:
df_full.head()

Unnamed: 0,App Name,Category,Rating,Rating Count,Maximum Installs,Free,Price,Developer Website,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice
0,HTTrack Website Copier,Communication,3.6,2848,351560,1,0.0,1,Everyone,1,0,0,0
1,World War 2: Offline Strategy,Strategy,4.3,17297,2161778,1,0.0,1,Everyone 10+,1,1,1,0
2,WPSApp,Tools,4.2,488639,79304739,1,0.0,1,Everyone,1,1,0,0
3,"OfficeSuite - Office, PDF, Word, Excel, PowerP...",Business,4.2,1224420,163660067,1,0.0,1,Everyone,1,1,1,0
4,Loud Player Free,Music & Audio,4.2,665,73463,1,0.0,1,Everyone,1,0,0,0


In [29]:
#No Duplicates
#df_full.drop_duplicates(inplace=True)

In [30]:
df_full.Category.value_counts()

Education                  114686
Music & Audio              104165
Entertainment               81762
Books & Reference           78676
Personalization             73230
Tools                       67945
Lifestyle                   54355
Business                    41706
Health & Fitness            31169
Productivity                29943
Photography                 28851
Travel & Local              25691
Finance                     24598
Puzzle                      24533
Food & Drink                24060
Sports                      21875
News & Magazines            21465
Shopping                    20287
Casual                      19915
Communication               18178
Arcade                      17299
Social                      16845
Simulation                  15044
Medical                     12409
Action                      12307
Art & Design                12253
Educational                 11262
Maps & Navigation           10325
Adventure                    9995
Video Players 

In [31]:
df_full.to_csv('cleaned_data.csv')

In [32]:
df_full_dummy = pd.get_dummies(df_full,columns=['Category','Content Rating'],drop_first=True)
# df_full_dummy = pd.get_dummies(df_full,columns=['Category'],drop_first=True)
print(df_full_dummy.shape)
df_full_dummy.head()

(1108302, 63)


Unnamed: 0,App Name,Rating,Rating Count,Maximum Installs,Free,Price,Developer Website,Privacy Policy,Ad Supported,In App Purchases,...,Category_Travel & Local,Category_Trivia,Category_Video Players & Editors,Category_Weather,Category_Word,Content Rating_Everyone,Content Rating_Everyone 10+,Content Rating_Mature 17+,Content Rating_Teen,Content Rating_Unrated
0,HTTrack Website Copier,3.6,2848,351560,1,0.0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,World War 2: Offline Strategy,4.3,17297,2161778,1,0.0,1,1,1,1,...,0,0,0,0,0,0,1,0,0,0
2,WPSApp,4.2,488639,79304739,1,0.0,1,1,1,0,...,0,0,0,0,0,1,0,0,0,0
3,"OfficeSuite - Office, PDF, Word, Excel, PowerP...",4.2,1224420,163660067,1,0.0,1,1,1,1,...,0,0,0,0,0,1,0,0,0,0
4,Loud Player Free,4.2,665,73463,1,0.0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [33]:
x_dummy = df_full_dummy.drop(columns=['Rating', 'App Name'],axis=1)
y_dummy = df_full_dummy.loc[:,'Rating']
print(x_dummy.shape, y_dummy.shape)
x_dummy.head()

(1108302, 61) (1108302,)


Unnamed: 0,Rating Count,Maximum Installs,Free,Price,Developer Website,Privacy Policy,Ad Supported,In App Purchases,Editors Choice,Category_Adventure,...,Category_Travel & Local,Category_Trivia,Category_Video Players & Editors,Category_Weather,Category_Word,Content Rating_Everyone,Content Rating_Everyone 10+,Content Rating_Mature 17+,Content Rating_Teen,Content Rating_Unrated
0,2848,351560,1,0.0,1,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,17297,2161778,1,0.0,1,1,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,488639,79304739,1,0.0,1,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,1224420,163660067,1,0.0,1,1,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,665,73463,1,0.0,1,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [34]:
x_train,x_test,y_train,y_test = train_test_split(x_dummy,y_dummy,test_size=0.25,random_state = 669)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(831226, 61) (831226,) (277076, 61) (277076,)


In [43]:
# Model traing and predicting
def dummy_model_building(model):
    
    model.fit(x_train,y_train)
    print('trained')
    train_score = model.score(x_train , y_train)
    test_score = model.score(x_test , y_test)
    predict = model.predict(x_test)

    print('Train Score on Dummy : {}'.format(train_score))
    print('Test Score on Dummy : {}'.format(test_score))
    print(f'MSE : {mean_squared_error(y_test, predict)}')
#   print(classification_report(y_test, predict))

    print('\n \n')

In [44]:
# DecisionTreeRegressor
dt = DecisionTreeRegressor()
dummy_model_building(dt)
print('\n')

#Tried hyperparameter tuning with max_depth and min_leaf_sample_weight but results remain similar.
#Seems a good model for now.

trained
Train Score on Dummy : 0.9996164528717875
Test Score on Dummy : 0.9017465723301161
MSE : 0.4145921937182097

 





In [45]:
dt.get_depth()

89

In [46]:
#Linear Regression
le = LinearRegression()
dummy_model_building(le)
print('\n')

#Fail Model.

trained
Train Score on Dummy : 0.07305277535349364
Test Score on Dummy : 0.07292027544403579
MSE : 3.9119247630395138

 





In [47]:
# Bagging Classifier
bc = BaggingRegressor()
dummy_model_building(bc)
print('\n')

trained
Train Score on Dummy : 0.9894162308035206
Test Score on Dummy : 0.9407525584963007
MSE : 0.25000172846630503

 





In [48]:
# Gradient Boosting
gc = GradientBoostingRegressor()
dummy_model_building(gc)
print('\n')

trained
Train Score on Dummy : 0.9490515514306204
Test Score on Dummy : 0.9493021965365458
MSE : 0.21392549912078715

 





In [49]:
# AdaBoosting
ac = AdaBoostRegressor()
dummy_model_building(ac)
print('\n')

trained
Train Score on Dummy : 0.9294090620364446
Test Score on Dummy : 0.9295932549507117
MSE : 0.2970897563046575

 



