In [1]:
#Import file & resources

import pandas as pd
import numpy as np


In [21]:
#Read file into df

df = pd.read_csv("Android apps.csv", low_memory=False)

#df


In [22]:
#Filter out non-game app data
print(f"size before filter: {len(df)}")
df = df.loc[df['game']==1]
print(f"size after filtering by game: {len(df)}")

#Filter out invalid data
df = df.loc[(df['numberreviews'] != 'error during scraping') & (df['numberreviews'] != 'rating disabled')]
print(f"size after filtering out invalid numberreviews values: {len(df)}")
#df

size before filter: 50217
size after filtering by game: 11109
size after filtering out invalid numberreviews values: 10621


In [23]:
#Keep features/columns

columns_keep = 'appname	fancyname	company	purchases	ads	sharesinfo	shareslocation	unrestrictedinternet	usersinteract	game	age_rating	Parentalguidance	Downloads	categorygame	low_price_item	top_price_item	price	paidapp	rating	numberreviews'.split('\t')

df = df[columns_keep]
#df

In [24]:
# Drop nan of numberreviews

df_notna = df[df['numberreviews'].notna()]
#df_notna

In [25]:
# Create function to replace specific string characters in numberreviews for conversion purposes

def CleanNumberString(numberString):
    #print(f"number to cleanse: {numberString}")
    cleanedString = numberString.replace(",","").replace("","").replace("k","").replace("+","").replace("m","")
    #print (f"cleaned string: {cleanedString}")
    return cleanedString

In [26]:
# Create function chained with CleanNumberString to convert numerical objects to numerical values

def CleanNumber(numberString):
    multiplier = 1

    if('k' in numberString):
        multiplier = 1000
    elif('m' in numberString):
        multiplier = 1000000
        
    cleanedString = float(CleanNumberString(numberString)) * multiplier    
    
    return cleanedString

In [27]:
# Create clean numerical values in string format in numberreviews column

df_notna["cleaned_numberReviews"] = df_notna["numberreviews"]
#df_notna
df_notna['cleaned_numberReviews'] = df_notna.apply(lambda row: CleanNumber(row['cleaned_numberReviews']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [28]:
# Double check whether function is working as planned

df_notna[df_notna['numberreviews'].str.contains("m")]

Unnamed: 0,appname,fancyname,company,purchases,ads,sharesinfo,shareslocation,unrestrictedinternet,usersinteract,game,...,Parentalguidance,Downloads,categorygame,low_price_item,top_price_item,price,paidapp,rating,numberreviews,cleaned_numberReviews
41728,com.gameloft.android.ANMP.GloftSIHM,MARVEL Spider-Man Unlimited,gameloft,,,,,,,1,...,0.0,,action,,,free,0.0,4.2,2.4m+,2400000.0
41771,com.gameloft.android.ANMP.GloftUOHM,UNO ? & Friends,gameloft,,,,,,,1,...,0.0,,card,,,free,0.0,4.1,1.69m+,1690000.0
42130,com.disney.WMWLite,Where's My Water? Free,disney,,,,,,,1,...,0.0,,puzzle,,,free,0.0,4.3,1.4m+,1400000.0
42131,com.pikpok.turbo,Turbo FAST,pikpok,,,,,,,1,...,0.0,,racing,,,free,0.0,4.3,1.34m+,1340000.0
42311,com.umonistudio.tile,Don't Tap The White Tile,cheetah games,,,,,,,1,...,0.0,,arcade,,,free,0.0,4.3,2.96m+,2960000.0
42594,com.nordcurrent.Games101,101-in-1 Games,nordcurrent,,,,,,,1,...,0.0,,arcade,,,free,0.0,4.4,1.1m+,1100000.0
42795,com.izmo.onlinekafatopu,Online Head Ball 3,masomo gaming,,,,,,,1,...,0.0,,sports,,,free,0.0,4.4,1.13m+,1130000.0
44077,mobi.MultiCraft,<U+25BA> MultiCraft <U+2015> Free Miner! <U+00...,multicraft official,,,,,,,1,...,0.0,,adventure,,,free,0.0,4.3,1.32m+,1320000.0
44209,com.activision.callofduty.heroes,Call of Duty?: Heroes,"activision publishing, inc.",,,,,,,1,...,0.0,,action,,,free,0.0,4.4,1.61m+,1610000.0
47013,com.telltalegames.minecraft100,Minecraft: Story Mode,telltale games,,,,,,,1,...,0.0,,adventure,,,free,0.0,4.3,1.11m+,1110000.0


In [29]:
# Attempt to cast cleansed column as int - should throw errors if cleansing is incomplete
df_notna.cleaned_numberReviews = df_notna.cleaned_numberReviews.astype(np.int64)
df_notna

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,appname,fancyname,company,purchases,ads,sharesinfo,shareslocation,unrestrictedinternet,usersinteract,game,...,Parentalguidance,Downloads,categorygame,low_price_item,top_price_item,price,paidapp,rating,numberreviews,cleaned_numberReviews
228,com.gamebasics.osm,Online Soccer Manager (OSM),gamebasics bv,1.0,1.0,0.0,0.0,0.0,1.0,1,...,0.0,10000000.0,sports,0.59,109.99,free,0.0,4.5,1656120,1656120
235,com.fgol.HungrySharkEvolution,Hungry Shark Evolution,ubisoft entertainment,1.0,1.0,0.0,0.0,0.0,0.0,1,...,0.0,100000000.0,arcade,2.29,109.99,free,0.0,4.5,6495234,6495234
246,com.zynga.livepoker,Zynga Poker ? Free Texas Holdem Online Card Games,zynga,1.0,1.0,1.0,0.0,0.0,1.0,1,...,0.0,50000000.0,casino,0.59,350.00,free,0.0,4.5,2315552,2315552
247,ee.dustland.android.dustlandsudoku,Sudoku - The Clean One,dustland design,1.0,1.0,0.0,0.0,0.0,0.0,1,...,0.0,1000000.0,puzzle,1.89,2.99,free,0.0,4.6,6639,6639
254,com.miniclip.eightballpool,8 Ball Pool,miniclip.com,1.0,1.0,0.0,0.0,0.0,0.0,1,...,0.0,500000000.0,sports,0.79,99.99,free,0.0,4.5,17053699,17053699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50172,com.ea.tetrisfree_row,TETRIS Blitz,electronic arts,1.0,0.0,0.0,0.0,0.0,0.0,1,...,0.0,10000000.0,puzzle,1.09,109.99,free,0.0,4.2,250381,250381
50188,com.worms2armageddon.apptnb,Worms 2: Armageddon,team 17 digital limited,0.0,0.0,0.0,0.0,0.0,0.0,1,...,0.0,1000000.0,strategy,,,4.49,1.0,3.9,61603,61603
50190,com.nianticlabs.hpwu.prod.ares,Harry Potter: Wizards Unite,"niantic, inc.",1.0,0.0,0.0,0.0,0.0,0.0,1,...,0.0,5000000.0,adventure,1.09,109.99,free,0.0,3.9,268773,268773
50192,com.daysofwonder.smallworld2_humble,Small World 2,no info,,,,,,,1,...,0.0,,board,,,free,0.0,4.3,5.66k+,5660


In [30]:
# Get distribution of number of reviews values

df_notna['cleaned_numberReviews'].describe()


count    1.052500e+04
mean     1.564649e+05
std      1.021994e+06
min      1.000000e+00
25%      1.010000e+03
50%      8.789000e+03
75%      5.307300e+04
max      5.020212e+07
Name: cleaned_numberReviews, dtype: float64

In [31]:
# Check lengths before/after removal of first quartile figures

print(len(df_notna))

df_notna = df_notna.loc[df_notna['cleaned_numberReviews'] >= 1000]
print(len(df_notna))

10525
7905


In [32]:
# Double check ratings data for any quirks

df_notna.rating.unique()

array(['4.5', '4.6', '3.4', '4.3', '4.2', '4.4', '3.8', '4.0', '4.7',
       '3.2', '3.5', '4.1', '3.7', '3.9', '3.6', '3.3', '4.8', '3.1',
       '3.0', '2.5', '2.9', '2.7', '2.6', '2.8', '2.3', '2.4', '2.2',
       '4.9', '1.7', '2.0', '1.9', '1.5', '2.1', '1.8', '1.3', '1.6'],
      dtype=object)

In [33]:
# Cast ratings as float type

df_notna.rating = df_notna.rating.astype(np.float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [16]:
# Double check latest dtypes on df

df_notna.dtypes

fancyname                 object
company                   object
purchases                float64
ads                      float64
sharesinfo               float64
shareslocation           float64
unrestrictedinternet     float64
usersinteract            float64
game                       int64
age_rating                object
Parentalguidance         float64
Downloads                float64
categorygame              object
low_price_item            object
top_price_item            object
price                     object
paidapp                  float64
rating                   float64
numberreviews             object
cleaned_numberReviews      int64
dtype: object

In [16]:
# Export clean csv for machine learning purposes

#df_notna.to_csv(".\cleaned.csv", index=False)

In [17]:
# Load Google play data
df_gp = pd.read_csv("Google-Playstore.csv", low_memory=False)

In [18]:
df_gp

Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,Price,...,Developer Id,Developer Website,Developer Email,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice
0,HTTrack Website Copier,com.httrack.android,Communication,3.6,2848.0,"100,000+",100000.0,351560,True,0.0,...,Xavier Roche,http://www.httrack.com/,roche+android@httrack.com,"Aug 12, 2013","May 20, 2017",Everyone,http://android.httrack.com/privacy-policy.html,False,False,False
1,World War 2: Offline Strategy,com.skizze.wwii,Strategy,4.3,17297.0,"1,000,000+",1000000.0,2161778,True,0.0,...,Skizze Games,http://stereo7.com/,Skizze.Games@gmail.com,"Jul 19, 2018","Nov 26, 2020",Everyone 10+,https://www.iubenda.com/privacy-policy/8032781,True,True,False
2,WPSApp,com.themausoft.wpsapp,Tools,4.2,488639.0,"50,000,000+",50000000.0,79304739,True,0.0,...,TheMauSoft,http://www.themausoft.com,wpsapp.app@gmail.com,"Mar 7, 2016","Oct 21, 2020",Everyone,https://sites.google.com/view/wpsapppolicy/main,True,False,False
3,"OfficeSuite - Office, PDF, Word, Excel, PowerP...",com.mobisystems.office,Business,4.2,1224420.0,"100,000,000+",100000000.0,163660067,True,0.0,...,MobiSystems,http://www.mobisystems.com,support-officesuite-android@mobisystems.com,"Dec 22, 2011","Nov 23, 2020",Everyone,http://www.mobisystems.com/mobile/privacy-poli...,True,True,False
4,Loud Player Free,com.arthelion.loudplayer,Music & Audio,4.2,665.0,"50,000+",50000.0,73463,True,0.0,...,Arthelion92,http://www.arthelion.com,arthelion92@gmail.com,"Sep 24, 2016","Nov 22, 2020",Everyone,http://www.arthelion.com/index.php/fr/android-...,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118131,FleetEnable,com.fleetenable.app,Productivity,,,10+,10.0,10,True,0.0,...,Imaginnovate,http://fleetenable.com/,krishna@imaginnovate.com,,"Jun 30, 2020",Everyone,http://fleetenable.com/,False,False,False
1118132,AutoThink,com.fleetenable.autothink,Productivity,0.0,0.0,100+,100.0,127,True,0.0,...,Imaginnovate,http://www.imaginnovate.com,krishna@imaginnovate.com,"Nov 26, 2019","Nov 13, 2020",Everyone,http://www.imaginnovate.com,False,False,False
1118133,FieldEnable,com.fieldenable.app.fieldenable,Business,0.0,0.0,100+,100.0,282,True,0.0,...,Imaginnovate,http://www.imaginnovate.com,FieldEnable@imaginnovate.com,"Apr 30, 2018","Nov 30, 2020",Everyone,http://tw.fieldenable.com/privacy,False,False,False
1118134,Live Concert,com.varbin.liveconcert,Events,0.0,0.0,500+,500.0,986,True,0.0,...,Varbin Softwares,http://varbin.com,varbinsoftware@gmail.com,"Dec 20, 2017","Dec 20, 2017",Everyone,http://varbin.com/live-concert-privacy-policy,False,False,False


In [36]:

df_merged = pd.merge(df_notna, df_gp[["App Id", "Released", "Last Updated"]], how='inner', left_on='appname', right_on='App Id')

In [46]:
df_merged = df_merged.drop("App Id", axis=1)

In [47]:
df_merged

Unnamed: 0,appname,fancyname,company,purchases,ads,sharesinfo,shareslocation,unrestrictedinternet,usersinteract,game,...,low_price_item,top_price_item,price,paidapp,rating,numberreviews,cleaned_numberReviews,Released,Last Updated,CompanyIsExperienced
0,com.gamebasics.osm,Online Soccer Manager (OSM),gamebasics bv,1.0,1.0,0.0,0.0,0.0,1.0,1,...,0.59,109.99,free,0.0,4.5,1656120,1656120,"Oct 4, 2012","Nov 26, 2020",False
1,com.fgol.HungrySharkEvolution,Hungry Shark Evolution,ubisoft entertainment,1.0,1.0,0.0,0.0,0.0,0.0,1,...,2.29,109.99,free,0.0,4.5,6495234,6495234,"Feb 22, 2013","Oct 27, 2020",True
2,com.zynga.livepoker,Zynga Poker ? Free Texas Holdem Online Card Games,zynga,1.0,1.0,1.0,0.0,0.0,1.0,1,...,0.59,350.00,free,0.0,4.5,2315552,2315552,"Dec 21, 2010","Nov 04, 2020",True
3,ee.dustland.android.dustlandsudoku,Sudoku - The Clean One,dustland design,1.0,1.0,0.0,0.0,0.0,0.0,1,...,1.89,2.99,free,0.0,4.6,6639,6639,"Jul 20, 2017","Oct 29, 2020",False
4,com.miniclip.eightballpool,8 Ball Pool,miniclip.com,1.0,1.0,0.0,0.0,0.0,0.0,1,...,0.79,99.99,free,0.0,4.5,17053699,17053699,"Jan 23, 2013","Nov 26, 2020",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6037,com.berniiiiiiii.logomatchup,"""Memory"" - Memory game",berni mobile,,,,,,,1,...,,,free,0.0,4.1,84.91k+,84910,"Oct 26, 2012","Nov 28, 2020",True
6038,com.gipnetix.berryking,Berry King,gameborn inc.,,,,,,,1,...,,,free,0.0,4.4,35.66k+,35660,"Jul 11, 2014","Aug 20, 2020",False
6039,com.smilerlee.jewels,Jewels Switch,words mobile,,,,,,,1,...,,,free,0.0,4.7,728.57k+,728570,"Jun 13, 2013","Dec 26, 2019",True
6040,com.geargames.pfp,Prehistoric Park Builder,gear games,,,,,,,1,...,,,free,0.0,4.6,316.56k+,316560,"Nov 28, 2012","Feb 17, 2014",True


In [48]:
df_companies = df_merged.groupby("company")["company"].count().reset_index(name="count")
df_companies

Unnamed: 0,company,count
0,(<u+c8fc>)<u+c5d4><u+be44><u+c83c><u+d06c><u+b...,1
1,(andrei & aleksandr krupiankou),1
2,0.1%,2
3,0km apps,1
4,100500games.org,1
...,...,...
3142,zut!,1
3143,zuuks games,3
3144,zy puzzle games,1
3145,zynga,19


In [49]:
companiesList = df_companies.to_dict('records')
companiesList

[{'company': '(<u+c8fc>)<u+c5d4><u+be44><u+c83c><u+d06c><u+b9ac><u+c5d0><u+c774><u+d2f0><u+be0c>',
  'count': 1},
 {'company': '(andrei & aleksandr krupiankou)', 'count': 1},
 {'company': '0.1%', 'count': 2},
 {'company': '0km apps', 'count': 1},
 {'company': '100500games.org', 'count': 1},
 {'company': '101xp limited', 'count': 1},
 {'company': '10p studio', 'count': 2},
 {'company': '11 bit studios', 'count': 2},
 {'company': '111%', 'count': 4},
 {'company': '137studio', 'count': 1},
 {'company': '1492 studio', 'count': 9},
 {'company': '17studio', 'count': 1},
 {'company': '17th pixel', 'count': 1},
 {'company': '17th pixel poland', 'count': 1},
 {'company': '1905 games', 'count': 1},
 {'company': '1bsyl', 'count': 3},
 {'company': '1coin', 'count': 1},
 {'company': '1der entertainment', 'count': 1},
 {'company': '1games', 'count': 1},
 {'company': '1gravity llc', 'count': 1},
 {'company': '2048 game', 'count': 1},
 {'company': '21g.', 'count': 1},
 {'company': '22cans', 'count': 1

In [41]:
def IsCompanyExperienced(companiesList, company):
    companyGamesCount = next((listCompany["count"] for listCompany in companiesList if company == listCompany["company"]), None)
    if(companyGamesCount > 1):
        return True
    else:
        return False

In [43]:
df_merged["CompanyIsExperienced"] = df_merged.apply(lambda row: IsCompanyExperienced(companiesList, row["company"]),axis=1)

In [44]:
df_merged

Unnamed: 0,appname,fancyname,company,purchases,ads,sharesinfo,shareslocation,unrestrictedinternet,usersinteract,game,...,top_price_item,price,paidapp,rating,numberreviews,cleaned_numberReviews,App Id,Released,Last Updated,CompanyIsExperienced
0,com.gamebasics.osm,Online Soccer Manager (OSM),gamebasics bv,1.0,1.0,0.0,0.0,0.0,1.0,1,...,109.99,free,0.0,4.5,1656120,1656120,com.gamebasics.osm,"Oct 4, 2012","Nov 26, 2020",False
1,com.fgol.HungrySharkEvolution,Hungry Shark Evolution,ubisoft entertainment,1.0,1.0,0.0,0.0,0.0,0.0,1,...,109.99,free,0.0,4.5,6495234,6495234,com.fgol.HungrySharkEvolution,"Feb 22, 2013","Oct 27, 2020",True
2,com.zynga.livepoker,Zynga Poker ? Free Texas Holdem Online Card Games,zynga,1.0,1.0,1.0,0.0,0.0,1.0,1,...,350.00,free,0.0,4.5,2315552,2315552,com.zynga.livepoker,"Dec 21, 2010","Nov 04, 2020",True
3,ee.dustland.android.dustlandsudoku,Sudoku - The Clean One,dustland design,1.0,1.0,0.0,0.0,0.0,0.0,1,...,2.99,free,0.0,4.6,6639,6639,ee.dustland.android.dustlandsudoku,"Jul 20, 2017","Oct 29, 2020",False
4,com.miniclip.eightballpool,8 Ball Pool,miniclip.com,1.0,1.0,0.0,0.0,0.0,0.0,1,...,99.99,free,0.0,4.5,17053699,17053699,com.miniclip.eightballpool,"Jan 23, 2013","Nov 26, 2020",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6037,com.berniiiiiiii.logomatchup,"""Memory"" - Memory game",berni mobile,,,,,,,1,...,,free,0.0,4.1,84.91k+,84910,com.berniiiiiiii.logomatchup,"Oct 26, 2012","Nov 28, 2020",True
6038,com.gipnetix.berryking,Berry King,gameborn inc.,,,,,,,1,...,,free,0.0,4.4,35.66k+,35660,com.gipnetix.berryking,"Jul 11, 2014","Aug 20, 2020",False
6039,com.smilerlee.jewels,Jewels Switch,words mobile,,,,,,,1,...,,free,0.0,4.7,728.57k+,728570,com.smilerlee.jewels,"Jun 13, 2013","Dec 26, 2019",True
6040,com.geargames.pfp,Prehistoric Park Builder,gear games,,,,,,,1,...,,free,0.0,4.6,316.56k+,316560,com.geargames.pfp,"Nov 28, 2012","Feb 17, 2014",True
