In [12]:
# https://github.com/JoMingyu/google-play-scraper
# pip install google-play-scraper

from google_play_scraper import app

import pandas as pd

from tqdm.notebook import tqdm

from time import sleep

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_csv('../data/train.csv', usecols=['bundle', 'os'])

df = df[df['os'].isin(['Android', 'android'])].reset_index(drop=True)
df.drop(columns=['os'], inplace=True)

df.head()

Unnamed: 0,bundle
0,com.MadOut.BIG
1,com.easybrain.solitaire.klondike.free
2,com.orbitalknight.ridiculousfreekick
3,tcouchgind.scooterextreme.scooter
4,com.FidgetTrading3D.game


In [4]:
# выберем уникальные значения
unique_bundles = df['bundle'].unique()

# освободим память
del df

In [5]:
print(len(unique_bundles))

73271


In [12]:
# создадим датафрейм для хранения данных из парсера
pars_res_df = pd.DataFrame()

# создадим список для бандлов, на которых парсер не отработал
err_bundles = []

In [13]:
for app_bundle in tqdm(unique_bundles):

    try:
        result = app(
            app_bundle,
            lang='en', # defaults to 'en'
            country='ru' # defaults to 'us'
        )
        pars_res_df = pars_res_df.append(pd.DataFrame(pd.Series(result)).transpose()).reset_index(drop=True)
    except:
        err_bundles.append(app_bundle)
    
    #sleep(0.2)

  0%|          | 0/23271 [00:00<?, ?it/s]

# Соберем бакеты парсинга в один файл

In [190]:
import pandas as pd
import glob

path = r'../data/parsing_results' # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)

In [191]:
print(frame.shape)
frame.head()

(59369, 52)


Unnamed: 0,title,description,descriptionHTML,summary,summaryHTML,installs,minInstalls,score,ratings,reviews,histogram,price,free,currency,sale,saleTime,originalPrice,saleText,offersIAP,inAppProductPrice,size,androidVersion,androidVersionText,developer,developerId,developerEmail,developerWebsite,developerAddress,privacyPolicy,developerInternalID,genre,genreId,icon,headerImage,screenshots,video,videoImage,contentRating,contentRatingDescription,adSupported,containsAds,released,updated,version,recentChanges,recentChangesHTML,comments,editorsChoice,similarApps,moreByDeveloper,appId,url
0,MadOut2 BigCityOnline,Number one mobile game with open world! \r\n\r...,Number one mobile game with open world! <br><b...,Number One mobile game with open world,Number One mobile game with open world,"10,000,000+",10000000.0,4.182135,591273.0,84059.0,"[68210, 19167, 38791, 75642, 389463]",0.0,True,RUB,False,,,,True,"RUB 75.00 - RUB 5,499.00 per item",Varies with device,4.4,4.4 and up,MadOut Games,MadOut+Games,MadOutContacts@gmail.com,http://madoutgames.com/,"Kazakstan, Astana",http://www.madoutgames.com/privacy,5.592404e+18,Racing,GAME_RACING,https://play-lh.googleusercontent.com/2GjCUPLJ...,https://play-lh.googleusercontent.com/3gYOTXU4...,['https://play-lh.googleusercontent.com/JQLpIT...,https://www.youtube.com/embed/VhIBkdK0UII?ps=p...,https://play-lh.googleusercontent.com/3gYOTXU4...,Rated for 18+,Extreme Violence,True,True,"May 1, 2017",1620986000.0,10.27,Improved performance,Improved performance,"['*EDIT* This ""Russian GTA"" is sucks, just for...",False,"['com.gameloft.android.ANMP.GloftOLHM', 'com.g...",,com.MadOut.BIG,https://play.google.com/store/apps/details?id=...
1,Solitaire Klondike,Solitaire is a time-tested classic card game e...,Solitaire is a time-tested classic card game e...,Klondike Solitaire is a classic card game to t...,Klondike Solitaire is a classic card game to t...,"5,000,000+",5000000.0,4.372654,65158.0,1191.0,"[5152, 1567, 3144, 9257, 46038]",0.0,True,RUB,False,,,,True,RUB 479.00 per item,77M,5.0,5.0 and up,Easybrain,7473634688510685864,support@easybrain.com,http://www.easybrain.com,"3 Krinou street\nThe Oval, 6th Floor\nLimassol...",https://easybrain.com/privacy,7.473635e+18,Card,GAME_CARD,https://play-lh.googleusercontent.com/4lB-0lSR...,https://play-lh.googleusercontent.com/FTlCtOhw...,['https://play-lh.googleusercontent.com/HMcSZA...,https://www.youtube.com/embed/LQwkQ5_B-5I?ps=p...,https://play-lh.googleusercontent.com/FTlCtOhw...,Rated for 12+,Nudity,True,True,"May 19, 2021",1635255000.0,2.6.0,- New game mode added! Master your strategy to...,- New game mode added! Master your strategy to...,['As far as an app dedicated solely to solitai...,False,"['at.ner.SolitaireKlondike', 'com.easyfun.soli...",,com.easybrain.solitaire.klondike.free,https://play.google.com/store/apps/details?id=...
2,Crazy Kick!,Get to the goal!\r\n\r\n\r\nOvertake your oppo...,Get to the goal!<br><br><br>Overtake your oppo...,"Dribble, shoot and score!","Dribble, shoot and score!","10,000,000+",10000000.0,4.113662,149456.0,2051.0,"[17156, 5999, 13030, 19735, 93536]",0.0,True,RUB,False,,,,True,RUB 50.00 - RUB 259.00 per item,53M,4.4,4.4 and up,VOODOO,VOODOO,support@voodoo.io,https://www.voodoo.io,"4 rue Jules Lefebvre, 75009 Paris",https://www.voodoo.io/privacy,8.306042e+18,Arcade,GAME_ARCADE,https://play-lh.googleusercontent.com/X26iLdhs...,https://play-lh.googleusercontent.com/3Iqn4A9w...,['https://play-lh.googleusercontent.com/lhqXPO...,,,Rated for 3+,,True,True,"Aug 30, 2019",1633098000.0,1.18.2,- bugfixes & improvements,- bugfixes &amp; improvements,"[""It's very good. I can handle the adds by tur...",False,"['com.hyper.ballbrawl', 'com.masomo.headball2'...","['com.anvil.skirt', 'com.studio501.canvasrun',...",com.orbitalknight.ridiculousfreekick,https://play.google.com/store/apps/details?id=...
3,Fidget Trading 3D - Fidget Toys,Want to collect ALL The Fidgets In The World? ...,Want to collect ALL The Fidgets In The World? ...,Trade & Collect Satisfying Fidgets,Trade &amp; Collect Satisfying Fidgets,"10,000,000+",10000000.0,3.415289,101655.0,1035.0,"[23745, 8175, 13858, 13858, 42019]",0.0,True,RUB,False,,,,False,,82M,5.0,5.0 and up,MagicLab,MagicLab,cetciz@hotmail.com,http://www.maglab.com.tr/,Üniversiteler Mahallesi İhsan Doğramacı Bulvar...,https://yildirim.me/privacy/,8.363139e+18,Casual,GAME_CASUAL,https://play-lh.googleusercontent.com/aHhGDVFk...,https://play-lh.googleusercontent.com/aJAYdiD6...,['https://play-lh.googleusercontent.com/rEKVz9...,https://www.youtube.com/embed/4D9QoKSf6Hs?ps=p...,https://play-lh.googleusercontent.com/aJAYdiD6...,Rated for 3+,,True,True,"Jun 29, 2021",1632132000.0,1.2.8,-New fidgets added\r\n-AI improvements\r\n-Bet...,-New fidgets added<br>-AI improvements<br>-Bet...,"[""This game is really nice I really like it bu...",False,['com.dobroapps.anti.stress'],"['com.magiclab.insatiaio', 'com.magiclab.snail...",com.FidgetTrading3D.game,https://play.google.com/store/apps/details?id=...
4,Brain Wash - Thinking Game,<b>How often do you talk to your brain?</b>\r\...,<b>How often do you talk to your brain?</b><br...,Be smart! Train your brain and logic in over m...,Be smart! Train your brain and logic in over m...,"50,000,000+",50000000.0,4.462635,287850.0,7228.0,"[21905, 6597, 10702, 25809, 222837]",0.0,True,RUB,False,,,,True,RUB 269.00 per item,126M,4.4,4.4 and up,SayGames Ltd,6392896734092635573,brain_wash_android@say.games,https://say.games/,,https://saygames.by/privacy-policy/brainwash,6.392897e+18,Puzzle,GAME_PUZZLE,https://play-lh.googleusercontent.com/x-APduqO...,https://play-lh.googleusercontent.com/i5if9TfU...,['https://play-lh.googleusercontent.com/Vv20Xr...,,,Rated for 12+,Sexual Innuendo,True,True,"Jun 30, 2020",1627721000.0,1.30.0,internal improvements and bug fixes,internal improvements and bug fixes,['It would be great if there were only 390 lev...,False,"['com.playstrom.dop2', 'com.playstrom.bob', 'c...",,com.hwg.sos,https://play.google.com/store/apps/details?id=...


# Список нужных признаков:
appId, title, description, summary, minInstalls, score, ratings, reviews, price, free, size, androidVersion, developer, genre, contentRating, contentRatingDescription, adSupported, containsAds,released, editorsChoice, moreByDeveloper

In [192]:
selected_features = ['appId',
                    'title',
                    'description',
                    'summary',
                    'minInstalls',
                    'score',
                    'ratings',
                    'reviews',
                    'price',
                    'free',
                    'size',
                    'androidVersion',
                    'developer',
                    'genre',
                    'contentRating',
                    'contentRatingDescription',
                    'adSupported',
                    'containsAds',
                    'released',
                    'editorsChoice',
                    'moreByDeveloper'
                    ]

frame = frame[selected_features]

In [193]:
frame.head()

Unnamed: 0,appId,title,description,summary,minInstalls,score,ratings,reviews,price,free,size,androidVersion,developer,genre,contentRating,contentRatingDescription,adSupported,containsAds,released,editorsChoice,moreByDeveloper
0,com.MadOut.BIG,MadOut2 BigCityOnline,Number one mobile game with open world! \r\n\r...,Number One mobile game with open world,10000000.0,4.182135,591273.0,84059.0,0.0,True,Varies with device,4.4,MadOut Games,Racing,Rated for 18+,Extreme Violence,True,True,"May 1, 2017",False,
1,com.easybrain.solitaire.klondike.free,Solitaire Klondike,Solitaire is a time-tested classic card game e...,Klondike Solitaire is a classic card game to t...,5000000.0,4.372654,65158.0,1191.0,0.0,True,77M,5.0,Easybrain,Card,Rated for 12+,Nudity,True,True,"May 19, 2021",False,
2,com.orbitalknight.ridiculousfreekick,Crazy Kick!,Get to the goal!\r\n\r\n\r\nOvertake your oppo...,"Dribble, shoot and score!",10000000.0,4.113662,149456.0,2051.0,0.0,True,53M,4.4,VOODOO,Arcade,Rated for 3+,,True,True,"Aug 30, 2019",False,"['com.anvil.skirt', 'com.studio501.canvasrun',..."
3,com.FidgetTrading3D.game,Fidget Trading 3D - Fidget Toys,Want to collect ALL The Fidgets In The World? ...,Trade & Collect Satisfying Fidgets,10000000.0,3.415289,101655.0,1035.0,0.0,True,82M,5.0,MagicLab,Casual,Rated for 3+,,True,True,"Jun 29, 2021",False,"['com.magiclab.insatiaio', 'com.magiclab.snail..."
4,com.hwg.sos,Brain Wash - Thinking Game,<b>How often do you talk to your brain?</b>\r\...,Be smart! Train your brain and logic in over m...,50000000.0,4.462635,287850.0,7228.0,0.0,True,126M,4.4,SayGames Ltd,Puzzle,Rated for 12+,Sexual Innuendo,True,True,"Jun 30, 2020",False,


# Подготовим некоторые признаки

In [194]:
# Преобразуем признак с другими играми издателя в их количество
frame['moreByDeveloper'] = frame['moreByDeveloper'].fillna('[]').apply(eval).apply(len)

In [195]:
# Преобразуем размер файла в целое число
frame['size'] = frame['size'].replace('Varies with device', '-1M')

frame['size'] = frame['size'].str.replace(',', '.').str.extract(r'(\d+[.\d]*)').astype(float)[0]

In [196]:
# Преобразуем версию андроида
frame['androidVersion'] = frame['androidVersion'].replace('Varies', '-1')

frame['androidVersion'] = frame['androidVersion'].str.split('.').str[0].str.extract(r'(\d+[.\d]*)').astype(float)[0]

In [197]:
# Преобразуем дату публикации и вытащим из нее признаки
frame['released'] = pd.to_datetime(frame['released'])

frame['release_day'] = frame['released'].dt.day
frame['release_month'] = frame['released'].dt.month
frame['release_year'] = frame['released'].dt.year

# удалим колонку с исходной датой
frame.drop(columns=['released'], inplace=True)

In [198]:
# Преобразуем оставшиеся булевы признаки в бинарный вид
for bool_feature in ['free', 'adSupported', 'containsAds', 'editorsChoice']:
    frame[bool_feature] = frame[bool_feature].fillna(-1000).astype(int)

In [202]:
# Почистим текстовые данные
frame['description'] = frame['description'].str.replace("\r\n", "", regex=True)\
    .str.replace("\t", "", regex=True)\
    .str.replace('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', '')\
    .str.replace("["
        u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                u"\U00002702-\U000027B0"
                u"\U000024C2-\U0001F251"
                u"\U0001f926-\U0001f937"
                u'\U00010000-\U0010ffff'
                u"\u200d"
                u"\u2640-\u2642"
                u"\u2600-\u2B55"
                u"\u23cf"
                u"\u23e9"
                u"\u231a"
                u"\u3030"
                u"\ufe0f"
       "]+", "", regex=True)

In [203]:
frame['summary'] = frame['summary'].str.replace("\r\n", "", regex=True)\
    .str.replace("\t", "", regex=True)\
    .str.replace('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', '')\
    .str.replace("["
        u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                u"\U00002702-\U000027B0"
                u"\U000024C2-\U0001F251"
                u"\U0001f926-\U0001f937"
                u'\U00010000-\U0010ffff'
                u"\u200d"
                u"\u2640-\u2642"
                u"\u2600-\u2B55"
                u"\u23cf"
                u"\u23e9"
                u"\u231a"
                u"\u3030"
                u"\ufe0f"
       "]+", "", regex=True)

In [204]:
frame.head()

Unnamed: 0,appId,title,description,summary,minInstalls,score,ratings,reviews,price,free,size,androidVersion,developer,genre,contentRating,contentRatingDescription,adSupported,containsAds,editorsChoice,moreByDeveloper,release_day,release_month,release_year
0,com.MadOut.BIG,MadOut2 BigCityOnline,Number one mobile game with open world! Online...,Number One mobile game with open world,10000000.0,4.182135,591273.0,84059.0,0.0,1,1.0,4.0,MadOut Games,Racing,Rated for 18+,Extreme Violence,1,1,0,0,1.0,5.0,2017.0
1,com.easybrain.solitaire.klondike.free,Solitaire Klondike,Solitaire is a time-tested classic card game e...,Klondike Solitaire is a classic card game to t...,5000000.0,4.372654,65158.0,1191.0,0.0,1,77.0,5.0,Easybrain,Card,Rated for 12+,Nudity,1,1,0,0,19.0,5.0,2021.0
2,com.orbitalknight.ridiculousfreekick,Crazy Kick!,"Get to the goal!Overtake your opponents, move ...","Dribble, shoot and score!",10000000.0,4.113662,149456.0,2051.0,0.0,1,53.0,4.0,VOODOO,Arcade,Rated for 3+,,1,1,0,5,30.0,8.0,2019.0
3,com.FidgetTrading3D.game,Fidget Trading 3D - Fidget Toys,Want to collect ALL The Fidgets In The World? ...,Trade & Collect Satisfying Fidgets,10000000.0,3.415289,101655.0,1035.0,0.0,1,82.0,5.0,MagicLab,Casual,Rated for 3+,,1,1,0,5,29.0,6.0,2021.0
4,com.hwg.sos,Brain Wash - Thinking Game,"How often do you talk to your brain? Hey, frie...",Be smart! Train your brain and logic in over m...,50000000.0,4.462635,287850.0,7228.0,0.0,1,126.0,4.0,SayGames Ltd,Puzzle,Rated for 12+,Sexual Innuendo,1,1,0,0,30.0,6.0,2020.0


In [205]:
# сохраним итоговый файл
frame.to_csv('../data/parsed_final.csv', index=False)