In [1]:
#Import file & resources

import pandas as pd
import numpy as np


In [2]:
#Read file into df

df = pd.read_csv("Android apps.csv", low_memory=False)

#df


In [3]:
#Filter out non-game app data
print(f"size before filter: {len(df)}")
df = df.loc[df['game']==1]
print(f"size after filtering by game: {len(df)}")

#Filter out invalid data
df = df.loc[(df['numberreviews'] != 'error during scraping') & (df['numberreviews'] != 'rating disabled')]
print(f"size after filtering out invalid numberreviews values: {len(df)}")
#df

size before filter: 50217
size after filtering by game: 11109
size after filtering out invalid numberreviews values: 10621


In [4]:
#Keep features/columns

columns_keep = 'fancyname	company	purchases	ads	sharesinfo	shareslocation	unrestrictedinternet	usersinteract	game	age_rating	Parentalguidance	Downloads	categorygame	low_price_item	top_price_item	price	paidapp	rating	numberreviews'.split('\t')

df = df[columns_keep]
#df

In [5]:
# Drop nan of numberreviews

df_notna = df[df['numberreviews'].notna()]
#df_notna

In [6]:
# Create function to replace specific string characters in numberreviews for conversion purposes

def CleanNumberString(numberString):
    #print(f"number to cleanse: {numberString}")
    cleanedString = numberString.replace(",","").replace("","").replace("k","").replace("+","").replace("m","")
    #print (f"cleaned string: {cleanedString}")
    return cleanedString

In [7]:
# Create function to be chained with CleanNumberString function & convert string numerical values

def CleanNumber(numberString):
    multiplier = 1

    if('k' in numberString):
        multiplier = 1000
    elif('m' in numberString):
        multiplier = 1000000
        
    cleanedString = float(CleanNumberString(numberString)) * multiplier    
    
    return cleanedString

In [8]:
# Create clean numerical values in string format in numberreviews column

df_notna["cleaned_numberReviews"] = df_notna["numberreviews"]
#df_notna
df_notna['cleaned_numberReviews'] = df_notna.apply(lambda row: CleanNumber(row['cleaned_numberReviews']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_notna["cleaned_numberReviews"] = df_notna["numberreviews"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_notna['cleaned_numberReviews'] = df_notna.apply(lambda row: CleanNumber(row['cleaned_numberReviews']), axis=1)


In [9]:
df_notna[df_notna['numberreviews'].str.contains("m")]

Unnamed: 0,fancyname,company,purchases,ads,sharesinfo,shareslocation,unrestrictedinternet,usersinteract,game,age_rating,Parentalguidance,Downloads,categorygame,low_price_item,top_price_item,price,paidapp,rating,numberreviews,cleaned_numberReviews
41728,MARVEL Spider-Man Unlimited,gameloft,,,,,,,1,12+,0.0,,action,,,free,0.0,4.2,2.4m+,2400000.0
41771,UNO ? & Friends,gameloft,,,,,,,1,everyone,0.0,,card,,,free,0.0,4.1,1.69m+,1690000.0
42130,Where's My Water? Free,disney,,,,,,,1,everyone,0.0,,puzzle,,,free,0.0,4.3,1.4m+,1400000.0
42131,Turbo FAST,pikpok,,,,,,,1,everyone,0.0,,racing,,,free,0.0,4.3,1.34m+,1340000.0
42311,Don't Tap The White Tile,cheetah games,,,,,,,1,everyone,0.0,,arcade,,,free,0.0,4.3,2.96m+,2960000.0
42594,101-in-1 Games,nordcurrent,,,,,,,1,everyone,0.0,,arcade,,,free,0.0,4.4,1.1m+,1100000.0
42795,Online Head Ball 3,masomo gaming,,,,,,,1,everyone,0.0,,sports,,,free,0.0,4.4,1.13m+,1130000.0
44077,<U+25BA> MultiCraft <U+2015> Free Miner! <U+00...,multicraft official,,,,,,,1,12+,0.0,,adventure,,,free,0.0,4.3,1.32m+,1320000.0
44209,Call of Duty?: Heroes,"activision publishing, inc.",,,,,,,1,12+,0.0,,action,,,free,0.0,4.4,1.61m+,1610000.0
47013,Minecraft: Story Mode,telltale games,,,,,,,1,12+,0.0,,adventure,,,free,0.0,4.3,1.11m+,1110000.0


In [10]:
# Attempt to cast cleansed column as int - should throw errors if cleansing is incomplete
df_notna.cleaned_numberReviews = df_notna.cleaned_numberReviews.astype(np.int64)
df_notna

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,fancyname,company,purchases,ads,sharesinfo,shareslocation,unrestrictedinternet,usersinteract,game,age_rating,Parentalguidance,Downloads,categorygame,low_price_item,top_price_item,price,paidapp,rating,numberreviews,cleaned_numberReviews
228,Online Soccer Manager (OSM),gamebasics bv,1.0,1.0,0.0,0.0,0.0,1.0,1,everyone,0.0,10000000.0,sports,0.59,109.99,free,0.0,4.5,1656120,1656120
235,Hungry Shark Evolution,ubisoft entertainment,1.0,1.0,0.0,0.0,0.0,0.0,1,16+,0.0,100000000.0,arcade,2.29,109.99,free,0.0,4.5,6495234,6495234
246,Zynga Poker ? Free Texas Holdem Online Card Games,zynga,1.0,1.0,1.0,0.0,0.0,1.0,1,12+,0.0,50000000.0,casino,0.59,350.00,free,0.0,4.5,2315552,2315552
247,Sudoku - The Clean One,dustland design,1.0,1.0,0.0,0.0,0.0,0.0,1,everyone,0.0,1000000.0,puzzle,1.89,2.99,free,0.0,4.6,6639,6639
254,8 Ball Pool,miniclip.com,1.0,1.0,0.0,0.0,0.0,0.0,1,everyone,0.0,500000000.0,sports,0.79,99.99,free,0.0,4.5,17053699,17053699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50172,TETRIS Blitz,electronic arts,1.0,0.0,0.0,0.0,0.0,0.0,1,everyone,0.0,10000000.0,puzzle,1.09,109.99,free,0.0,4.2,250381,250381
50188,Worms 2: Armageddon,team 17 digital limited,0.0,0.0,0.0,0.0,0.0,0.0,1,7+,0.0,1000000.0,strategy,,,4.49,1.0,3.9,61603,61603
50190,Harry Potter: Wizards Unite,"niantic, inc.",1.0,0.0,0.0,0.0,0.0,0.0,1,7+,0.0,5000000.0,adventure,1.09,109.99,free,0.0,3.9,268773,268773
50192,Small World 2,no info,,,,,,,1,no info,0.0,,board,,,free,0.0,4.3,5.66k+,5660


In [11]:
# Get distribution of number of reviews values

df_notna['cleaned_numberReviews'].describe()


count    1.052500e+04
mean     1.564649e+05
std      1.021994e+06
min      1.000000e+00
25%      1.010000e+03
50%      8.789000e+03
75%      5.307300e+04
max      5.020212e+07
Name: cleaned_numberReviews, dtype: float64

In [12]:
# Check lengths before/after removal of first quartile figures

print(len(df_notna))

df_notna = df_notna.loc[df_notna['cleaned_numberReviews'] >= 1000]
print(len(df_notna))

10525
7905


In [13]:
# Double check ratings data for any quirks

df_notna.rating.unique()

array(['4.5', '4.6', '3.4', '4.3', '4.2', '4.4', '3.8', '4.0', '4.7',
       '3.2', '3.5', '4.1', '3.7', '3.9', '3.6', '3.3', '4.8', '3.1',
       '3.0', '2.5', '2.9', '2.7', '2.6', '2.8', '2.3', '2.4', '2.2',
       '4.9', '1.7', '2.0', '1.9', '1.5', '2.1', '1.8', '1.3', '1.6'],
      dtype=object)

In [14]:
# Cast ratings as float type

df_notna.rating = df_notna.rating.astype(np.float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [15]:
# Double check latest dtypes on df

df_notna.dtypes

fancyname                 object
company                   object
purchases                float64
ads                      float64
sharesinfo               float64
shareslocation           float64
unrestrictedinternet     float64
usersinteract            float64
game                       int64
age_rating                object
Parentalguidance         float64
Downloads                float64
categorygame              object
low_price_item            object
top_price_item            object
price                     object
paidapp                  float64
rating                   float64
numberreviews             object
cleaned_numberReviews      int64
dtype: object

In [16]:
# Export clean csv for machine learning purposes

df_notna.to_csv(".\cleaned.csv", index=False)

In [17]:
# test_text = "1 2 3 kplus"

# re.search(regex_searchstring, test_text)



NameError: name 're' is not defined

In [None]:
# import re 

# regex_searchstring = '[a-zA-Z+-]'

# f = lambda row: row.apply(str).str.replace(".","").str.contains(keyword ,na=False, flags=re.IGNORECASE)

# df_filter = df_notna.numberreviews.str.contains('[a-zA-Z+-]')

# df_filter = df_notna.loc[df_notna['numberreviews'].apply(lambda x: if re.search(regex_searchstring, x)): True else: False]

# df_filter = df_notna.loc[df_notna['numberreviews'].str.contains('[a-zA-Z+-]', regex=True)]

# df_filter = df_notna['numberreviews'].contains('[a-zA-Z+-]')
# df_filter

# remove non numeric characters, ",;.''

In [None]:
#Filter out rows where numberreviews is lower than threshold


In [None]:
#Drop columns as described


In [None]:
#Look for weird data rows e.g. 'errors during scraping' & look at distribution among ratings


In [None]:
#Create another column that rounds ratings up and look at distribution
