# Google Play

In [1]:
import pandas as pd

# 加载数据集
path = r'C:\Users\11435\Desktop\googleplayAPP.csv'
df = pd.read_csv(path)

# 查看数据集的前几行以了解数据结构
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,7-Jan-18,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,15-Jan-18,2.0.0,4.0.3 and up
2,"U Launcher Lite 鈥?FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,1-Aug-18,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,8-Jun-18,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,20-Jun-18,1.1,4.4 and up


## App不同类目

In [2]:
keys_to_drop = ['4.3', '4.6', '4.1', '3.9', '4.2', '4.5', '4', '3.7', '4.7', '3.8', '3.1', '000+"']
df = df[~df['Category'].isin(keys_to_drop)]
df['Category'].value_counts()

Category
GAME                   546
FAMILY                 515
MEDICAL                309
TOOLS                  268
DATING                 223
HEALTH_AND_FITNESS     221
COMMUNICATION          202
PHOTOGRAPHY            197
PERSONALIZATION        196
PRODUCTIVITY           183
SHOPPING               167
SPORTS                 161
EDUCATION              156
SOCIAL                 151
ENTERTAINMENT          148
FINANCE                144
TRAVEL_AND_LOCAL       139
LIFESTYLE              131
BUSINESS               129
NEWS_AND_MAGAZINES     113
FOOD_AND_DRINK          84
VIDEO_PLAYERS           82
BOOKS_AND_REFERENCE     73
HOUSE_AND_HOME          71
MAPS_AND_NAVIGATION     61
AUTO_AND_VEHICLES       56
ART_AND_DESIGN          56
PARENTING               51
LIBRARIES_AND_DEMO      47
WEATHER                 47
EVENTS                  46
COMICS                  41
BEAUTY                  41
Name: count, dtype: int64

## 平均销售最高和最低的类目

In [8]:
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
avg_price_by_category = df.groupby('Type')['Price'].mean()

min_avg_price_type = avg_price_by_category.idxmin()
max_avg_price_type = avg_price_by_category.idxmax()

print("平均销售价格最低的类别:", min_avg_price_type)
print("平均销售价格最高的类别:", max_avg_price_type)

平均销售价格最低的类别: Free
平均销售价格最高的类别: Free


## 评分最高和最低的10个APP

In [5]:
df_sorted = df.sort_values(by='Rating', ascending=False)

print("评分最高的10个 APP：")
print(df_sorted.head(10))

print("\n评分最低的10个 APP：")
print(df_sorted.tail(10))

评分最高的10个 APP：
                                App             Category Rating Reviews  Size  \
2477         Basics of Orthopaedics              MEDICAL      5       1  5.6M   
1547                   Eternal life   LIBRARIES_AND_DEMO      5      26  2.5M   
2452               Galaxies of Hope              MEDICAL      5       2   24M   
4988           Easy Hotspot Ad Free                TOOLS      5       2  3.3M   
4539  R Programing Offline Tutorial  BOOKS_AND_REFERENCE      5       4  3.9M   
2450                Tablet Reminder              MEDICAL      5       4  2.5M   
1038      Mindvalley U Tallinn 2018               EVENTS      5       1   21M   
1030                     Prosperity               EVENTS      5      16  2.3M   
615                    Awake Dating               DATING      5       2   70M   
4477      Android P Style Icon Pack      PERSONALIZATION      5       1   60M   

     Installs  Type   Price Content Rating             Genres Last Updated  \
2477      10+  F

## 最受欢迎的类目Top3

In [6]:
# 对 'Type' 列进行数量统计
type_counts = df['Type'].value_counts()

# 找到数量前三的类型
top3_types = type_counts.nlargest(3)

# 输出结果
print("最受欢迎的类型 Top3：")
print(top3_types)

最受欢迎的类型 Top3：
Type
Free      4736
Paid       319
$1.49        2
Name: count, dtype: int64


## 更新最勤快的App

In [7]:
def check_date_format(date_str, format_str='%d-%b-%y'):
    try:
        pd.to_datetime(date_str, format=format_str)
        return True
    except ValueError:
        return False

# 过滤不符合指定格式的日期时间字符串
df['Last Updated'] = df['Last Updated'][df['Last Updated'].apply(check_date_format)]

# 将 "Last Updated" 列转换为日期时间格式
df['Last Updated'] = pd.to_datetime(df['Last Updated'], format='%d-%b-%y')

# 按照 "Last Updated" 列降序排序
df_sorted = df.sort_values(by='Last Updated', ascending=False)

# 找到更新最勤快的前五个 App
top5_apps = df_sorted.head(5)

# 输出结果
print("更新最勤快的 App Top5：")
print(top5_apps.App)

更新最勤快的 App Top5：
3679                                         OBJECTIVE
3840                              Sygic Car Navigation
3328    Emoji keyboard - Cute Emoticons, GIF, Stickers
3590                      Baby Food - Homemade Recipes
3687                           Google Play Movies & TV
Name: App, dtype: object
