In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries used below — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')
from sklearn.preprocessing import MinMaxScaler,StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,ConfusionMatrixDisplay,classification_report


# Data Acquisition

In [None]:
# Read the Google Play Store CSV into a DataFrame and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/google-playstore-apps/Google-Playstore.csv"
)
df.head(n=5)

In [None]:
# Remove spaces from every column label in one pass.
# The previous loop called DataFrame.rename once per column, rebuilding the
# whole frame each time; a single rename with a callable does the same
# label transformation with one copy.
df = df.rename(columns=lambda label: label.replace(' ', ''))
df.columns

# Data Exploration and Cleaning

In [None]:
# Discard the identifier/metadata columns listed in the message, then preview.
print("Dropping the following columns - AppId, DeveloperWebsite, DeveloperEmail, PrivacyPolicy, Currency, DeveloperId, ScrapedTime, MinimumAndroid")
unused_columns = [
    'AppId',
    'DeveloperWebsite',
    'DeveloperEmail',
    'PrivacyPolicy',
    'Currency',
    'DeveloperId',
    'ScrapedTime',
    'MinimumAndroid',
]
df = df.drop(columns=unused_columns)
df.head()

In [None]:
# Report how many columns remain after the drop above.
print("Number of features in the dataset : ", len(df.columns))

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None.
# Wrapping it in print() therefore emitted the report first and then a stray
# "Dataset information None" line; print the header, then call info().
print("Dataset information")
df.info()

In [None]:
# Count the rows that contain at least one null value and express that as a
# percentage of all rows. The row mask is computed once and summed, instead
# of building the mask and a filtered copy of the frame twice.
print("Number of rows having null values in the dataset:")
rows_with_nulls = int(df.isnull().any(axis=1).sum())
missing_info = (rows_with_nulls / len(df)) * 100
print(rows_with_nulls, ' which is ', round(missing_info, 2), '%')

In [None]:
# One boolean per column: True when the column holds at least one null.
print("Features having null values in the dataset:")
df.isna().any(axis=0)

In [None]:
# Count the nulls in every column once and keep only the columns that have any.
null_counts = df.isnull().sum()
cols = null_counts[null_counts > 0].index.to_list()
print("Columns having null values are :", cols)

# Show each column's dtype next to its null count. The previous type(c)
# inspected the column *label*, so it always printed <class 'str'>.
for c in cols:
    print(c, df[c].dtype, ": ", null_counts[c])

In [None]:
# Drop rows that are missing any of the fields the later cells rely on.
# 'AppName' is already part of this subset, so the separate second dropna on
# 'AppName' was redundant and has been removed. Re-assigning (instead of
# inplace=True) keeps the cell a plain "new frame from old frame" step.
df = df.dropna(subset=['Size', 'MinimumInstalls', 'Installs', 'AppName'])

In [None]:
# Cast both rating columns to float and fill their missing values with the
# column mean (rounded to one decimal).
#
# The result of fillna is assigned back to the column. The previous
# df[col].fillna(avg, inplace=True) was chained assignment: it mutates an
# intermediate Series and is not guaranteed to reach the DataFrame (under
# pandas Copy-on-Write it is a silent no-op, leaving the NaNs in place).
for rating_col in ('Rating', 'RatingCount'):
    df[rating_col] = df[rating_col].astype(float)
    avg = round(df[rating_col].mean(), 1)
    df[rating_col] = df[rating_col].fillna(avg)

In [None]:
# Frequency of each distinct ContentRating value.
df.ContentRating.value_counts()

In [None]:
# Collapse the fine-grained content ratings into broader audience groups
# using a single mapping; values not listed here are left untouched.
content_rating_groups = {
    'Unrated': "Everyone",
    'Everyone 10+': "Everyone",
    'Mature 17+': "Adults",
    'Adults only 18+': "Adults",
}
df['ContentRating'] = df['ContentRating'].replace(content_rating_groups)

In [None]:
# Clean the Installs column so that it can be converted to a numeric dtype:
# strip thousands separators and the trailing '+', and map 'Free' to '0'.
#
# regex=False is passed explicitly because '+' is a regex quantifier and the
# default of `regex` in Series.str.replace differs between pandas versions;
# every pattern here is meant literally.
installs_text = (
    df['Installs']
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
    .str.replace('Free', '0', regex=False)
)
df['Installs'] = pd.to_numeric(installs_text)

In [None]:
# Bucket prices into Free / Low / Mid / High.
# The top bin is open-ended (np.inf) instead of stopping at a hard-coded 410,
# so a price above that value lands in 'High' rather than silently becoming
# NaN. include_lowest=True keeps a price of exactly 0 inside the 'Free' bin.
df['PriceRange'] = pd.cut(
    df['Price'],
    bins=[0, 0.19, 9.99, 29.99, np.inf],
    labels=['Free', 'Low', 'Mid', 'High'],
    include_lowest=True,
)
df['PriceRange'].value_counts()

In [None]:
# Show the Free flag distribution, force Free=True wherever the price is 0
# but the flag says otherwise, then show the distribution again.
print(df['Free'].value_counts())
print("Apps that have Price = 0, have Free column True")
zero_price_not_free = df['Price'].eq(0) & df['Free'].eq(False)
df.loc[zero_price_not_free, 'Free'] = True
print(df['Free'].value_counts())

In [None]:
# Turn the boolean Free flag into a 'Free'/'Paid' label column, then remove
# the flag column it was derived from.
is_free = df['Free'].eq(True)
df['Type'] = np.where(is_free, 'Free', 'Paid')
df = df.drop(columns=['Free'])

In [None]:
# Bucket apps by how many ratings they have received.
# np.select takes the first matching condition, so the thresholds are listed
# from largest to smallest. The top bucket is open-ended: the previous
# hard-coded upper bound (138557570) left any larger count labelled
# 'NoRating'. Counts that are 0 (or missing) fall through to the default.
rating_count = df['RatingCount']
df['RatingType'] = np.select(
    [rating_count > 500000, rating_count > 10000, rating_count > 0],
    ['More than 500K', 'Between 10K and 500K', 'Less than 10K'],
    default='NoRating',
)
df.RatingType.value_counts()

# Data Visualization

In [None]:
# Count of apps per content rating, split into Free and Paid.
# The columns are passed by keyword (data=/x=/hue=): seaborn >= 0.12 accepts
# only `data` positionally, so the old positional Series was no longer
# interpreted as the x variable.
plt.figure(figsize=(10, 10))
ax = sns.countplot(data=df, x='ContentRating', hue='Type')
plt.title("ContentRating in Free and Paid")

# Write each bar's count centred above the bar.
for p in ax.patches:
    ax.annotate(p.get_height(), (p.get_x() + p.get_width() / 2, p.get_height()), ha='center', size=20)

plt.show()

In [None]:
# Count of apps per install tier, split into Free and Paid.
# Columns are passed by keyword because seaborn >= 0.12 accepts only `data`
# positionally.
plt.figure(figsize=(18, 18))
ax = sns.countplot(data=df, x='Installs', hue='Type')
plt.title("Number of Installs in different Types ")

# Rotate the tick labels so the install tiers stay readable.
plt.xticks(fontsize=10, fontweight='bold', rotation=45, ha='right');
plt.show()

In [None]:
# Box plot of app ratings per category, with categories ordered from the
# most to the least frequent.
fig, ax = plt.subplots(figsize=(15, 15))
category_order = df['Category'].value_counts().index
sns.boxplot(
    data=df,
    x="Rating",
    y="Category",
    order=category_order,
    palette="Pastel1",
    ax=ax,
)
ax.set_title("Ratings by Category", fontsize='20')
ax.set_ylabel('Category', fontsize='15')
ax.set_xlabel('Rating', fontsize='15');

In [None]:
# Number of non-null RatingCount entries per category.
# NOTE(review): count() counts rows (apps) per category, not the sum of the
# ratings they received — confirm this matches the chart title.
category_rating = df.groupby(['Category'])['RatingCount'].count()

plt.figure(figsize=(15, 10))
# x/y are passed by keyword: seaborn >= 0.12 rejects positional x and y.
sns.barplot(x=category_rating.index, y=category_rating.values)
plt.title('Number of Ratings Per Category')
plt.xlabel('Category')
plt.ylabel('Rating')
plt.xticks(fontsize=10, fontweight='bold', rotation=45, ha='right');

Education, Tools, Business, and Music & Audio have the most ratings.

In [None]:
# Heat map of app counts for every (Category, Type) pair.
type_by_category = pd.crosstab(index=df['Category'], columns=df["Type"])

plt.figure(figsize=(15, 15))
plt.title("Categories in Free/Paid")
sns.heatmap(type_by_category, annot=True, fmt='g', cmap="Pastel1_r")
plt.show()

In [None]:
# Show the row(s) whose RatingCount equals the largest value in the column.
m = df['RatingCount'].max()
df.loc[df['RatingCount'] == m]

In [None]:
# Heat map of app counts for every (Category, ContentRating) pair.
content_by_category = pd.crosstab(index=df['Category'], columns=df["ContentRating"])

plt.figure(figsize=(15, 15))
plt.title("Categories based on their Content")
sns.heatmap(content_by_category, annot=True, fmt='g', cmap="Pastel1_r");
plt.show()

    Teens have more apps in Music & Audio, Social, Entertainment
    Adults have more apps in Entertainment, Music & Audio, Personalization

In [None]:
# Social apps with more than 1M installs and more than 1M ratings; keep the
# ten with the highest rating count.
df_e = df.loc[(df.Installs > 1000000) & (df.RatingCount > 1000000) & (df.Category == "Social")]
df_e = df_e.sort_values(by=['RatingCount'], ascending=False).head(10)

plt.figure(figsize=(15, 10))

# Columns are passed by keyword: seaborn >= 0.12 rejects positional x and y.
ax = sns.barplot(data=df_e, x='AppName', y='Rating', palette='coolwarm_r')
# Write each app's rating centred above its bar.
for p in ax.patches:
    ax.annotate(p.get_height(), (p.get_x() + p.get_width() / 2, p.get_height()), ha='center', size=20)
plt.title("Top Apps in Social Category based on RatingCount and Installs with their Ratings")
plt.xticks(fontsize=10, fontweight='bold', rotation=45, ha='right');
plt.show()

In [None]:
# Entertainment apps with more than 1M installs and more than 1M ratings;
# keep the ten with the highest rating count.
df_e = df.loc[(df.Installs > 1000000) & (df.RatingCount > 1000000) & (df.Category == "Entertainment")]
df_e = df_e.sort_values(by=['RatingCount'], ascending=False).head(10)

plt.figure(figsize=(15, 10))

# Columns are passed by keyword: seaborn >= 0.12 rejects positional x and y.
ax = sns.barplot(data=df_e, x='AppName', y='Rating', palette='Set3_r')
# Write each app's rating centred above its bar.
for p in ax.patches:
    ax.annotate(p.get_height(), (p.get_x() + p.get_width() / 2, p.get_height()), ha='center', size=20)
plt.title("Top Apps in Entertainment Category based on RatingCount and Installs with their Ratings")
plt.xticks(fontsize=10, fontweight='bold', rotation=45, ha='right');
plt.show()

In [None]:
# Education apps with more than 1M installs and more than 1M ratings; keep
# the ten with the highest rating count.
df_e = df.loc[(df.Installs > 1000000) & (df.RatingCount > 1000000) & (df.Category == "Education")]
df_e = df_e.sort_values(by=['RatingCount'], ascending=False).head(10)

plt.figure(figsize=(15, 10))

# Columns are passed by keyword: seaborn >= 0.12 rejects positional x and y.
ax = sns.barplot(data=df_e, x='AppName', y='Rating', palette='Set2_r')
# Write each app's rating centred above its bar.
for p in ax.patches:
    ax.annotate(p.get_height(), (p.get_x() + p.get_width() / 2, p.get_height()), ha='center', size=20)
plt.title("Top Apps in Education Category based on RatingCount and Installs with their Ratings")
plt.xticks(fontsize=10, fontweight='bold', rotation=45, ha='right');
plt.show()

In [None]:
# Communication apps with more than 1M installs and more than 1M ratings;
# keep the ten with the highest rating count.
df_e = df.loc[(df.Installs > 1000000) & (df.RatingCount > 1000000) & (df.Category == "Communication")]
df_e = df_e.sort_values(by=['RatingCount'], ascending=False).head(10)

plt.figure(figsize=(15, 10))

# Columns are passed by keyword: seaborn >= 0.12 rejects positional x and y.
ax = sns.barplot(data=df_e, x='AppName', y='Rating', palette='Set2_r')
# Write each app's rating centred above its bar.
for p in ax.patches:
    ax.annotate(p.get_height(), (p.get_x() + p.get_width() / 2, p.get_height()), ha='center', size=20)
# Title corrected: this chart filters on the Communication category, but the
# copied title still read "Education Communication".
plt.title("Top Apps in Communication Category based on RatingCount and Installs with their Ratings")
plt.xticks(fontsize=10, fontweight='bold', rotation=45, ha='right');
plt.show()

In [None]:
# Top paid apps: paid apps in the 5M or 10M install tier, ranked by their
# best rating; the ten highest are plotted.
is_paid = df['Type'].eq('Paid')
in_top_install_tier = df['Installs'].isin([5000000, 10000000])

paid_apps = (
    df[is_paid & in_top_install_tier]
    .groupby(['AppName'])['Rating']
    .max()
    .sort_values(ascending=False)
    .head(10)
)
plt.title("Top Paid Apps based on highest ratings and installs")

ax = sns.lineplot(x=paid_apps.values, y=paid_apps.index, color='green');

In [None]:
# Top free apps: free apps in the 10M install tier, ranked by their best
# rating; the ten highest are plotted.
is_free_10m = df['Type'].eq('Free') & df['Installs'].eq(10000000)

free_apps = (
    df[is_free_10m]
    .groupby(['AppName'])['Rating']
    .max()
    .sort_values(ascending=False)
    .head(10)
)
plt.title("Top Free Apps based on highest ratings and more installs")
sns.lineplot(x=free_apps.values, y=free_apps.index, color='orange');

In [None]:
# Stacked bar chart: number of apps per category, split by price range.
x = pd.crosstab(index=df['Category'], columns=df['PriceRange'])
x.plot.bar(stacked=True, figsize=(15, 15))
plt.title("Category Vs PriceRange")
plt.xticks(fontsize=10, fontweight='bold', rotation=45, ha='right');

# Data Modeling

In [None]:
# Encode the categorical feature columns as integer codes. factorize numbers
# the values in order of first appearance, which is fine for features.
feature_columns_to_encode = [
    'Category',
    'Type',
    'ContentRating',
    'AdSupported',
    'EditorsChoice',
    'InAppPurchases',
    'PriceRange',
]
for feature in feature_columns_to_encode:
    df[feature] = pd.factorize(df[feature])[0].astype(int)

# Encode the target with an explicit, fixed label order. The evaluation cells
# label classes 0..3 as NoRating / Less than 10K / Between 10K and 500K /
# More than 500K; factorize would instead number the classes by whichever
# label happens to appear first in the data, so the displayed class names
# could silently stop matching the codes.
rating_type_order = ['NoRating', 'Less than 10K', 'Between 10K and 500K', 'More than 500K']
# Guard so that re-running this cell does not re-encode the integer codes.
if not pd.api.types.is_integer_dtype(df['RatingType']):
    df['RatingType'] = pd.Categorical(
        df['RatingType'], categories=rating_type_order
    ).codes.astype(int)

In [None]:
# Feature matrix: everything except the columns listed here.
# Target vector: the encoded RatingType.
non_feature_columns = [
    'AppName',
    'Size',
    'MinimumInstalls',
    'Released',
    'RatingCount',
    'Type',
    'MaximumInstalls',
    'Price',
    'LastUpdated',
    'Rating',
    'RatingType',
]
X = df.drop(columns=non_feature_columns)
y = df['RatingType'].to_numpy()

In [None]:
# Pairwise correlation between the feature columns, drawn as an annotated
# heat map.
corr = X.corr()

plt.figure(figsize=(10, 10))
sns.heatmap(data=corr, annot=True, square=True, cmap="coolwarm")
plt.title("Correlation Matrix", fontsize='17');

In [None]:
# Standardise the Installs feature.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test rows contribute to the fitted statistics — confirm this is
# acceptable.
scaler = StandardScaler()
installs_frame = X[['Installs']]
scaler.fit(installs_frame)
X['Installs'] = scaler.transform(installs_frame)

In [None]:
# 70/30 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=20,
    stratify=y,
)

# Shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# A simple RandomForestClassifier without any parameter tuning.
# random_state is fixed so the reported numbers are reproducible across runs.
rf_model = RandomForestClassifier(random_state=20)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

rf_acc = accuracy_score(y_test, y_pred) * 100
print("Accuracy =", round(rf_acc, 2), "%")

# scikit-learn's signature is confusion_matrix(y_true, y_pred). The arguments
# were previously swapped, which transposes the matrix: rows were predictions
# while ConfusionMatrixDisplay labels rows as "True label".
cm = confusion_matrix(y_test, y_pred)

cmd = ConfusionMatrixDisplay(cm, display_labels=['NoRating', 'Less than 10K', 'Between 10K and 500K', 'More than 500K'])
fig, ax = plt.subplots(figsize=(12, 12));
plt.title("Confusion Matrix RandomForestClassifier")
cmd.plot(ax=ax);

In [None]:
# Per-class precision/recall/F1 for the most recent predictions in y_pred.
target_names = [
    'NoRating',
    'Less than 10K',
    'Between 10K and 500K',
    'More than 500K',
]
cr = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=target_names,
)
print("Classification Report for RandomForestClassifier")
print(cr)

In [None]:
# GradientBoostingClassifier with default parameters.
# random_state is fixed so the reported numbers are reproducible across runs.
gb_model = GradientBoostingClassifier(random_state=20)
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)

gb_acc = accuracy_score(y_test, y_pred) * 100
print("Accuracy =", round(gb_acc, 2), "%")

# scikit-learn's signature is confusion_matrix(y_true, y_pred). The arguments
# were previously swapped, which transposes the matrix relative to the
# "True label" / "Predicted label" axes drawn by ConfusionMatrixDisplay.
cm = confusion_matrix(y_test, y_pred)

cmd = ConfusionMatrixDisplay(cm, display_labels=['NoRating', 'Less than 10K', 'Between 10K and 500K', 'More than 500K'])
fig, ax = plt.subplots(figsize=(12, 12));
# Title typo fixed ("Confustion" -> "Confusion").
plt.title("Confusion Matrix for GradientBoostingClassifier")
cmd.plot(ax=ax);

In [None]:
# Per-class precision/recall/F1 for the most recent predictions in y_pred.
target_names = [
    'NoRating',
    'Less than 10K',
    'Between 10K and 500K',
    'More than 500K',
]
cr = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=target_names,
)
print("Classification Report for GradientBoostingClassifier")
print(cr)

In [None]:
# Closing note to readers of the notebook.
print("Please Upvote and Comment if you like this work", end="\n")