In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor



In [None]:
# Read the raw Google Play Store export and preview its first rows.
PLAYSTORE_CSV = 'Archive/Google-Playstore.csv'
df_playstore = pd.read_csv(PLAYSTORE_CSV)
df_playstore.head()

In [None]:
df_playstore.columns

In [None]:
df_playstore.shape

In [None]:
df_playstore.describe()

In [None]:
# Share of missing entries per column, as a percentage rounded to two decimals.
missing_pct = df_playstore.isna().sum().div(df_playstore.shape[0]).mul(100).round(2)

print('Missing Values %')
print('-' * 25)
print(missing_pct)

In [None]:
# Absolute number of missing entries per column.
df_playstore.isna().sum()

###### From the counts above: except for Developer Website and Privacy Policy, we can directly drop every row that has a null value in any of the remaining columns.

#### Transforming the Developer Website and Privacy Policy Columns

In [None]:
# Encode the two URL columns as presence flags: 1 where a value is given, 0 where it is missing.
# notna().astype(int) produces a real integer column in one step; the earlier
# ".loc[...] = 1" followed by fillna(0) left both columns with dtype=object.
# NOTE: run once per load - after encoding no nulls remain, so re-running would flag every row as 1
# (the same was true of the original two-step version).
for url_col in ['Developer Website', 'Privacy Policy']:
    df_playstore[url_col] = df_playstore[url_col].notna().astype(int)

print("After Replacing non-null values:")
df_playstore.head()

In [None]:
# Re-check the per-column null counts after encoding the URL columns.
remaining_nulls = df_playstore.isna().sum()
print(remaining_nulls)

#### Now, lets drop all rows having null values.

In [None]:
# Keep only the rows that have no missing value in any column.
df_playstore = df_playstore.dropna(axis=0, how='any')

#### So we see that almost 10000 rows are dropped.

In [None]:
df_playstore.nunique()

In [None]:
# How many apps fall under each distinct Content Rating value.
content_rating_freq = df_playstore['Content Rating'].value_counts()
print(content_rating_freq)

In [None]:
# Show the three install-related columns side by side for comparison.
install_cols = ['Installs', 'Minimum Installs', 'Maximum Installs']
df_playstore.loc[:, install_cols].head()

#### So, we see that both the Minimum Installs and Installs are useless columns and we can drop both of them.

#### App Id, Currency, Developer Id, Developer Email also are very specific. So we can remove them too.

In [None]:
# Remove the columns that are not used further on; the trimmed frame is rebound to the same name.
columns_to_drop = [
    'Minimum Installs',
    'Installs',
    'App Id',
    'Currency',
    'Developer Id',
    'Developer Email',
    'Size',
]
df_playstore = df_playstore.drop(columns=columns_to_drop)

In [None]:
df_playstore.nunique()

In [None]:
df_playstore.head()

#### We should also convert True/False to 1/0 for better use in Modelling. Also, Rating Count can be converted to int.

In [None]:
# Cast the True/False flag columns to 1/0 integers.
flag_columns = ["Free", "Ad Supported", "In App Purchases", "Editors Choice"]
for flag_col in flag_columns:
    df_playstore[flag_col] = df_playstore[flag_col].astype(int)

In [None]:
# Rating Count is read as float; store it as an integer.
df_playstore = df_playstore.astype({"Rating Count": int})

In [None]:
df_playstore.info()

#### Resetting the index after dropping the rows of the Dataframe.

In [None]:
# Renumber the rows 0..n-1 and discard the old index left over from the row drops.
df_playstore = df_playstore.reset_index(drop=True)

In [None]:
df_playstore.tail()

In [None]:
df_playstore.head()

In [None]:
# Author's rationale: release and last-update dates should matter little for an app's rating,
# so both date columns are removed.
date_columns = ['Released', 'Last Updated']
df_playstore = df_playstore.drop(columns=date_columns)

In [None]:
# Inspect the distinct values first; the author judged them too varied to be usable.
min_android_values = df_playstore['Minimum Android'].unique()
print(min_android_values)

# Remove the column from the working frame.
df_playstore = df_playstore.drop(columns=['Minimum Android'])

In [None]:
df_playstore.head()

In [None]:
#No Duplicates
#df_playstore.drop_duplicates(inplace=True)

In [None]:
# Number of apps per category, most frequent first.
df_playstore['Category'].value_counts()

In [None]:
#df_playstore.to_csv('cleaned_data.csv')

In [None]:
df_playstore.head()

## Visualizations

In [None]:
# Bar chart with the number of apps in each category.
plt.figure(figsize=(12, 5))
sns.set(style="darkgrid")  # returns None, so there is nothing to keep from this call
p = sns.countplot(data=df_playstore, x='Category')
plt.setp(p.get_xticklabels(), rotation=90)  # turn the tick labels vertical
plt.title('App Category', size=20);

Here we can see that the largest number of apps belong to the Education, Music, and Entertainment categories.
From this we can infer that these three sectors are in demand.

In [None]:
# Distinct rating values present in the data.
df_playstore['Rating'].unique()

In [None]:
# How often each rating value occurs.
df_playstore['Rating'].value_counts()

In [None]:
# Summary statistics (count, mean, quartiles, ...) of the Rating column.
df_playstore['Rating'].describe()

#### The rating is on a scale of 1-5, with 1 being the minimum and 5 the maximum. The mean rating is 2.49 and the median rating is 3.6.

In [None]:
# Bar chart of how many apps carry each Content Rating.
content_rating_counts = df_playstore['Content Rating'].value_counts()

plt.figure(figsize=(8, 8))
plt.title('Content Rating distribution')
content_rating_counts.plot.bar()  # draws onto the figure's current axes


In [None]:
# Average rating per category as a two-column frame (Category, Rating).
d = df_playstore.groupby('Category', as_index=False)['Rating'].mean()

# Figure decoration is issued in the same order as before; the bars are drawn last.
plt.figure(figsize=(20, 8))
plt.title('Mean Rating per Category')
plt.grid()
plt.xlabel('Category')
plt.xticks(rotation=90)
plt.ylabel('Avg Rating')

plt.bar(d['Category'], d['Rating'])

#### Mean rating per category

In [None]:
# Pie chart of the Free flag: one slice per distinct value, labelled with that value.
dist = df_playstore['Free'].value_counts()

plt.figure(figsize=(10, 10))
plt.title('Dist model')
plt.pie(
    dist,
    labels=dist.index,
    autopct='%1.1f%%',  # print each slice's share with one decimal
    startangle=180,
);

#### So 95.5% of the apps are free to download.

In [None]:
# Frequency of each non-zero price; only the 50 most common prices are plotted.
plt.figure(figsize=(20, 5))
is_paid = df_playstore['Price'] > 0
price = df_playstore.loc[is_paid, 'Price'].value_counts()
price.head(50).plot.bar()

In [None]:
# Pairwise correlation between the numeric features.
# The frame still holds text columns (e.g. Category, Content Rating). Older pandas dropped them
# silently inside corr(); pandas >= 2.0 raises instead, so restrict to numeric/bool columns
# explicitly. The matrix is also computed once and reused rather than recomputed for display.
corr_df = df_playstore.select_dtypes(include=['number', 'bool']).corr()
corr_matrix = corr_df.to_numpy()  # the same matrix as a plain ndarray
corr_df

In [None]:
# Interactive scatter plot: Price on the x-axis, Rating on the y-axis.
fig1 = px.scatter(data_frame=df_playstore, x='Price', y='Rating')
fig1.show()

In [None]:
# Interactive box plot of the Rating distribution within each Category.
fig2 = px.box(data_frame=df_playstore, x='Category', y='Rating')
fig2.show()

In [None]:
# Interactive scatter plot: Maximum Installs on the x-axis, Rating Count on the y-axis.
# The column is passed as a plain string. A one-element list switches Plotly Express into
# wide-form mode, which labels the axis "value" and adds a "variable" legend.
fig3 = px.scatter(df_playstore, x='Maximum Installs', y='Rating Count')
fig3.show()

In [None]:
df_playstore['Editors Choice'].sum()

#### Only 735 apps are Editors' choice apps.

## Models

In [None]:
# One-hot encode the two categorical columns; drop_first omits one level per column.
categorical_cols = ['Category', 'Content Rating']
df_playstore_dummy = pd.get_dummies(
    df_playstore,
    columns=categorical_cols,
    drop_first=True,
)
print(df_playstore_dummy.shape)
df_playstore_dummy.head()

In [None]:
# Target is the Rating column; features are everything except the target and the app name.
TARGET_COL = 'Rating'
y_dummy = df_playstore_dummy[TARGET_COL]
x_dummy = df_playstore_dummy.drop(columns=[TARGET_COL, 'App Name'])
print(x_dummy.shape, y_dummy.shape)
x_dummy.head()

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_dummy,
    y_dummy,
    test_size=0.25,
    random_state=669,
)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

In [None]:
# Model training and evaluation
def dummy_model_building(model, x_tr=None, y_tr=None, x_te=None, y_te=None):
    """Fit ``model`` on a training split and print its train/test scores and test MSE.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)``, ``score(X, y)`` and ``predict(X)``. It is fitted
        in place, so the caller can inspect the trained model afterwards.
    x_tr, y_tr : optional
        Training features and target. Default: the notebook-level ``x_train`` / ``y_train``.
    x_te, y_te : optional
        Held-out features and target. Default: the notebook-level ``x_test`` / ``y_test``.

    Returns
    -------
    None
        All results are reported with ``print``.
    """
    # Resolve the defaults at call time so a re-run of the split cell is picked up. ``is None``
    # is required because the truth value of a DataFrame/Series is ambiguous.
    x_tr = x_train if x_tr is None else x_tr
    y_tr = y_train if y_tr is None else y_tr
    x_te = x_test if x_te is None else x_te
    y_te = y_test if y_te is None else y_te

    model.fit(x_tr, y_tr)
    print('trained')
    train_score = model.score(x_tr, y_tr)
    test_score = model.score(x_te, y_te)
    predict = model.predict(x_te)

    print('Train Score on Dummy : {}'.format(train_score))
    print('Test Score on Dummy : {}'.format(test_score))
    print(f'MSE : {mean_squared_error(y_te, predict)}')

    print('\n')

In [None]:
# DecisionTreeRegressor
# Seeded with the same value as the train/test split so that repeated runs report the same
# scores; without random_state the tree can differ from run to run.
dt = DecisionTreeRegressor(random_state=669)
dummy_model_building(dt)
print('\n')

# Author's notes: hyperparameter tuning with max_depth and min_leaf_sample_weight was tried,
# but the results remained similar. Seems a good model for now.

In [None]:
dt.get_depth()

In [None]:
# Linear Regression baseline
le = LinearRegression()
dummy_model_building(model=le)
print('\n')

# Author's verdict on this model: "Fail Model."

In [None]:
# Bagging Regressor
# Seeded with the same value as the train/test split: bagging resamples the training data,
# so without random_state the reported scores change on every run.
bc = BaggingRegressor(random_state=669)
dummy_model_building(bc)
print('\n')

In [None]:
# Gradient Boosting
# Seeded with the same value as the train/test split so that repeated runs report the same scores.
gc = GradientBoostingRegressor(random_state=669)
dummy_model_building(gc)
print('\n')

In [None]:
# AdaBoost
# Seeded with the same value as the train/test split so that repeated runs report the same scores.
ac = AdaBoostRegressor(random_state=669)
dummy_model_building(ac)
print('\n')