In [None]:
import pandas as pd
import numpy as np
import requests, io # for HTTP requests and I/O commands
import matplotlib.pyplot as plt # for data visualization %matplotlib inline

# scikit-learn modules
from sklearn.model_selection import train_test_split # for splitting the data
from sklearn.metrics import mean_squared_error # for calculating the cost function
from sklearn.ensemble import RandomForestRegressor # for building the model
from sklearn.linear_model import LinearRegression


: 

In [None]:
# Read the Google Play Store CSV into a DataFrame and preview the first rows.
# NOTE(review): `index` is not referenced in any visible cell; this looks like an
# accidental editor auto-import — confirm before removing.
from operator import index


df = pd.read_csv("Resources/Google-Playstore.csv")
df.head()

: 

In [None]:
# Drop the columns that are not used as model inputs or as the target.
columns_to_drop = [
    "App Name", "App Id", "Currency", "Minimum Android", "Developer Id",
    "Developer Website", "Developer Email", "Released", "Last Updated",
    "Privacy Policy", "Scraped Time", "Minimum Installs", "Category", "Installs",
]
ml_df = df.drop(columns=columns_to_drop)
ml_df.head()

: 

In [None]:
# Count the missing values in each remaining column.
ml_df.isna().sum()

: 

In [None]:
# Drop every row that is missing Size, Rating or Rating Count.
ml_df = ml_df.dropna(subset=["Size", "Rating", "Rating Count"])


: 

In [None]:
# Remove exact duplicate rows, then report how many duplicates remain.
ml_df = ml_df.drop_duplicates()
remaining_duplicates = ml_df.duplicated().sum()
print(f"Duplicates: {remaining_duplicates}")

: 

In [None]:
# Inspect the data type of every column before any conversion.
ml_df.dtypes

: 

In [None]:
# Replace the spaces in the column names with underscores (literal match, not a regex).
ml_df.columns = ml_df.columns.str.replace(' ', '_', regex=False)

: 

In [None]:
# Strip the unit suffix (k, M, G) and convert the Size column from string to float (megabytes)

def value_to_float(x):
    """Convert an app-size value such as "12M", "512k" or "1.1G" to megabytes.

    Thousands separators (",") are removed, the literal "Varies with device"
    is mapped to 0, and a trailing unit letter (k, M or G, in either case) is
    turned into the matching megabyte multiplier. Values without a unit
    suffix are converted with ``float`` unchanged.

    Parameters
    ----------
    x : str or number
        Raw size value; it is passed through ``str`` before parsing.

    Returns
    -------
    float
        The size expressed in megabytes.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed by ``float``.
    """
    text = str(x).strip().replace(',', '').replace('Varies with device', '0')

    # Multiplier that converts each unit suffix to megabytes (1 MB = 1024 kB).
    unit_to_mb = {'k': 1 / 1024, 'm': 1.0, 'g': 1024.0}

    # Slicing (rather than indexing) keeps an empty string from raising IndexError;
    # it then falls through to float(), which reports the bad value.
    suffix = text[-1:].lower()
    if suffix in unit_to_mb:
        return float(text[:-1]) * unit_to_mb[suffix]

    return float(text)

# Convert every Size entry with value_to_float and preview the result.
size_converted = ml_df["Size"].map(value_to_float)
ml_df["Size"] = size_converted
ml_df.head()

: 

In [None]:
# Confirm that the Size column is now stored as float.
ml_df["Size"].dtype

: 

In [None]:
# Convert every boolean column to 0/1 integers, printing the dtype of each
# column that gets converted.
bool_columns = [col for col in ml_df.columns if ml_df[col].dtype == bool]
for col in bool_columns:
    print(ml_df[col].dtype)
    ml_df[col] = ml_df[col].astype(int)

: 

In [None]:
# Check the data types again after the transformations.
ml_df.dtypes

: 

In [None]:
# Count how often each distinct value occurs in the Free column.
ml_df["Free"].value_counts()

: 

In [None]:
# One-hot encode the categorical Content_Rating column.
categorical_columns = ["Content_Rating"]
merge_ml_df = pd.get_dummies(data=ml_df, columns=categorical_columns)
merge_ml_df.head()

: 

In [None]:
# Define the feature set: every encoded column except the target.
X = merge_ml_df.drop(columns=["Maximum_Installs"])
X.head()

: 

In [None]:
# Define the target as a 1-D NumPy array.
# Series.ravel() is deprecated since pandas 2.2; to_numpy() is the documented replacement.
y = merge_ml_df["Maximum_Installs"].to_numpy()
y[:5]

: 

In [None]:
# Work on a reproducible random sample of at most SAMPLE_SIZE rows so that
# training does not crash the kernel, then split that sample 80/20 into
# training and testing sets.
SAMPLE_SIZE = 10_000
rng = np.random.default_rng(0)  # fixed seed keeps the sample identical across runs
sample_idx = rng.choice(len(X), size=min(SAMPLE_SIZE, len(X)), replace=False)

# Positional indexing keeps the features and the target aligned row by row.
X_new = X.iloc[sample_idx]
y_new = np.asarray(y)[sample_idx]

x_train, x_test, y_train, y_test = train_test_split(X_new, y_new, test_size=0.2, random_state=0)

: 

In [None]:
# Build the estimators that will be trained, keyed by a display name.
random_forest = RandomForestRegressor(n_estimators=300, random_state=0)
linear_regression = LinearRegression()

all_ml = {
    "RandomFores_300": random_forest,
    "LinearRegression": linear_regression,
}

# List that will collect one result record per trained model.
all_res = []

: 

In [None]:
# Train every model in `all_ml` and record its score (R^2 for regressors)
# on the training and testing sets.
# The results list is rebuilt here so that re-running this cell does not
# append duplicate rows.
all_res = []
for x, model in all_ml.items():
    model.fit(x_train, y_train)
    all_res.append({
        "name": x,
        "train_score": model.score(x_train, y_train),
        "test_score": model.score(x_test, y_test),
    })

: 

In [None]:
# Collect the per-model result records into a DataFrame and display it.
res_df = pd.DataFrame(all_res)
res_df

: 

In [None]:
# Report the test-set RMSE of every fitted model, labelled by name, instead of
# only the estimator that a previous loop happened to leave behind.
for model_name, fitted_model in all_ml.items():
    # Predicting the target values of the test set
    y_pred = fitted_model.predict(x_test)

    # RMSE (Root Mean Square Error), rounded to three decimals
    rmse = float(format(np.sqrt(mean_squared_error(y_test, y_pred)), '.3f'))
    print(f"\nRMSE ({model_name}): ", rmse)


: 

In [None]:


# Plot the feature importances as a horizontal bar chart, smallest first.
# LinearRegression exposes no `feature_importances_`, so select the fitted
# estimator in `all_ml` that does.
importance_model = next(
    est for est in all_ml.values() if hasattr(est, "feature_importances_")
)

features = sorted(
    zip(X_new.columns, importance_model.feature_importances_),
    key=lambda pair: pair[1],
)
cols = [f[0] for f in features]
width = [f[1] for f in features]

fig, ax = plt.subplots()

fig.set_size_inches(10, 20)
ax.margins(y=0.001)

ax.barh(y=cols, width=width)
ax.set_xlabel("Importance")
ax.set_title(f"Feature importances ({type(importance_model).__name__})")

plt.show()


: 