In [1]:
import pandas as pd
import numpy as np
import sklearn as skl
from sklearn.model_selection import train_test_split
  

In [2]:
# Read csv file into DataFrame
from operator import index


df = pd.read_csv("Resources/Google-Playstore.csv")
df.head()

Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,Price,...,Developer Website,Developer Email,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice,Scraped Time
0,Gakondo,com.ishakwe.gakondo,Adventure,0.0,0.0,10+,10.0,15,True,0.0,...,https://beniyizibyose.tk/#/,jean21101999@gmail.com,"Feb 26, 2020","Feb 26, 2020",Everyone,https://beniyizibyose.tk/projects/,False,False,False,2021-06-15 20:19:35
1,Ampere Battery Info,com.webserveis.batteryinfo,Tools,4.4,64.0,"5,000+",5000.0,7662,True,0.0,...,https://webserveis.netlify.app/,webserveis@gmail.com,"May 21, 2020","May 06, 2021",Everyone,https://dev4phones.wordpress.com/licencia-de-uso/,True,False,False,2021-06-15 20:19:35
2,Vibook,com.doantiepvien.crm,Productivity,0.0,0.0,50+,50.0,58,True,0.0,...,,vnacrewit@gmail.com,"Aug 9, 2019","Aug 19, 2019",Everyone,https://www.vietnamairlines.com/vn/en/terms-an...,False,False,False,2021-06-15 20:19:35
3,Smart City Trichy Public Service Vehicles 17UC...,cst.stJoseph.ug17ucs548,Communication,5.0,5.0,10+,10.0,19,True,0.0,...,http://www.climatesmarttech.com/,climatesmarttech2@gmail.com,"Sep 10, 2018","Oct 13, 2018",Everyone,,True,False,False,2021-06-15 20:19:35
4,GROW.me,com.horodyski.grower,Tools,0.0,0.0,100+,100.0,478,True,0.0,...,http://www.horodyski.com.pl,rmilekhorodyski@gmail.com,"Feb 21, 2020","Nov 12, 2018",Everyone,http://www.horodyski.com.pl,False,False,False,2021-06-15 20:19:35


In [3]:
# Drop unnessasery columns
ml_df = df.drop(columns=["App Name", "App Id", "Currency", "Minimum Android", "Developer Id", "Developer Website", "Developer Email", 
"Released", "Last Updated", "Privacy Policy", "Scraped Time"])
ml_df.head()

Unnamed: 0,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,Price,Size,Content Rating,Ad Supported,In App Purchases,Editors Choice
0,Adventure,0.0,0.0,10+,10.0,15,True,0.0,10M,Everyone,False,False,False
1,Tools,4.4,64.0,"5,000+",5000.0,7662,True,0.0,2.9M,Everyone,True,False,False
2,Productivity,0.0,0.0,50+,50.0,58,True,0.0,3.7M,Everyone,False,False,False
3,Communication,5.0,5.0,10+,10.0,19,True,0.0,1.8M,Everyone,True,False,False
4,Tools,0.0,0.0,100+,100.0,478,True,0.0,6.2M,Everyone,False,False,False


In [4]:
ml_df = ml_df[ml_df["Installs"].notna()]
ml_df = ml_df[ml_df["Size"].notna()]
ml_df = ml_df[ml_df["Rating"].notna()]
ml_df = ml_df[ml_df["Rating Count"].notna()]


In [5]:
ml_df.dtypes

Category             object
Rating              float64
Rating Count        float64
Installs             object
Minimum Installs    float64
Maximum Installs      int64
Free                   bool
Price               float64
Size                 object
Content Rating       object
Ad Supported           bool
In App Purchases       bool
Editors Choice         bool
dtype: object

In [6]:
ml_df = ml_df.drop_duplicates()
print(F"Duplicates: {ml_df.duplicated().sum()}")

Duplicates: 0


In [7]:
ml_df.isnull().sum()

Category            0
Rating              0
Rating Count        0
Installs            0
Minimum Installs    0
Maximum Installs    0
Free                0
Price               0
Size                0
Content Rating      0
Ad Supported        0
In App Purchases    0
Editors Choice      0
dtype: int64

In [8]:
# Replace the spaces in the column names with underscores
ml_df.columns = [c.replace(' ', '_') for c in ml_df.columns]

In [9]:
def value_to_float(x):

    x = str(x).strip().replace(',', '').replace('Varies with device','0')

    if 'M' in str(x):
        x = x.replace('M', '') 

    if 'k' in str(x):
        x = x.replace('k', '')
        x = float(x) * 0.0009765625 

    if 'G' in str(x):
        x = str(x).replace('G', '')
        x = float(x) * 1024

    return float(x)

ml_df["Size"] = ml_df["Size"].apply(value_to_float)
ml_df.head()

Unnamed: 0,Category,Rating,Rating_Count,Installs,Minimum_Installs,Maximum_Installs,Free,Price,Size,Content_Rating,Ad_Supported,In_App_Purchases,Editors_Choice
0,Adventure,0.0,0.0,10+,10.0,15,True,0.0,10.0,Everyone,False,False,False
1,Tools,4.4,64.0,"5,000+",5000.0,7662,True,0.0,2.9,Everyone,True,False,False
2,Productivity,0.0,0.0,50+,50.0,58,True,0.0,3.7,Everyone,False,False,False
3,Communication,5.0,5.0,10+,10.0,19,True,0.0,1.8,Everyone,True,False,False
4,Tools,0.0,0.0,100+,100.0,478,True,0.0,6.2,Everyone,False,False,False


In [10]:
ml_df["Size"].dtypes

dtype('float64')

In [11]:
def object_to_int(x):

    x = str(x).strip().replace('+', '')

    if '+' in str(x):
        x = x.replace('+', '') 
    if ',' in str(x):
        x = x.replace(',', '')
    
    return int(x)

ml_df["Installs"] = ml_df["Installs"].apply(object_to_int)
ml_df.head()



Unnamed: 0,Category,Rating,Rating_Count,Installs,Minimum_Installs,Maximum_Installs,Free,Price,Size,Content_Rating,Ad_Supported,In_App_Purchases,Editors_Choice
0,Adventure,0.0,0.0,10,10.0,15,True,0.0,10.0,Everyone,False,False,False
1,Tools,4.4,64.0,5000,5000.0,7662,True,0.0,2.9,Everyone,True,False,False
2,Productivity,0.0,0.0,50,50.0,58,True,0.0,3.7,Everyone,False,False,False
3,Communication,5.0,5.0,10,10.0,19,True,0.0,1.8,Everyone,True,False,False
4,Tools,0.0,0.0,100,100.0,478,True,0.0,6.2,Everyone,False,False,False


In [12]:
ml_df["Installs"].dtypes

dtype('int64')

In [13]:
# Print out the Category value counts
category_counts = ml_df.Category.value_counts()
category_counts

Education                  200806
Music & Audio              132719
Entertainment              127939
Tools                      126761
Books & Reference          106689
Business                   100654
Lifestyle                   97379
Personalization             84433
Productivity                67201
Health & Fitness            66527
Shopping                    60096
Finance                     59556
Travel & Local              58872
Food & Drink                54360
Puzzle                      47017
Arcade                      46391
Sports                      44699
Casual                      44379
Communication               44096
Social                      42236
News & Magazines            40949
Photography                 34391
Medical                     29137
Action                      26037
Maps & Navigation           24937
Simulation                  22664
Adventure                   22111
Educational                 20533
Art & Design                17749
Auto & Vehicle

In [14]:
# Determine which values to replace
replace_categories = list(category_counts[category_counts < 25000].index)

# Replace in DataFrame
for category in replace_categories:
    ml_df.Category = ml_df.Category.replace(category,"Other")


# Check to make sure binning was successful
ml_df.Category.value_counts()

Other                275056
Education            200806
Music & Audio        132719
Entertainment        127939
Tools                126761
Books & Reference    106689
Business             100654
Lifestyle             97379
Personalization       84433
Productivity          67201
Health & Fitness      66527
Shopping              60096
Finance               59556
Travel & Local        58872
Food & Drink          54360
Puzzle                47017
Arcade                46391
Sports                44699
Casual                44379
Communication         44096
Social                42236
News & Magazines      40949
Photography           34391
Medical               29137
Action                26037
Name: Category, dtype: int64

In [15]:
# Create the OneHotEncoder instance
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False)

# Fit the encoder and produce encoded DataFrame
encode_df = pd.DataFrame(enc.fit_transform(ml_df.Category.values.reshape(-1,1)))

# Rename encoded columns
encode_df.columns = enc.get_feature_names(['Category'])
encode_df.head()



Unnamed: 0,Category_Action,Category_Arcade,Category_Books & Reference,Category_Business,Category_Casual,Category_Communication,Category_Education,Category_Entertainment,Category_Finance,Category_Food & Drink,...,Category_Other,Category_Personalization,Category_Photography,Category_Productivity,Category_Puzzle,Category_Shopping,Category_Social,Category_Sports,Category_Tools,Category_Travel & Local
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [16]:
# Merge the two DataFrames together and drop the Country column
merge_ml_df = ml_df.merge(encode_df,left_index=True,right_index=True).drop("Category",1)
merge_ml_df.head()

  merge_ml_df = ml_df.merge(encode_df,left_index=True,right_index=True).drop("Category",1)


Unnamed: 0,Rating,Rating_Count,Installs,Minimum_Installs,Maximum_Installs,Free,Price,Size,Content_Rating,Ad_Supported,...,Category_Other,Category_Personalization,Category_Photography,Category_Productivity,Category_Puzzle,Category_Shopping,Category_Social,Category_Sports,Category_Tools,Category_Travel & Local
0,0.0,0.0,10,10.0,15,True,0.0,10.0,Everyone,False,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.4,64.0,5000,5000.0,7662,True,0.0,2.9,Everyone,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,50,50.0,58,True,0.0,3.7,Everyone,False,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.0,5.0,10,10.0,19,True,0.0,1.8,Everyone,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,100,100.0,478,True,0.0,6.2,Everyone,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [17]:
# Print out the Category value counts
content_ratiing_counts = merge_ml_df.Content_Rating.value_counts()
content_ratiing_counts

Everyone           1541893
Teen                155017
Mature 17+           50761
Everyone 10+         28297
Unrated                124
Adults only 18+        120
Name: Content_Rating, dtype: int64

In [18]:
merge_ml_df = pd.get_dummies(merge_ml_df, columns=["Content_Rating"])
merge_ml_df.head()

Unnamed: 0,Rating,Rating_Count,Installs,Minimum_Installs,Maximum_Installs,Free,Price,Size,Ad_Supported,In_App_Purchases,...,Category_Social,Category_Sports,Category_Tools,Category_Travel & Local,Content_Rating_Adults only 18+,Content_Rating_Everyone,Content_Rating_Everyone 10+,Content_Rating_Mature 17+,Content_Rating_Teen,Content_Rating_Unrated
0,0.0,0.0,10,10.0,15,True,0.0,10.0,False,False,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0
1,4.4,64.0,5000,5000.0,7662,True,0.0,2.9,True,False,...,0.0,0.0,1.0,0.0,0,1,0,0,0,0
2,0.0,0.0,50,50.0,58,True,0.0,3.7,False,False,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0
3,5.0,5.0,10,10.0,19,True,0.0,1.8,True,False,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0
4,0.0,0.0,100,100.0,478,True,0.0,6.2,False,False,...,0.0,0.0,1.0,0.0,0,1,0,0,0,0


In [19]:
merge_ml_df = pd.get_dummies(merge_ml_df, columns=["Free"])
merge_ml_df.head()

Unnamed: 0,Rating,Rating_Count,Installs,Minimum_Installs,Maximum_Installs,Price,Size,Ad_Supported,In_App_Purchases,Editors_Choice,...,Category_Tools,Category_Travel & Local,Content_Rating_Adults only 18+,Content_Rating_Everyone,Content_Rating_Everyone 10+,Content_Rating_Mature 17+,Content_Rating_Teen,Content_Rating_Unrated,Free_False,Free_True
0,0.0,0.0,10,10.0,15,0.0,10.0,False,False,False,...,0.0,0.0,0,1,0,0,0,0,0,1
1,4.4,64.0,5000,5000.0,7662,0.0,2.9,True,False,False,...,1.0,0.0,0,1,0,0,0,0,0,1
2,0.0,0.0,50,50.0,58,0.0,3.7,False,False,False,...,0.0,0.0,0,1,0,0,0,0,0,1
3,5.0,5.0,10,10.0,19,0.0,1.8,True,False,False,...,0.0,0.0,0,1,0,0,0,0,0,1
4,0.0,0.0,100,100.0,478,0.0,6.2,False,False,False,...,1.0,0.0,0,1,0,0,0,0,0,1


In [20]:
merge_ml_df = pd.get_dummies(merge_ml_df, columns=["Ad_Supported"])
merge_ml_df.head()

Unnamed: 0,Rating,Rating_Count,Installs,Minimum_Installs,Maximum_Installs,Price,Size,In_App_Purchases,Editors_Choice,Category_Action,...,Content_Rating_Adults only 18+,Content_Rating_Everyone,Content_Rating_Everyone 10+,Content_Rating_Mature 17+,Content_Rating_Teen,Content_Rating_Unrated,Free_False,Free_True,Ad_Supported_False,Ad_Supported_True
0,0.0,0.0,10,10.0,15,0.0,10.0,False,False,0.0,...,0,1,0,0,0,0,0,1,1,0
1,4.4,64.0,5000,5000.0,7662,0.0,2.9,False,False,0.0,...,0,1,0,0,0,0,0,1,0,1
2,0.0,0.0,50,50.0,58,0.0,3.7,False,False,0.0,...,0,1,0,0,0,0,0,1,1,0
3,5.0,5.0,10,10.0,19,0.0,1.8,False,False,0.0,...,0,1,0,0,0,0,0,1,0,1
4,0.0,0.0,100,100.0,478,0.0,6.2,False,False,0.0,...,0,1,0,0,0,0,0,1,1,0


In [21]:
merge_ml_df = pd.get_dummies(merge_ml_df, columns=["In_App_Purchases"])
merge_ml_df.head()

Unnamed: 0,Rating,Rating_Count,Installs,Minimum_Installs,Maximum_Installs,Price,Size,Editors_Choice,Category_Action,Category_Arcade,...,Content_Rating_Everyone 10+,Content_Rating_Mature 17+,Content_Rating_Teen,Content_Rating_Unrated,Free_False,Free_True,Ad_Supported_False,Ad_Supported_True,In_App_Purchases_False,In_App_Purchases_True
0,0.0,0.0,10,10.0,15,0.0,10.0,False,0.0,0.0,...,0,0,0,0,0,1,1,0,1,0
1,4.4,64.0,5000,5000.0,7662,0.0,2.9,False,0.0,0.0,...,0,0,0,0,0,1,0,1,1,0
2,0.0,0.0,50,50.0,58,0.0,3.7,False,0.0,0.0,...,0,0,0,0,0,1,1,0,1,0
3,5.0,5.0,10,10.0,19,0.0,1.8,False,0.0,0.0,...,0,0,0,0,0,1,0,1,1,0
4,0.0,0.0,100,100.0,478,0.0,6.2,False,0.0,0.0,...,0,0,0,0,0,1,1,0,1,0


In [22]:
merge_ml_df = pd.get_dummies(merge_ml_df, columns=["Editors_Choice"])
merge_ml_df.head()

Unnamed: 0,Rating,Rating_Count,Installs,Minimum_Installs,Maximum_Installs,Price,Size,Category_Action,Category_Arcade,Category_Books & Reference,...,Content_Rating_Teen,Content_Rating_Unrated,Free_False,Free_True,Ad_Supported_False,Ad_Supported_True,In_App_Purchases_False,In_App_Purchases_True,Editors_Choice_False,Editors_Choice_True
0,0.0,0.0,10,10.0,15,0.0,10.0,0.0,0.0,0.0,...,0,0,0,1,1,0,1,0,1,0
1,4.4,64.0,5000,5000.0,7662,0.0,2.9,0.0,0.0,0.0,...,0,0,0,1,0,1,1,0,1,0
2,0.0,0.0,50,50.0,58,0.0,3.7,0.0,0.0,0.0,...,0,0,0,1,1,0,1,0,1,0
3,5.0,5.0,10,10.0,19,0.0,1.8,0.0,0.0,0.0,...,0,0,0,1,0,1,1,0,1,0
4,0.0,0.0,100,100.0,478,0.0,6.2,0.0,0.0,0.0,...,0,0,0,1,1,0,1,0,1,0


In [23]:
# Define the features set.
X = merge_ml_df.copy()
X = X.drop("Maximum_Installs", axis=1)
X.head()

Unnamed: 0,Rating,Rating_Count,Installs,Minimum_Installs,Price,Size,Category_Action,Category_Arcade,Category_Books & Reference,Category_Business,...,Content_Rating_Teen,Content_Rating_Unrated,Free_False,Free_True,Ad_Supported_False,Ad_Supported_True,In_App_Purchases_False,In_App_Purchases_True,Editors_Choice_False,Editors_Choice_True
0,0.0,0.0,10,10.0,0.0,10.0,0.0,0.0,0.0,0.0,...,0,0,0,1,1,0,1,0,1,0
1,4.4,64.0,5000,5000.0,0.0,2.9,0.0,0.0,0.0,0.0,...,0,0,0,1,0,1,1,0,1,0
2,0.0,0.0,50,50.0,0.0,3.7,0.0,0.0,0.0,0.0,...,0,0,0,1,1,0,1,0,1,0
3,5.0,5.0,10,10.0,0.0,1.8,0.0,0.0,0.0,0.0,...,0,0,0,1,0,1,1,0,1,0
4,0.0,0.0,100,100.0,0.0,6.2,0.0,0.0,0.0,0.0,...,0,0,0,1,1,0,1,0,1,0


In [24]:
X.dtypes

Rating                            float64
Rating_Count                      float64
Installs                            int64
Minimum_Installs                  float64
Price                             float64
Size                              float64
Category_Action                   float64
Category_Arcade                   float64
Category_Books & Reference        float64
Category_Business                 float64
Category_Casual                   float64
Category_Communication            float64
Category_Education                float64
Category_Entertainment            float64
Category_Finance                  float64
Category_Food & Drink             float64
Category_Health & Fitness         float64
Category_Lifestyle                float64
Category_Medical                  float64
Category_Music & Audio            float64
Category_News & Magazines         float64
Category_Other                    float64
Category_Personalization          float64
Category_Photography              

In [25]:
# Define the target set.
y = merge_ml_df["Maximum_Installs"].ravel()
y[:5]

array([  15, 7662,   58,   19,  478])

In [26]:
# Splitting the data into training and testing set and making predictions
x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.3, random_state = 0)
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(x_train, y_train)
pred = lm.predict(x_test)

In [27]:
#pred.reshape(605514,1)

ValueError: cannot reshape array of size 532864 into shape (605514,1)

In [28]:
# Display the list of columns
for col in X.columns:
    print(col)

Rating
Rating_Count
Installs
Minimum_Installs
Price
Size
Category_Action
Category_Arcade
Category_Books & Reference
Category_Business
Category_Casual
Category_Communication
Category_Education
Category_Entertainment
Category_Finance
Category_Food & Drink
Category_Health & Fitness
Category_Lifestyle
Category_Medical
Category_Music & Audio
Category_News & Magazines
Category_Other
Category_Personalization
Category_Photography
Category_Productivity
Category_Puzzle
Category_Shopping
Category_Social
Category_Sports
Category_Tools
Category_Travel & Local
Content_Rating_Adults only 18+
Content_Rating_Everyone
Content_Rating_Everyone 10+
Content_Rating_Mature 17+
Content_Rating_Teen
Content_Rating_Unrated
Free_False
Free_True
Ad_Supported_False
Ad_Supported_True
In_App_Purchases_False
In_App_Purchases_True
Editors_Choice_False
Editors_Choice_True


In [31]:
import statsmodels.regression.linear_model as sm
# add a column of ones as integer data type
X = np.append(arr = np.ones((1776212, 1)).astype(int), 
              values = X, axis = 1)
# choose a Significance level usually 0.05, if p>0.05
#  for the highest values parameter, remove that value
x_opt = X[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]]
ols = sm.OLS(endog = y, exog = x_opt).fit()
ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.911
Model:,OLS,Adj. R-squared:,0.911
Method:,Least Squares,F-statistic:,476300.0
Date:,"Mon, 07 Nov 2022",Prob (F-statistic):,0.0
Time:,17:02:31,Log-Likelihood:,-30492000.0
No. Observations:,1776212,AIC:,60980000.0
Df Residuals:,1776173,BIC:,60980000.0
Df Model:,38,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.606e+05,1.1e+05,3.280,0.001,1.45e+05,5.76e+05
x1,9008.2690,2532.781,3.557,0.000,4044.107,1.4e+04
x2,6.0989,0.028,214.208,0.000,6.043,6.155
x3,0.7718,0.001,1420.676,0.000,0.771,0.773
x4,0.7686,0.001,1421.875,0.000,0.768,0.770
x5,47.2021,1906.462,0.025,0.980,-3689.398,3783.802
x6,-217.2179,218.618,-0.994,0.320,-645.701,211.265
x7,4.388e+04,4.44e+04,0.989,0.323,-4.31e+04,1.31e+05
x8,-5577.7714,3.37e+04,-0.166,0.868,-7.15e+04,6.04e+04

0,1,2,3
Omnibus:,7778622.43,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,853736392924804.8
Skew:,135.399,Prob(JB):,0.0
Kurtosis:,107406.575,Cond. No.,2.42e+16
