In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [22]:
# Load the outlier-handled Play Store dataset and preview the first rows.
# A forward-slash relative path is used because it resolves on Windows, Linux
# and macOS alike, whereas a backslash separator only works on Windows.
df = pd.read_csv("data/outlier_handled_playstore_data.csv")
df.head()

Unnamed: 0,Rating,Reviews,Size,Installs,Price,App,Category,Type,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,4.1,159.0,19456.0,10000.0,0.0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,Free,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up
1,3.9,967.0,14336.0,500000.0,0.0,Coloring book moana,ART_AND_DESIGN,Free,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up
2,4.7,87510.0,8908.8,5000000.0,0.0,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,Free,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up
3,4.5,136881.75,25600.0,12498500.0,0.0,Sketch - Draw & Paint,ART_AND_DESIGN,Free,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up
4,4.3,967.0,2867.2,100000.0,0.0,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,Free,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up


### **Drop the columns that are not useful for the regression model**

In [23]:
# Show the column labels currently present in the frame.
df.columns

Index(['Rating', 'Reviews', 'Size', 'Installs', 'Price', 'App', 'Category',
       'Type', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [24]:
# Remove the app name, version and last-updated columns from the frame.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution no longer raises KeyError for
# columns that have already been dropped.
df = df.drop(
    columns=['Current Ver', 'Android Ver', 'App', 'Last Updated'],
    errors="ignore",
)

### **Encoding categorical features**

In [25]:
# One-hot encode the two low-cardinality categorical columns as 0/1 integers.
# drop_first=True omits the first level of each column so the remaining
# indicator columns are not perfectly collinear.
categorical_cols = ['Type', 'Content Rating']
df = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
    dtype="int",
)

In [26]:
# Preview the first five rows after encoding.
df.head(n=5)

Unnamed: 0,Rating,Reviews,Size,Installs,Price,Category,Genres,Type_Paid,Content Rating_Everyone,Content Rating_Everyone 10+,Content Rating_Mature 17+,Content Rating_Teen,Content Rating_Unrated
0,4.1,159.0,19456.0,10000.0,0.0,ART_AND_DESIGN,Art & Design,0,1,0,0,0,0
1,3.9,967.0,14336.0,500000.0,0.0,ART_AND_DESIGN,Art & Design;Pretend Play,0,1,0,0,0,0
2,4.7,87510.0,8908.8,5000000.0,0.0,ART_AND_DESIGN,Art & Design,0,1,0,0,0,0
3,4.5,136881.75,25600.0,12498500.0,0.0,ART_AND_DESIGN,Art & Design,0,0,0,0,1,0
4,4.3,967.0,2867.2,100000.0,0.0,ART_AND_DESIGN,Art & Design;Creativity,0,1,0,0,0,0


In [27]:
# List the distinct values taken by the encoded Type_Paid indicator.
type_paid_flags = df['Type_Paid']
type_paid_flags.unique()

array([0, 1])

### **Separating the numerical features into X**

In [29]:
# Feature matrix: everything except the target (Rating) and the two text
# columns that were not encoded (Category, Genres).
non_feature_cols = ['Category', 'Rating', 'Genres']
X = df.drop(columns=non_feature_cols)
X

Unnamed: 0,Reviews,Size,Installs,Price,Type_Paid,Content Rating_Everyone,Content Rating_Everyone 10+,Content Rating_Mature 17+,Content Rating_Teen,Content Rating_Unrated
0,159.00,19456.0,10000.0,0.0,0,1,0,0,0,0
1,967.00,14336.0,500000.0,0.0,0,1,0,0,0,0
2,87510.00,8908.8,5000000.0,0.0,0,1,0,0,0,0
3,136881.75,25600.0,12498500.0,0.0,0,0,0,0,1,0
4,967.00,2867.2,100000.0,0.0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
10835,38.00,54272.0,5000.0,0.0,0,1,0,0,0,0
10836,4.00,3686.4,100.0,0.0,0,1,0,0,0,0
10837,3.00,9728.0,1000.0,0.0,0,1,0,0,0,0
10838,114.00,4096.0,1000.0,0.0,0,0,0,1,0,0


In [30]:
# Target vector: the Rating column that the regressor will be fitted to.
target_col = 'Rating'
y = df[target_col]
y

0        4.1
1        3.9
2        4.7
3        4.5
4        4.3
        ... 
10835    4.5
10836    5.0
10837    4.6
10838    4.5
10839    4.5
Name: Rating, Length: 10840, dtype: float64

### **Split X & y into training & test data**

In [31]:
import sklearn
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. A fixed random_state makes the
# partition reproducible: without it every run shuffles the rows differently,
# so the fitted model and any reported metric change between executions even
# though the regressor itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((7588, 10), (3252, 10), (7588,), (3252,))

### **Scaling the data (fitting the parameters on the training set & transforming both sets)**

In [33]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits so no test-set statistics are used in fitting.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### **Random Forest Regressor**

In [34]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 10 trees; random_state pins the model's own randomness.
rf_params = {"n_estimators": 10, "random_state": 42}
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X=X_train, y=y_train)

In [35]:
# Predict ratings for the held-out test split with the fitted random forest.
y_pred_rf = rf_model.predict(X_test)