# Make a model to predict the app rating, with other information about the app provided.

In [None]:
# import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load the data file using pandas. 

In [None]:
df = pd.read_csv('googleplaystore.csv')

In [None]:
df

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.describe()

 # 2. Check for null values in the data. Get the number of null values for each column.


In [None]:
df.info()

In [None]:
df.isnull()

In [None]:
df.isnull().sum()

In [None]:
df.isnull().sum().sum()

# 3. Drop records with nulls in any of the columns.

In [None]:
df = df.dropna()

In [None]:
df.isnull().sum()

# 4. Variables seem to have incorrect type and inconsistent formatting. You need to fix them: 

In [None]:
df.info()

# 4.1 Size column has sizes in Kb as well as Mb. To analyze, you’ll need to convert these to numeric.

# Extract the numeric value from the column

# Multiply the value by 1,000, if size is mentioned in Mb

In [None]:
def convert_to_float(x):
    """Convert a Play Store size label to a number of kilobytes.

    Parameters
    ----------
    x : str, number, or None
        A size label such as ``'201k'`` or ``'19M'``, or a value that is
        already numeric.

    Returns
    -------
    number
        * numeric input (including NumPy scalars) is returned unchanged;
        * ``'<n>k'`` -> ``n``; a bare ``'k'`` -> ``1.0``;
        * ``'<n>M'`` -> ``n * 1000``; a bare ``'M'`` -> ``1000.0``;
        * any other string containing ``'e'`` (e.g. ``'Varies with device'``)
          -> ``0.0``;
        * ``None`` or any unrecognised string -> ``NaN``.
    """
    import numbers

    # isinstance + numbers.Real also accepts NumPy scalars (np.int64,
    # np.float64), which an exact `type(x) == int` comparison rejects.
    if isinstance(x, numbers.Real):
        return x
    if x is None:
        return float('nan')
    if 'k' in x:
        return float(x.replace('k', '')) if len(x) > 1 else 1.0
    if 'M' in x:
        # Sizes given in megabytes are scaled to kilobytes.
        return float(x.replace('M', '')) * 1000 if len(x) > 1 else 1000.0
    if 'e' in x:
        # Labels without a usable number, such as 'Varies with device'.
        return 0.0
    # Make the "could not parse" outcome explicit instead of an implicit None.
    return float('nan')

df['Size'] = df['Size'].apply(convert_to_float)

In [None]:
df.Size.describe()

# 4.2 Reviews is a numeric field that is loaded as a string field. Convert it to numeric (int/float).

In [None]:
df['Reviews'] = df['Reviews'].astype(int)

In [None]:
df['Last Updated'] = df['Last Updated'].astype('datetime64[ns]')

In [None]:
# Check the Size column for inconsistent values by looking at the last
# character of each entry. Size may already be numeric at this point, so cast
# to str first: the .str accessor raises AttributeError on a float column.
# After a successful conversion every entry ends in a digit; a trailing
# letter (e.g. 'M', 'k', or 'n' from 'nan') marks a value that did not convert.
df['fal'] = df['Size'].astype(str).str.strip().str[-1]
df['fal'].unique()

In [None]:
df = df.drop("fal",axis=1)

In [None]:
df.tail(5)

# 4.3 Installs field is currently stored as string and has values like 1,000,000+. 

# Treat 1,000,000+ as 1,000,000

# remove ‘+’, ‘,’ from the field, convert it to integer

In [None]:
df['Installs'].value_counts()

In [None]:
def rep(value):
    """Turn an install-count label such as '1,000,000+' into an int."""
    # Strip the thousands separators and the trailing plus sign in one pass.
    digits_only = value.translate(str.maketrans("", "", ",+"))
    return int(digits_only)
df.Installs = df.Installs.map(rep)
df

In [None]:
df.head()

In [None]:
df.Installs.describe()

# 4.4 Price field is a string and has $ symbol. Remove ‘$’ sign, and convert it to numeric.

In [None]:
df.Price.describe()

In [None]:
df['Price'] = df['Price'].str.replace('$','', regex=False)
df['Price'] = df['Price'].astype(float)

In [None]:
df.head()

# 5  Sanity checks:

# 5.1 Average rating should be between 1 and 5 as only these values are allowed on the play store. Drop the rows that have a value outside this range.

In [None]:
df.Rating.describe()

In [None]:
# A rating is invalid when it is above 5 OR below 1. The two conditions can
# never hold at the same time, so they must be combined with `|`, not `&`.
df1 = df[(df['Rating'] > 5.0) | (df['Rating'] < 1.0)].index
df.drop(df1, inplace = True)
df.head(10)

# 5.2 Reviews should not be more than installs as only those who installed can review the app. If there are any such records, drop them.

In [None]:
# Show the records where the number of reviews exceeds the number of installs
# (the check is on Reviews, not Rating).
df[df.Reviews > df.Installs]

In [None]:
df = df[df.Reviews <= df.Installs].copy()

In [None]:
df.shape

# 5.3 For free apps (type = “Free”), the price should not be >0. Drop any such rows.

In [None]:
# Flag rows that are marked "Free" but carry a non-zero price, then keep
# everything else. The mask is negated so that the valid rows are the ones
# retained in df.
free_with_price = (df['Type'] == "Free") & (df['Price'] > 0)
df = df[~free_with_price].copy()

In [None]:
df.head(10)

# 5.5 Performing univariate analysis: 



In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
color = sns.color_palette()
import warnings
warnings.filterwarnings("ignore")

In [None]:
#Boxplot for Price
sns.boxplot(df.Price)
plt.show()

In [None]:
df.boxplot(column = ['Price'])


In [None]:
# indeed there are some outliers in the Price column,i.e., there are some apps whose price is more than usual apps on the Googleplaystore

In [None]:
# Remove outliers
# easy way to remove outliers
def outliers(df, col):
    """Return the rows of `df` whose `col` value lies within the IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, computed from `col`.
    Rows strictly below the lower fence or strictly above the upper fence are
    removed; every other row (including rows where `col` is NaN) is kept.
    """
    values = df[col]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread

    is_outlier = (values < lower_fence) | (values > upper_fence)
    return df.loc[~is_outlier]

df = df.loc[df["Price"] < 200 ,]

In [None]:
# Boxplot for Reviews
df.boxplot(column = ['Reviews'])

In [None]:
# Indeed there are some apps that have very high number of Reviews

In [None]:
# Remove outliers
df = outliers(df,"Reviews")

In [None]:
#Histogram for Rating
plt.hist(df["Rating"])

In [None]:
# It's left skewed (negatively skewed): most apps have high ratings, with a tail of apps rated lower than usual

In [None]:
#Histogram for Size
plt.hist(df["Size"])

In [None]:
# It's right skewed (positively skewed),

# Bivariate analysis (Scatter plots)


# Bivariate analysis: Let’s look at how the available predictors relate to the variable of interest, i.e., our target variable rating. Make scatter plots (for numeric features) and box plots (for character features) to assess the relations between rating and the other features.

In [None]:
#1) Scatter plot/jointplot for Rating Vs. Price
sns.scatterplot(x = 'Rating', y = 'Price',data=df)

In [None]:
sns.jointplot(x= 'Rating',y= 'Price',data= df)

# Both the plots show a positive linear relationship; as the price of an app increases its rating also increases. That states the paid apps have the highest of Ratings

In [None]:
#2) Scatterplot/jointplot for Rating Vs. Size
sns.scatterplot(x= 'Rating',y= 'Size', data= df)

In [None]:
sns.jointplot(x= 'Rating', y= 'Size', data= df)

In [None]:
# The plots show a positive linear relationship; as the Size increases, the Rating increases. This suggests that heavier apps are rated better

In [None]:
#3) Scatterplot for Ratings Vs. Reviews
sns.scatterplot(x= 'Rating',y= 'Reviews', data= df)

In [None]:
# The plot shows a positive linear relationship between Ratings and Reviews. More reviews mean better ratings indeed

In [None]:
#4) Boxplot for Ratings Vs. Content Rating
sns.set(rc={'figure.figsize':(14,8)})
sns.boxplot(x= 'Rating', y= 'Content Rating', data = df)

In [None]:
# The above plot shows that apps for Everyone are the worst rated, as this group contains the highest number of outliers, followed by apps for Mature 17+, Everyone 10+ and Teen. The category Adults only 18+ is rated better and is the most liked type

In [None]:
#5) Boxplot for Ratings Vs. Category
sns.set(rc={'figure.figsize':(18,12)})
sns.boxplot(x= 'Rating', y = 'Category', data= df)

In [None]:
# Data Processing

In [None]:
p1 = df.copy()

In [None]:
# The Reviews and Installs columns still have some relatively high values. Before building the linear regression model we need to reduce the skew, so these columns need a log transformation

In [None]:
# Log transformation
# log1p(x) = log(1 + x) is used instead of log(x) so that a count of 0 maps
# to 0 rather than -inf, which the regression model cannot be fitted on.
import numpy as np
p1["Reviews"] = np.log1p(p1["Reviews"])
p1["Installs"] = np.log1p(p1["Installs"])

In [None]:
# delete unnecessary columns
p1.drop(["App","Last Updated","Current Ver","Android Ver","Type"],axis=1,inplace=True)

In [None]:
p1.head(2)

In [None]:
# As the model does not understand categorical variables, these need to be converted to numerical ones

# Dummy Encoding is one way to convert these columns into numerical

In [None]:
#3) create a copy of dataframe
# .copy() gives p2 its own data; a plain `p2 = p1` would only bind a second
# name to the same DataFrame, so in-place changes to p2 would also change p1.
p2 = p1.copy()
p2.head(5)

In [None]:
#get unique values in column category
p2['Category'].unique()

In [None]:
# Dummy-encode one categorical column: build the indicator columns from it,
# remove the original column, and append the indicators to the remaining data.
def generate_dummies(df, col):
    """Return a copy of `df` with column `col` replaced by dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is not modified.
    col : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        All columns of `df` except `col`, followed by one indicator column
        per distinct value of `col`, named ``"<col>_<value>"``.
    """
    # Encode from a categorical view of the column without assigning it back,
    # so the caller's DataFrame keeps its original column and dtype.
    dummies = pd.get_dummies(df[col].astype("category"), prefix=col)

    # drop() returns a new frame, leaving the input untouched.
    remaining = df.drop(columns=col)
    return pd.concat([remaining, dummies], axis=1)

p2 = generate_dummies(p2, 'Category')


In [None]:
#get unique values in Column Genres
p2['Genres'].unique()

In [None]:
# There are too many categories under Genres. Hence, we will try to reduce some genres which have very few samples under them and put them under one new common category i.e. "Other"

In [None]:
# Count each genre once (the counts do not change while we scan them) and
# collect the genres that have fewer than 20 samples.
genre_counts = p2['Genres'].value_counts()
lists = genre_counts[genre_counts < 20].index.tolist()

# Replace every rare genre with the common label "Other".
p2['Genres'] = p2['Genres'].mask(p2['Genres'].isin(lists), 'Other')
p2['Genres'].unique()

In [None]:
p2 = generate_dummies(p2, 'Genres')

In [None]:
p2.head()

In [None]:
#getting the unique values in Column "Content Rating"
p2["Content Rating"].unique()

In [None]:
p2 = generate_dummies(p2, 'Content Rating')
p2.head()

In [None]:
# Train test split and apply 70-30 split. Name the new dataframes df_train and df_test.

# Separate the dataframes into X_train, y_train, X_test, and y_test

In [None]:
#importing the necessary libraries from sklearn to split the data and for model building
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn import metrics

In [None]:
#Creating the variable X and Y which contains the X features as independent features and Y is the target feature 
data2 = p2
X = data2.drop('Rating',axis=1)
y = data2['Rating']

#Dividing the X and y into test and train data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=5)

In [None]:
# Model Building & Evaluation
# Model building Use linear regression as the technique Report the R2 on the train set

In [None]:
#Create a linear regression object by calling the LinearRegression estimator
lin_reggressor = LinearRegression()
lin_reggressor.fit(X_train,y_train)

In [None]:
R2_Score_train_data = round(lin_reggressor.score(X_train,y_train),3)
print("The R2 value of the Training Set is : {}".format(R2_Score_train_data))

In [None]:
# Make predictions on test set and report R2.

In [None]:
# test the output by changing values, like 3750
y_pred = lin_reggressor.predict(X_test)
R2_Score_test_data =metrics.r2_score(y_test,y_pred)
R2_Score_test_data

In [None]:
R2_Score_test_data = round(lin_reggressor.score(X_test,y_test),3)
print("The R2 value of the Test Set is : {}".format(R2_Score_test_data))