In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
# NOTE(review): `dirname` and `filename` stay bound after the loop and the next
# cell's pd.read_csv() reuses them, so that cell reads whatever path the last
# assigned values happen to form. This is only reliable when the input
# directory contains a single data file -- TODO confirm / replace with an explicit path.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Build the CSV path from the directory/file names left behind by the listing loop,
# then load it into the working frame.
csv_path = os.path.join(dirname, filename)
df_w = pd.read_csv(csv_path)

In [None]:
df_w

Import Necessary Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Let's get some basic info about our dataset

In [None]:
def basic_info(data):
    """Print a short overview of ``data`` and split its columns by dtype.

    Prints the element count, the shape and the ``DataFrame.info()`` summary.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    tuple[list, list]
        ``(categorical, numerical)``: column labels whose dtype is ``object``,
        followed by every other column label (this includes datetime columns).
        Both lists keep the frame's column order.
    """
    print("Size = ", data.size)
    print("Shape = ", data.shape)
    data.info()

    # Evaluate the dtype test once per column, then partition on the result.
    object_flags = [(label, data[label].dtype == object) for label in data.columns]
    categorical = [label for label, is_object in object_flags if is_object]
    numerical = [label for label, is_object in object_flags if not is_object]
    return categorical, numerical

In [None]:
basic_info(df_w)

Let's convert the Date column to a datetime dtype

In [None]:
df_w['Date'] = pd.to_datetime(df_w['Date'])

In [None]:
categorical, numerical = basic_info(df_w)

In [None]:
categorical

In [None]:
numerical

In [None]:
df_w.isnull().sum()

In [None]:
df_w = df_w.dropna()

In [None]:
basic_info(df_w)

# General Exploration

#### Let's see what we can retrieve from Rating

In [None]:
df_w['Rating'].value_counts()

In [None]:
# Bar chart of how many apps received each distinct Rating value.
plt.figure(figsize=(15,8))
# Pass the column by keyword: current seaborn takes `data` as the first positional
# argument, so a bare positional Series is no longer interpreted as `x`.
sns.countplot(x=df_w['Rating'], label = "Rating")
plt.legend()
plt.show()

We get that most of the apps have been rated 4.0

#### Lets see what info we can get from Category

In [None]:
df_w['Category'].value_counts()

lets see this representation in the form of a Countplot

In [None]:
# Bar chart of the number of apps in each Category.
plt.figure(figsize=(15,8))
# Pass the column by keyword: current seaborn takes `data` as the first positional
# argument, so a bare positional Series is no longer interpreted as `x`.
sns.countplot(x=df_w['Category'], label = "Category counts")
# Tilt the tick labels so long category names do not overlap.
plt.xticks(rotation = -45)
plt.legend()
plt.show()

#### Lets see variation of rating in each category

In [None]:
# Rating counts broken down by Category (one coloured bar per category within each rating).
plt.figure(figsize=(30,10))
# Use keyword arguments against the frame: current seaborn takes `data` as the
# first positional argument, so a bare positional Series is no longer read as `x`.
sns.countplot(data=df_w, x='Rating', hue='Category')
plt.show()

#### Lets see what information we can derive from Price

In [None]:
df_w['Price'].value_counts()

We have to make some changes here before we can work with the prices. First, "Free" will be converted to 0. Second, the "₹ " prefix and the thousands separators (",") will be removed, and the remaining text will be converted into a float value.

In [None]:
def change_price(x):
    """Convert a raw price entry into a float amount.

    Parameters
    ----------
    x : str or number
        Either the literal ``"Free"``, a price string made of a currency
        prefix followed by the amount (e.g. ``"₹ 1,299.00"``), or a value that
        is already numeric.

    Returns
    -------
    float
        ``0.0`` for ``"Free"``, otherwise the numeric amount with thousands
        separators removed. Numeric input is passed through as ``float(x)``,
        so applying the function to an already-converted column is harmless.

    Raises
    ------
    ValueError
        If no parsable number remains after removing the currency prefix.
    """
    # Already numeric (e.g. the conversion cell was run a second time).
    if not isinstance(x, str):
        return float(x)

    text = x.strip()
    if text == "Free":
        return 0.0

    text = text.replace(",", "")
    # Skip the currency prefix by scanning to the first digit or decimal point,
    # rather than assuming it is exactly two characters wide.
    start = 0
    while start < len(text) and not (text[start].isdigit() or text[start] == "."):
        start += 1
    return float(text[start:])

In [None]:
df_w['Price'] = df_w['Price'].apply(lambda x: change_price(x))

In [None]:
df_w['Price'].dtype

In [None]:
cat, num = basic_info(df_w)

In [None]:
cat

In [None]:
num

Now we will try to implement a model which will try to predict the rating.

# Regression models for making Rating predictions

First we will visualise all the numerical columns (other than Date) as box plots to get information about outliers

In [None]:
# Box plots of the numeric columns, laid out on a 2x2 grid, to inspect outliers.
plt.figure(figsize=(20,13))
# The bundled seaborn styles were renamed with a "seaborn-v0_8-" prefix in
# matplotlib 3.6 and the old names were later removed; use whichever exists.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)
# Columns are passed as `x=` because current seaborn treats the first positional
# argument as `data`; each plot is drawn on the subplot created just before it.
ax = plt.subplot(221)
sns.boxplot(x=df_w['Rating'], ax=ax)
ax = plt.subplot(222)
sns.boxplot(x=df_w['Price'], ax=ax)
ax = plt.subplot(223)
sns.boxplot(x=df_w['No of people Rated'], ax=ax)

### Now I will try two things which I think will make the calculation more flexible...
### If an app is free, it will remain 0.0, else it will be 1.0 depicting that the app isn't free
### Then I will create dummy of Category and with that a new dataframe so as to do the regression model

In [None]:
def free_or_not(x):
    """Collapse a price into a paid/free flag.

    Returns ``0.0`` when ``x`` equals zero (free app) and ``1.0`` for any
    other value (paid app).
    """
    return 0.0 if x == 0.0 else 1.0

In [None]:
df_w['Price'] = df_w['Price'].apply(lambda x: free_or_not(x))

In [None]:
df_w['Price'].value_counts()

Let's write a function which returns a new dataframe with the dummy columns concatenated

In [None]:
def making_new_df(data, columnlist):
    """Append dummy (indicator) columns for each listed column.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; it is not modified.
    columnlist : iterable
        Labels of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        ``data`` with the indicator columns of every listed column appended on
        the right. The last indicator of each encoded column is left out, so a
        column with ``k`` levels contributes ``k - 1`` new columns. The encoded
        source columns themselves are kept.
    """
    result = data
    for column in columnlist:
        indicators = pd.get_dummies(result[column])
        # Leave out the final level: it is implied by the remaining indicators.
        last_level = indicators.columns[-1]
        indicators = indicators.drop(columns=last_level)
        result = pd.concat([result, indicators], axis=1)
    return result

In [None]:
df_w2 = making_new_df(df_w, ['Category'])

In [None]:
df_w2

We won't need Name, Category, Date or No of people Rated for the regression model. I have dropped No of people Rated because the models performed too poorly when it was included.

In [None]:
df_w2 = df_w2.drop(['Name', "Category", "Date", "No of people Rated"], axis = 1)

In [None]:
df_w2

Now our model is ready and we can go for train-test-split and making models. I will try to make linear regression, Ridge regressor,random forest regressor and SVM

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X = df_w2.loc[:, df_w2.columns != 'Rating']
y = df_w2['Rating']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

#### Linear Regression model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lr = LinearRegression()

In [None]:
model = lr.fit(X_train, y_train)

In [None]:
y_predict = model.predict(X_test)

In [None]:
predict_dataframe = pd.DataFrame(data={"Predicted": y_predict, "Actual": y_test})

In [None]:
predict_dataframe

In [None]:
model.score(X_test, y_test)

In [None]:
plt.plot(predict_dataframe["Predicted"][:20], "*")
plt.plot(predict_dataframe['Actual'][:20], "^")
plt.show()

Wow, we can surely see that Linear regression didn't perform well at all

#### Lets try Ridge regression

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

In [None]:
ridge = Ridge()

In [None]:
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]}
ridge_regressor = GridSearchCV(ridge, parameters, scoring = 'neg_mean_squared_error', cv =5)

In [None]:
modelR = ridge_regressor.fit(X_train, y_train)

In [None]:
y_predict_R = modelR.predict(X_test)

In [None]:
rigid_df = pd.DataFrame(data = {"Predicted": y_predict_R, "Actual": y_test})

In [None]:
rigid_df

In [None]:
# GridSearchCV.score() reports the `scoring` metric the search was built with
# (negative MSE here), which is not comparable with the R^2 returned by the other
# models' .score(). Ask the refit best estimator for its R^2 instead.
modelR.best_estimator_.score(X_test, y_test)

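This scores worse than Linear Regression (note that a `GridSearchCV` object scores with the `scoring` metric it was given, so make sure both numbers are the same metric before comparing them)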

#### Lets try Support Vector Regression

In [None]:
from sklearn.svm import SVR

In [None]:
svr = SVR()

In [None]:
model_svr = svr.fit(X_train, y_train)

In [None]:
y_predict_svr = model_svr.predict(X_test)

In [None]:
svr_df = pd.DataFrame(data = {"Predicted": y_predict_svr, "Actual": y_test})

In [None]:
svr_df

In [None]:
model_svr.score(X_test, y_test)

At least it is better than Ridge

#### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Seed the forest (same seed as the train/test split) so the fitted model and its
# score are reproducible from one run of the notebook to the next.
rfr = RandomForestRegressor(random_state = 42)

In [None]:
modef_rfr = rfr.fit(X_train, y_train)

In [None]:
y_predict_rfr = modef_rfr.predict(X_test)

In [None]:
rfr_df = pd.DataFrame(data={"Predicted": y_predict_rfr, "Actual": y_test})

In [None]:
rfr_df

In [None]:
modef_rfr.score(X_test, y_test)

In [None]:
plt.plot(rfr_df["Predicted"][:20], "*")
plt.plot(rfr_df['Actual'][:20], "^")
plt.show()

### The scores are:

In [None]:
# Report R^2 on the held-out set for every model.
print("Linear Regression score is: ", model.score(X_test, y_test))
# The grid search was configured with scoring='neg_mean_squared_error', so its own
# .score() would print negative MSE; use the refit best estimator to get R^2 like the rest.
print("Ridge Regression score is: ", modelR.best_estimator_.score(X_test, y_test))
print("Support Vector Regression score is: ", model_svr.score(X_test, y_test))
print("Random Forest Regression score is: ", modef_rfr.score(X_test, y_test))

#### I know the scores aren't good, but I just wanted to show how you can implement these models. If you have any tips to increase the scores, do tell. I would love to hear and implement them

# Further Exploration

#### Since we have converted Price into 1 and 0 form where 0 implies the app is free, and 1 means the app is paid, now we will see this in a pie chart representation

In [None]:
# Pie chart of the share of free vs. paid apps (Price is the 0.0 / 1.0 flag at this point).
fig, ax = plt.subplots()
plt.rcParams['text.color'] = 'black'
sizes = df_w['Price'].value_counts()
# value_counts() orders its rows by frequency, not by value, so look each label up
# from the flag in the index instead of assuming "Free" always comes first.
price_names = {0.0: 'Free', 1.0: 'Paid'}
percent = 100 * sizes / sizes.sum()  # share of every category, in %
patches, texts = ax.pie(sizes, shadow=True, startangle=90)
labels = ['{0} - {1:1.2f}%'.format(price_names.get(flag, flag), pct)
          for flag, pct in zip(sizes.index, percent)]
ax.axis('equal')  # keep the pie circular

plt.legend(patches, labels, loc= 'best', bbox_to_anchor=(-0.1, 1.), fontsize = 10)
plt.show()

We see that only approximately 3% of the apps are paid, and the rest are free

In [None]:
df_w['Date'].dt.year.unique() # use dt to use attributes such as year, month and more

#### Let's plot the total No of people Rated for each year

In [None]:
# Total "No of people Rated" for each year found in the Date column, as a line chart.
# Grouping by year yields the totals already ordered by year.
yearly_totals = df_w.groupby(df_w['Date'].dt.year)['No of people Rated'].sum()
unique_dates = yearly_totals.index.to_numpy()
sum_array = yearly_totals.tolist()
print(unique_dates)
print(sum_array)

plt.figure(figsize = (20,8))
# The bundled seaborn styles were renamed with a "seaborn-v0_8-" prefix in
# matplotlib 3.6 and the old names were later removed; use whichever exists.
dark_style = 'seaborn-v0_8-darkgrid' if 'seaborn-v0_8-darkgrid' in plt.style.available else 'seaborn-darkgrid'
plt.style.use(dark_style)

plt.plot(unique_dates, sum_array)
plt.xlabel("Years")
plt.ylabel("No of People Rated (k)")
# Annotate every point with its total. The loop variables are deliberately not
# called x / y, so the regression target `y` defined earlier is not overwritten.
for year, total in zip(unique_dates, sum_array):
    plt.text(year, total, str(total))