## Setting up

In [1]:
#importing libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# We decided to configure out settings exactly like we did for A2

# Configure libraries
# The seaborn library makes plots look nicer
sns.set()
sns.set_context('talk')

# Don't display too many rows/cols of DataFrames
pd.options.display.max_rows = 7
pd.options.display.max_columns = 8

# Round decimals when displaying DataFrames
pd.set_option('precision', 2)

# Data Wrangling

We have two main files in our data set: googleplaystore.csv and googleplaystore_user_reviews.csv.

For an analysis of app names, we can focus on just the first data set. The second data set gives us reviews of each app, which is not needed for this project.

In [3]:
df = pd.read_csv("gpsa/googleplaystore.csv")

In [4]:
# Show the headers of the different columns
print(list(df.columns.values))

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


In [5]:
# Remove any duplicate applications
df.drop_duplicates(['App'], inplace = True)

In [6]:
# We decided that we only need the app names, ratings, number of installations, and category. 
df = df[['App', 'Rating', 'Installs','Category']]

The 'size' of apps, the 'type' (either free or paid), 'price', 'content rating' (as in rating for suggested age group), the date of when they were 'last updated', and the versions ('current ver' and 'android ver') have little to do with our analysis of title length, so those columns were dropped. We also decided to drop the 'genre' column since it is a less reliable variant of the 'category' column; apps can only belong in one category, but they can belong in multiple genres.

In [7]:
# Count amount of missing values in the dataset
app_nans = df['App'].isna().sum()
rating_nans = df['Rating'].isna().sum()
installs_nans = df['Installs'].isna().sum()
category_nans = df['Category'].isna().sum()
print(app_nans, rating_nans, installs_nans, category_nans)

0 1463 0 0


In [8]:
# The only column in our current dataset that has missing values is 'Ratings'.
# Since we absolutely need ratings values for every observation we have, we will drop all of the rows without them.
df = df.dropna()

In [9]:
# Preview the dataframe
df.head

<bound method NDFrame.head of                                                      App  Rating     Installs  \
0         Photo Editor & Candy Camera & Grid & ScrapBook     4.1      10,000+   
1                                    Coloring book moana     3.9     500,000+   
2      U Launcher Lite – FREE Live Cool Themes, Hide ...     4.7   5,000,000+   
...                                                  ...     ...          ...   
10837                   Fr. Mike Schmitz Audio Teachings     5.0         100+   
10839                      The SCP Foundation DB fr nn5n     4.5       1,000+   
10840      iHoroscope - 2018 Daily Horoscope & Astrology     4.5  10,000,000+   

                  Category  
0           ART_AND_DESIGN  
1           ART_AND_DESIGN  
2           ART_AND_DESIGN  
...                    ...  
10837               FAMILY  
10839  BOOKS_AND_REFERENCE  
10840            LIFESTYLE  

[8197 rows x 4 columns]>

In [10]:
"""
#It turns out that the reviews (number of) column is a column of strings
#Every time the number of reviews is in the millions, it is denoted with 'M'
#We need to fix all occurences of this to properly typecast

#We replace the instances of 'M' in the original column with E6
df['Reviews'] = df['Reviews'].replace({'M': 'E6'}, regex=True)
"""

"\n#It turns out that the reviews (number of) column is a column of strings\n#Every time the number of reviews is in the millions, it is denoted with 'M'\n#We need to fix all occurences of this to properly typecast\n\n#We replace the instances of 'M' in the original column with E6\ndf['Reviews'] = df['Reviews'].replace({'M': 'E6'}, regex=True)\n"

In [11]:
# It turns out that the 'Installs' column is a column of strings instead of integers

# Delete the '+' chars that are in these strings in 'Installs'
df['Installs'] = df['Installs'].str.replace('+', '')
# Delete the ',' chars that are in these strings in 'Installs'
df['Installs'] = df['Installs'].str.replace(',', '')

# Type cast these strings into integers
df['Installs'] = pd.to_numeric(df['Installs'], errors='coerce')

df.head

<bound method NDFrame.head of                                                      App  Rating  Installs  \
0         Photo Editor & Candy Camera & Grid & ScrapBook     4.1  1.00e+04   
1                                    Coloring book moana     3.9  5.00e+05   
2      U Launcher Lite – FREE Live Cool Themes, Hide ...     4.7  5.00e+06   
...                                                  ...     ...       ...   
10837                   Fr. Mike Schmitz Audio Teachings     5.0  1.00e+02   
10839                      The SCP Foundation DB fr nn5n     4.5  1.00e+03   
10840      iHoroscope - 2018 Daily Horoscope & Astrology     4.5  1.00e+07   

                  Category  
0           ART_AND_DESIGN  
1           ART_AND_DESIGN  
2           ART_AND_DESIGN  
...                    ...  
10837               FAMILY  
10839  BOOKS_AND_REFERENCE  
10840            LIFESTYLE  

[8197 rows x 4 columns]>

In [12]:
#The E6 allows us to convert the column of strings into a column of floats since
        #E6 is evaluated as *1000000
    
#df['Reviews'] = df['Reviews'].astype(float)

In [13]:
# Create a word count column 
df['Word Count'] = df['App'].str.split().str.len()

# TODO - We drop everything outside 2 standard deviations (after histograms)

In [14]:
df['Char Count'] = df['App'].str.len() - df['App'].str.count(' ')

In [15]:
#preview table again
df.head

<bound method NDFrame.head of                                                      App  Rating  Installs  \
0         Photo Editor & Candy Camera & Grid & ScrapBook     4.1  1.00e+04   
1                                    Coloring book moana     3.9  5.00e+05   
2      U Launcher Lite – FREE Live Cool Themes, Hide ...     4.7  5.00e+06   
...                                                  ...     ...       ...   
10837                   Fr. Mike Schmitz Audio Teachings     5.0  1.00e+02   
10839                      The SCP Foundation DB fr nn5n     4.5  1.00e+03   
10840      iHoroscope - 2018 Daily Horoscope & Astrology     4.5  1.00e+07   

                  Category  Word Count  Char Count  
0           ART_AND_DESIGN           9          38  
1           ART_AND_DESIGN           3          17  
2           ART_AND_DESIGN          10          41  
...                    ...         ...         ...  
10837               FAMILY           5          28  
10839  BOOKS_AND_REFERENCE 

In [16]:
#A### the code below creates many columns with binary variable
#A### e.g. values in column 'a' are equal to 1 if 'a' is contained in apps name and 0 otherwise
#A### or values in column 'wq' are equal to 1 if 'wq' is contained in apps name and 0 otherwise

#A### Then we count how many ones are in each of the created columns to assess if the column is useful
#A### If there are not a lot of ones in a column (less than 0.025 * (# of rows) for example)
#A### or too many (more than 0.975 * (# of rows) for example) we remove such a column

#A### it is done because, for example there are not a lot of titles with 'wq' inside them
#A### so 'wq' column is not useful for our analysis

'''
import string
alphabet = list(string.ascii_lowercase)

for k in alphabet:
    q1 = []
    for i in df['App']:
        if k in i:
            q1.append(1)
        else:
            q1.append(0)
    df[k] = q1

alphabet1 = []
for i in alphabet:
    for k in alphabet:
        alphabet1.append(i + k)
        
for k in alphabet1:
    q2 = []
    for i in df['App']:
        if k in i:
            q2.append(1)
        else:
            q2.append(0)
    df[k] = q2


rem_list = []                           ##here 13 and 715 are indexes of first and last created columns  
for i in range(13,715):                 ##it is possible when you will try to execute code you will need to insert proper values 
    if sum(df[df.columns[i]]) < 0.025*df.shape[0] or sum(df[df.columns[i]]) > 0.975*df.shape[0]:
        rem_list.append(i)


s = 0
for i in rem_list:
    i = i - s
    del df[df.columns[i]]
    s += 1
'''


"\nimport string\nalphabet = list(string.ascii_lowercase)\n\nfor k in alphabet:\n    q1 = []\n    for i in df['App']:\n        if k in i:\n            q1.append(1)\n        else:\n            q1.append(0)\n    df[k] = q1\n\nalphabet1 = []\nfor i in alphabet:\n    for k in alphabet:\n        alphabet1.append(i + k)\n        \nfor k in alphabet1:\n    q2 = []\n    for i in df['App']:\n        if k in i:\n            q2.append(1)\n        else:\n            q2.append(0)\n    df[k] = q2\n\n\nrem_list = []                           ##here 13 and 715 are indexes of first and last created columns  \nfor i in range(13,715):                 ##it is possible when you will try to execute code you will need to insert proper values \n    if sum(df[df.columns[i]]) < 0.025*df.shape[0] or sum(df[df.columns[i]]) > 0.975*df.shape[0]:\n        rem_list.append(i)\n\n\ns = 0\nfor i in rem_list:\n    i = i - s\n    del df[df.columns[i]]\n    s += 1\n"