In [71]:
#importing libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [72]:
# We decided to configure out settings exactly like we did for A2

# Configure libraries
# The seaborn library makes plots look nicer
sns.set()
sns.set_context('talk')

# Don't display too many rows/cols of DataFrames
pd.options.display.max_rows = 7
pd.options.display.max_columns = 8

# Round decimals when displaying DataFrames
pd.set_option('precision', 2)



We have two main files in our data set: googleplaystore.csv and googleplaystore_user_reviews.csv

For analysis of app names, we can focus on just the first data set.



In [73]:
df = pd.read_csv("gpsa/googleplaystore.csv")

In [74]:
#show the headers of the different columns
print(list(df.columns.values))

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


In [None]:
#A### HOW ABOUT REMAINING MORE COLUMNS IN ADDITION TO THE COLUMNS ALREADY SELECTED SUCH AS INSTALLS, TYPE, PRICE, CATEGORY??
#A### i THINK THEY COULD BE HELPFUL IN FUTHER ANALYSIS 
#A###
#A### df = df[['App', 'Rating', 'Reviews', 'Installs', 'Type', 'Price', 'Category']]

#A### also the I think it is useful to remain dataset without duplicates in column 'App'
#A### df.drop_duplicates(['App'], inplace = True)

In [75]:
#We only need the app names, ratings, reviews (number of) , and rating. 
df = df[['App', 'Rating', 'Reviews']]

In [76]:
#preview the dataframe
df.head

<bound method NDFrame.head of                                                      App  Rating Reviews
0         Photo Editor & Candy Camera & Grid & ScrapBook     4.1     159
1                                    Coloring book moana     3.9     967
2      U Launcher Lite – FREE Live Cool Themes, Hide ...     4.7   87510
...                                                  ...     ...     ...
10838                             Parkinson Exercices FR     NaN       3
10839                      The SCP Foundation DB fr nn5n     4.5     114
10840      iHoroscope - 2018 Daily Horoscope & Astrology     4.5  398307

[10841 rows x 3 columns]>

In [77]:
#It turns out that the reviews (number of) column is a column of strings
#Every time the number of reviews is in the millions, it is denoted with 'M'
#We need to fix all occurences of this to properly typecast

#We replace the instances of 'M' in the original column with E6
df['Reviews'] = df['Reviews'].replace({'M': 'E6'}, regex=True)

In [78]:
#The E6 allows us to convert the column of strings into a column of floats since
        #E6 is evaluated as *1000000
    
df['Reviews'] = df['Reviews'].astype(float)

In [79]:
#We drop everything outside 2 std

df['Word Count'] = df['App'].str.split().str.len()

In [80]:
df['Char Count'] = df['App'].str.len() - df['App'].str.count(' ')

In [82]:
#Remove any rows with null values
df = df.dropna(how='any')

In [83]:
#preview table again
df.head

<bound method NDFrame.head of                                                      App  Rating   Reviews  \
0         Photo Editor & Candy Camera & Grid & ScrapBook     4.1     159.0   
1                                    Coloring book moana     3.9     967.0   
2      U Launcher Lite – FREE Live Cool Themes, Hide ...     4.7   87510.0   
...                                                  ...     ...       ...   
10837                   Fr. Mike Schmitz Audio Teachings     5.0       4.0   
10839                      The SCP Foundation DB fr nn5n     4.5     114.0   
10840      iHoroscope - 2018 Daily Horoscope & Astrology     4.5  398307.0   

       Word Count  Char Count  
0               9          38  
1               3          17  
2              10          41  
...           ...         ...  
10837           5          28  
10839           6          24  
10840           7          39  

[9367 rows x 5 columns]>

In [None]:
#A### the code below creates many columns with binary variable
#A### e.g. values in column 'a' are equal to 1 if 'a' is contained in apps name and 0 otherwise
#A### or values in column 'wq' are equal to 1 if 'wq' is contained in apps name and 0 otherwise

#A### Then we count how many ones are in each of the created columns to assess if the column is useful
#A### If there are not a lot of ones in a column (less than 0.025 * (# of rows) for example)
#A### or too many (more than 0.975 * (# of rows) for example) we remove such a column

#A### it is done because, for example there are not a lot of titles with 'wq' inside them
#A### so 'wq' column is not useful for our analysis

'''
import string
alphabet = list(string.ascii_lowercase)

for k in alphabet:
    q1 = []
    for i in df['App']:
        if k in i:
            q1.append(1)
        else:
            q1.append(0)
    df[k] = q1

alphabet1 = []
for i in alphabet:
    for k in alphabet:
        alphabet1.append(i + k)
        
for k in alphabet1:
    q2 = []
    for i in df['App']:
        if k in i:
            q2.append(1)
        else:
            q2.append(0)
    df[k] = q2


rem_list = []                           ##here 13 and 715 are indexes of first and last created columns  
for i in range(13,715):                 ##it is possible when you will try to execute code you will need to insert proper values 
    if sum(df[df.columns[i]]) < 0.025*df.shape[0] or sum(df[df.columns[i]]) > 0.975*df.shape[0]:
        rem_list.append(i)


s = 0
for i in rem_list:
    i = i - s
    del df[df.columns[i]]
    s += 1
'''
