## Data Cleaning

#### Using Google Playstore data

In [219]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [220]:
df = pd.read_csv('googleplaystore.csv')

In [221]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [222]:
df.shape

(10841, 13)

In [223]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [116]:
# Checking the Reviews column
df[~df['Reviews'].str.isnumeric()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up,


So all values except one that is not in numeric.

In [117]:
# Deleating the record at index 10472
df.drop(df.index[10472], inplace=True)

In [118]:
df.shape

(10840, 13)

In [119]:
# Converting the datatype of reviwes to integer
df['Reviews']=df['Reviews'].astype(int)

In [120]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10840 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10840 non-null  object 
 1   Category        10840 non-null  object 
 2   Rating          9366 non-null   float64
 3   Reviews         10840 non-null  int64  
 4   Size            10840 non-null  object 
 5   Installs        10840 non-null  object 
 6   Type            10839 non-null  object 
 7   Price           10840 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10840 non-null  object 
 10  Last Updated    10840 non-null  object 
 11  Current Ver     10832 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 1.2+ MB


In [121]:
# Checking missing values
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       0
Genres               0
Last Updated         0
Current Ver          8
Android Ver          2
dtype: int64

In [122]:
df['Rating'].unique()

array([4.1, 3.9, 4.7, 4.5, 4.3, 4.4, 3.8, 4.2, 4.6, 3.2, 4. , nan, 4.8,
       4.9, 3.6, 3.7, 3.3, 3.4, 3.5, 3.1, 5. , 2.6, 3. , 1.9, 2.5, 2.8,
       2.7, 1. , 2.9, 2.3, 2.2, 1.7, 2. , 1.8, 2.4, 1.6, 2.1, 1.4, 1.5,
       1.2])

In [123]:
# Replacing the missing values in the 'Rating' column with the mean value of the column
df['Rating']=df['Rating'].fillna(df['Rating'].mean())

In [124]:
df.isna().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              1
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       8
Android Ver       2
dtype: int64

In [125]:
df['Rating'].unique()

array([4.1       , 3.9       , 4.7       , 4.5       , 4.3       ,
       4.4       , 3.8       , 4.2       , 4.6       , 3.2       ,
       4.        , 4.19175742, 4.8       , 4.9       , 3.6       ,
       3.7       , 3.3       , 3.4       , 3.5       , 3.1       ,
       5.        , 2.6       , 3.        , 1.9       , 2.5       ,
       2.8       , 2.7       , 1.        , 2.9       , 2.3       ,
       2.2       , 1.7       , 2.        , 1.8       , 2.4       ,
       1.6       , 2.1       , 1.4       , 1.5       , 1.2       ])

In [126]:
df['Type'].unique()

array(['Free', 'Paid', nan], dtype=object)

In [127]:
# Replacing the missing values in the 'Type' column with the mode value of the column
mode_value=df[df['Type'].notna()]['Type'].mode()[0]

In [128]:
df['Type']=df['Type'].fillna(mode_value)

In [129]:
df.isna().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       8
Android Ver       2
dtype: int64

In [130]:
df['Current Ver'].unique()

array(['1.0.0', '2.0.0', '1.2.4', ..., '1.0.612928', '0.3.4', '2.0.148.0'],
      dtype=object)

In [131]:
# Replacing the missing values in the 'Current Ver' column with the mode value of the column
mode_value=df[df['Current Ver'].notna()]['Current Ver'].mode()[0]
df['Current Ver']=df['Current Ver'].fillna(mode_value)

In [132]:
df.isna().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       2
dtype: int64

In [133]:
df['Android Ver'].unique()

array(['4.0.3 and up', '4.2 and up', '4.4 and up', '2.3 and up',
       '3.0 and up', '4.1 and up', '4.0 and up', '2.3.3 and up',
       'Varies with device', '2.2 and up', '5.0 and up', '6.0 and up',
       '1.6 and up', '1.5 and up', '2.1 and up', '7.0 and up',
       '5.1 and up', '4.3 and up', '4.0.3 - 7.1.1', '2.0 and up',
       '3.2 and up', '4.4W and up', '7.1 and up', '7.0 - 7.1.1',
       '8.0 and up', '5.0 - 8.0', '3.1 and up', '2.0.1 and up',
       '4.1 - 7.1.1', nan, '5.0 - 6.0', '1.0 and up', '2.2 - 7.1.1',
       '5.0 - 7.1.1'], dtype=object)

In [134]:
# Replacing the missing values in the 'Android Ver' column with the mode value of the column
mode_value=df[df['Android Ver'].notna()]['Android Ver'].mode()[0]
df['Android Ver']=df['Android Ver'].fillna(mode_value)

In [135]:
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64

In [136]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [137]:
df['Size'].unique()

array(['19M', '14M', '8.7M', '25M', '2.8M', '5.6M', '29M', '33M', '3.1M',
       '28M', '12M', '20M', '21M', '37M', '2.7M', '5.5M', '17M', '39M',
       '31M', '4.2M', '7.0M', '23M', '6.0M', '6.1M', '4.6M', '9.2M',
       '5.2M', '11M', '24M', 'Varies with device', '9.4M', '15M', '10M',
       '1.2M', '26M', '8.0M', '7.9M', '56M', '57M', '35M', '54M', '201k',
       '3.6M', '5.7M', '8.6M', '2.4M', '27M', '2.5M', '16M', '3.4M',
       '8.9M', '3.9M', '2.9M', '38M', '32M', '5.4M', '18M', '1.1M',
       '2.2M', '4.5M', '9.8M', '52M', '9.0M', '6.7M', '30M', '2.6M',
       '7.1M', '3.7M', '22M', '7.4M', '6.4M', '3.2M', '8.2M', '9.9M',
       '4.9M', '9.5M', '5.0M', '5.9M', '13M', '73M', '6.8M', '3.5M',
       '4.0M', '2.3M', '7.2M', '2.1M', '42M', '7.3M', '9.1M', '55M',
       '23k', '6.5M', '1.5M', '7.5M', '51M', '41M', '48M', '8.5M', '46M',
       '8.3M', '4.3M', '4.7M', '3.3M', '40M', '7.8M', '8.8M', '6.6M',
       '5.1M', '61M', '66M', '79k', '8.4M', '118k', '44M', '695k', '1.6M',
     

In [138]:
# Converting the size of the apps to MegaByte by removing 'M' and 'k'

def convertion(size):
  if 'M' in size:
    size=float(size.replace('M', ''))

  elif 'k' in size:
    size=float(size.replace('k', ''))/1000

  return size


In [139]:
df['Size_in_MB']=df['Size'].apply(convertion)

In [140]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Size_in_MB
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,19.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,14.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,8.7
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,25.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,2.8


In [144]:
df.drop('Size', axis=1, inplace=True)

In [147]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Size_in_MB
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,19.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,14.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,8.7
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,25.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,2.8


In [149]:
df['Size_in_MB'].unique()

array([19.0, 14.0, 8.7, 25.0, 2.8, 5.6, 29.0, 33.0, 3.1, 28.0, 12.0, 20.0,
       21.0, 37.0, 2.7, 5.5, 17.0, 39.0, 31.0, 4.2, 7.0, 23.0, 6.0, 6.1,
       4.6, 9.2, 5.2, 11.0, 24.0, 'Varies with device', 9.4, 15.0, 10.0,
       1.2, 26.0, 8.0, 7.9, 56.0, 57.0, 35.0, 54.0, 0.201, 3.6, 5.7, 8.6,
       2.4, 27.0, 2.5, 16.0, 3.4, 8.9, 3.9, 2.9, 38.0, 32.0, 5.4, 18.0,
       1.1, 2.2, 4.5, 9.8, 52.0, 9.0, 6.7, 30.0, 2.6, 7.1, 3.7, 22.0, 7.4,
       6.4, 3.2, 8.2, 9.9, 4.9, 9.5, 5.0, 5.9, 13.0, 73.0, 6.8, 3.5, 4.0,
       2.3, 7.2, 2.1, 42.0, 7.3, 9.1, 55.0, 0.023, 6.5, 1.5, 7.5, 51.0,
       41.0, 48.0, 8.5, 46.0, 8.3, 4.3, 4.7, 3.3, 40.0, 7.8, 8.8, 6.6,
       5.1, 61.0, 66.0, 0.079, 8.4, 0.118, 44.0, 0.695, 1.6, 6.2, 0.018,
       53.0, 1.4, 3.0, 5.8, 3.8, 9.6, 45.0, 63.0, 49.0, 77.0, 4.4, 4.8,
       70.0, 6.9, 9.3, 8.1, 36.0, 84.0, 97.0, 2.0, 1.9, 1.8, 5.3, 47.0,
       0.556, 0.526, 76.0, 7.6, 59.0, 9.7, 78.0, 72.0, 43.0, 7.7, 6.3,
       0.334, 34.0, 93.0, 65.0, 79.0, 100.0, 58.0, 50

In [166]:
# Replacing the 'Varies with device' in the 'Size' column with the mode value

mode_value=df[df['Size_in_MB']!='Varies with device']['Size_in_MB'].mode()[0]
df['Size_in_MB']=df['Size_in_MB'].replace('Varies with device', mode_value)

In [168]:
df['Size_in_MB'].unique()

array([1.90e+01, 1.40e+01, 8.70e+00, 2.50e+01, 2.80e+00, 5.60e+00,
       2.90e+01, 3.30e+01, 3.10e+00, 2.80e+01, 1.20e+01, 2.00e+01,
       2.10e+01, 3.70e+01, 2.70e+00, 5.50e+00, 1.70e+01, 3.90e+01,
       3.10e+01, 4.20e+00, 7.00e+00, 2.30e+01, 6.00e+00, 6.10e+00,
       4.60e+00, 9.20e+00, 5.20e+00, 1.10e+01, 2.40e+01, 9.40e+00,
       1.50e+01, 1.00e+01, 1.20e+00, 2.60e+01, 8.00e+00, 7.90e+00,
       5.60e+01, 5.70e+01, 3.50e+01, 5.40e+01, 2.01e-01, 3.60e+00,
       5.70e+00, 8.60e+00, 2.40e+00, 2.70e+01, 2.50e+00, 1.60e+01,
       3.40e+00, 8.90e+00, 3.90e+00, 2.90e+00, 3.80e+01, 3.20e+01,
       5.40e+00, 1.80e+01, 1.10e+00, 2.20e+00, 4.50e+00, 9.80e+00,
       5.20e+01, 9.00e+00, 6.70e+00, 3.00e+01, 2.60e+00, 7.10e+00,
       3.70e+00, 2.20e+01, 7.40e+00, 6.40e+00, 3.20e+00, 8.20e+00,
       9.90e+00, 4.90e+00, 9.50e+00, 5.00e+00, 5.90e+00, 1.30e+01,
       7.30e+01, 6.80e+00, 3.50e+00, 4.00e+00, 2.30e+00, 7.20e+00,
       2.10e+00, 4.20e+01, 7.30e+00, 9.10e+00, 5.50e+01, 2.30e

In [169]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Size_in_MB
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,19.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,14.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,8.7
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,25.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,2.8


In [170]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10840 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10840 non-null  object 
 1   Category        10840 non-null  object 
 2   Rating          10840 non-null  float64
 3   Reviews         10840 non-null  int64  
 4   Installs        10840 non-null  object 
 5   Type            10840 non-null  object 
 6   Price           10840 non-null  object 
 7   Content Rating  10840 non-null  object 
 8   Genres          10840 non-null  object 
 9   Last Updated    10840 non-null  object 
 10  Current Ver     10840 non-null  object 
 11  Android Ver     10840 non-null  object 
 12  Size_in_MB      10840 non-null  float64
dtypes: float64(2), int64(1), object(10)
memory usage: 1.2+ MB


In [172]:
df['Installs'].unique()

array(['10,000+', '500,000+', '5,000,000+', '50,000,000+', '100,000+',
       '50,000+', '1,000,000+', '10,000,000+', '5,000+', '100,000,000+',
       '1,000,000,000+', '1,000+', '500,000,000+', '50+', '100+', '500+',
       '10+', '1+', '5+', '0+', '0'], dtype=object)

In [173]:
# Removing '+' symbol from the values in the 'Installs column
df['Installs']=df['Installs'].str.split('+').str[0]

In [174]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Size_in_MB
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,19.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,14.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,5000000,Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,8.7
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,25.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,2.8


In [182]:
# Removing commas from the values in the 'Installs' column

def comma(install):
  if ',' in install:
    install.str.replace(',', '')


In [184]:
df['Installs']=df['Installs'].str.replace(',', '')

In [185]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Size_in_MB
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,19.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,14.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,5000000,Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,8.7
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,25.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,2.8


In [187]:
# Changing the datatype of the 'Installs' column to integer
df['Installs']=df['Installs'].astype(int)

In [188]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10840 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10840 non-null  object 
 1   Category        10840 non-null  object 
 2   Rating          10840 non-null  float64
 3   Reviews         10840 non-null  int64  
 4   Installs        10840 non-null  int64  
 5   Type            10840 non-null  object 
 6   Price           10840 non-null  object 
 7   Content Rating  10840 non-null  object 
 8   Genres          10840 non-null  object 
 9   Last Updated    10840 non-null  object 
 10  Current Ver     10840 non-null  object 
 11  Android Ver     10840 non-null  object 
 12  Size_in_MB      10840 non-null  float64
dtypes: float64(2), int64(2), object(9)
memory usage: 1.2+ MB


In [190]:
df['Price'].unique()

array(['0', '$4.99', '$3.99', '$6.99', '$1.49', '$2.99', '$7.99', '$5.99',
       '$3.49', '$1.99', '$9.99', '$7.49', '$0.99', '$9.00', '$5.49',
       '$10.00', '$24.99', '$11.99', '$79.99', '$16.99', '$14.99',
       '$1.00', '$29.99', '$12.99', '$2.49', '$10.99', '$1.50', '$19.99',
       '$15.99', '$33.99', '$74.99', '$39.99', '$3.95', '$4.49', '$1.70',
       '$8.99', '$2.00', '$3.88', '$25.99', '$399.99', '$17.99',
       '$400.00', '$3.02', '$1.76', '$4.84', '$4.77', '$1.61', '$2.50',
       '$1.59', '$6.49', '$1.29', '$5.00', '$13.99', '$299.99', '$379.99',
       '$37.99', '$18.99', '$389.99', '$19.90', '$8.49', '$1.75',
       '$14.00', '$4.85', '$46.99', '$109.99', '$154.99', '$3.08',
       '$2.59', '$4.80', '$1.96', '$19.40', '$3.90', '$4.59', '$15.46',
       '$3.04', '$4.29', '$2.60', '$3.28', '$4.60', '$28.99', '$2.95',
       '$2.90', '$1.97', '$200.00', '$89.99', '$2.56', '$30.99', '$3.61',
       '$394.99', '$1.26', '$1.20', '$1.04'], dtype=object)

In [192]:
# Removing '$' symbol from the values in the 'Installs column
df['Price_in_dollars']=df['Price'].str.replace('$', '')

In [193]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Size_in_MB,Price_in_dollars
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,19.0,0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,14.0,0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,5000000,Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,8.7,0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,25.0,0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,2.8,0


In [194]:
# Dropping the 'Price' column
df.drop('Price', axis=1, inplace=True)

In [195]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Size_in_MB,Price_in_dollars
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,19.0,0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,14.0,0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,5000000,Free,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,8.7,0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,25.0,0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,2.8,0


In [199]:
# Changing the datatype of the 'Price' column to float
df['Price_in_dollars']=df['Price_in_dollars'].astype(float)

In [200]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10840 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   App               10840 non-null  object 
 1   Category          10840 non-null  object 
 2   Rating            10840 non-null  float64
 3   Reviews           10840 non-null  int64  
 4   Installs          10840 non-null  int64  
 5   Type              10840 non-null  object 
 6   Content Rating    10840 non-null  object 
 7   Genres            10840 non-null  object 
 8   Last Updated      10840 non-null  object 
 9   Current Ver       10840 non-null  object 
 10  Android Ver       10840 non-null  object 
 11  Size_in_MB        10840 non-null  float64
 12  Price_in_dollars  10840 non-null  float64
dtypes: float64(3), int64(2), object(8)
memory usage: 1.2+ MB


In [202]:
df['Last Updated'].unique()

array(['January 7, 2018', 'January 15, 2018', 'August 1, 2018', ...,
       'January 20, 2014', 'February 16, 2014', 'March 23, 2014'],
      dtype=object)

In [204]:
# Changing the datatype of the 'Last Updated' column to Date_Time
df['Last Updated']=pd.to_datetime(df['Last Updated'])

In [205]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Size_in_MB,Price_in_dollars
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,19.0,0.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,14.0,0.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,5000000,Free,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,8.7,0.0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,25.0,0.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,2.8,0.0


In [209]:
# Changing the datatype of the 'Last Updated' column to String
df['Last Updated']=df['Last Updated'].astype(str)

In [210]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10840 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   App               10840 non-null  object 
 1   Category          10840 non-null  object 
 2   Rating            10840 non-null  float64
 3   Reviews           10840 non-null  int64  
 4   Installs          10840 non-null  int64  
 5   Type              10840 non-null  object 
 6   Content Rating    10840 non-null  object 
 7   Genres            10840 non-null  object 
 8   Last Updated      10840 non-null  object 
 9   Current Ver       10840 non-null  object 
 10  Android Ver       10840 non-null  object 
 11  Size_in_MB        10840 non-null  float64
 12  Price_in_dollars  10840 non-null  float64
dtypes: float64(3), int64(2), object(8)
memory usage: 1.2+ MB


In [212]:
# Splitting the 'Last Updated' column to Year, Month, Day

df['Last_Updated_Year'] = df['Last Updated'].str.split('-').str[0]
df['Last_Updated_Month'] = df['Last Updated'].str.split('-').str[1]
df['Last_Updated_Day'] = df['Last Updated'].str.split('-').str[2]

In [214]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Size_in_MB,Price_in_dollars,Last_Updated_Year,Last_Updated_Month,Last_Updated_Day
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,19.0,0.0,2018,1,7
1,Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,14.0,0.0,2018,1,15
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,5000000,Free,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,8.7,0.0,2018,8,1
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,25.0,0.0,2018,6,8
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,2.8,0.0,2018,6,20


In [213]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10840 entries, 0 to 10840
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   App                 10840 non-null  object 
 1   Category            10840 non-null  object 
 2   Rating              10840 non-null  float64
 3   Reviews             10840 non-null  int64  
 4   Installs            10840 non-null  int64  
 5   Type                10840 non-null  object 
 6   Content Rating      10840 non-null  object 
 7   Genres              10840 non-null  object 
 8   Last Updated        10840 non-null  object 
 9   Current Ver         10840 non-null  object 
 10  Android Ver         10840 non-null  object 
 11  Size_in_MB          10840 non-null  float64
 12  Price_in_dollars    10840 non-null  float64
 13  Last_Updated_Year   10840 non-null  object 
 14  Last_Updated_Month  10840 non-null  object 
 15  Last_Updated_Day    10840 non-null  object 
dtypes: f

In [215]:
# Converting the DataTypr of 'Last_Updated_Year','Last_Updated_Month', 'Last_Updated_Day' to Integer

df[['Last_Updated_Year'	,'Last_Updated_Month',	'Last_Updated_Day']] = df[['Last_Updated_Year'	,'Last_Updated_Month',	'Last_Updated_Day']].astype(int)

In [216]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10840 entries, 0 to 10840
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   App                 10840 non-null  object 
 1   Category            10840 non-null  object 
 2   Rating              10840 non-null  float64
 3   Reviews             10840 non-null  int64  
 4   Installs            10840 non-null  int64  
 5   Type                10840 non-null  object 
 6   Content Rating      10840 non-null  object 
 7   Genres              10840 non-null  object 
 8   Last Updated        10840 non-null  object 
 9   Current Ver         10840 non-null  object 
 10  Android Ver         10840 non-null  object 
 11  Size_in_MB          10840 non-null  float64
 12  Price_in_dollars    10840 non-null  float64
 13  Last_Updated_Year   10840 non-null  int64  
 14  Last_Updated_Month  10840 non-null  int64  
 15  Last_Updated_Day    10840 non-null  int64  
dtypes: f

In [217]:
df.drop('Last Updated', axis=1, inplace=True)

In [218]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Content Rating,Genres,Current Ver,Android Ver,Size_in_MB,Price_in_dollars,Last_Updated_Year,Last_Updated_Month,Last_Updated_Day
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,Everyone,Art & Design,1.0.0,4.0.3 and up,19.0,0.0,2018,1,7
1,Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,Everyone,Art & Design;Pretend Play,2.0.0,4.0.3 and up,14.0,0.0,2018,1,15
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,5000000,Free,Everyone,Art & Design,1.2.4,4.0.3 and up,8.7,0.0,2018,8,1
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,Teen,Art & Design,Varies with device,4.2 and up,25.0,0.0,2018,6,8
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,Everyone,Art & Design;Creativity,1.1,4.4 and up,2.8,0.0,2018,6,20
