## DATA UNDERSTANDING

In [1]:
#importing libraries
import os # to access files in the opereating system
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
     

In [2]:
# Listing every image file name in the UTKFace folder; the labels are parsed from these names.
utkface_path = 'data/UTKFace'
# os.listdir returns entries in arbitrary order, so sort them to make every
# downstream array and DataFrame reproducible across runs and platforms.
utkface_image_names = sorted(os.listdir(utkface_path))

In [3]:
# function to generate the labels from the image file names according to the format given above.

def age_gender_race_split(image_name):
    """Extract the age, gender and race labels from an image file name.

    The name is split on underscores and its first three pieces are returned,
    in that order, as a tuple of strings ``(age, gender, race)``.
    """
    # Only the first three fields are used, so stop splitting after them.
    fields = image_name.split('_', 3)
    return (fields[0], fields[1], fields[2])
     

In [4]:
# Storing the labels of every image in one array per attribute.
# The labels are gathered in plain lists and converted to arrays once at the end:
# np.append copies the entire array on each call, which makes a loop of appends quadratic.
ages, genders, races = [], [], []

for image in utkface_image_names:
    age, gender, race = age_gender_race_split(image)
    ages.append(age)
    genders.append(gender)
    races.append(race)

age_labels = np.array(ages)
gender_labels = np.array(genders)
race_labels = np.array(races)
     

In [5]:
# Wrapping each label array in a named Series, then joining them column-wise.
age, gender, race = (
    pd.Series(values, name=column)
    for values, column in (
        (age_labels, 'Ages'),
        (gender_labels, 'Genders'),
        (race_labels, "Races"),
    )
)
df = pd.concat([age, race, gender], axis=1)
df.head()

Unnamed: 0,Ages,Races,Genders
0,100,0,0
1,100,0,0
2,100,0,1
3,100,0,1
4,100,0,1


In [6]:
# Frequency of each distinct age label, most common first
age_counts = df['Ages'].value_counts()
age_counts

26     2197
1      1123
28      918
35      880
24      859
       ... 
115       3
101       2
91        2
111       1
103       1
Name: Ages, Length: 104, dtype: int64

In [7]:
# Frequency of each distinct race label, most common first
race_counts = df['Races'].value_counts()
race_counts

0                                 10078
1                                  4526
3                                  3975
2                                  3434
4                                  1692
20170109142408075.jpg.chip.jpg        1
20170116174525125.jpg.chip.jpg        1
20170109150557335.jpg.chip.jpg        1
Name: Races, dtype: int64

We noticed that 3 rows do not follow the naming convention: their race value is a fragment of the file name rather than a race code. We will handle this in the Data Preparation stage.

In [8]:
# Frequency of each distinct gender label, most common first
gender_counts = df['Genders'].value_counts()
gender_counts

0    12391
1    11317
Name: Genders, dtype: int64

In [9]:
class DataUnderstanding:
    """Collect basic descriptive facts about a DataFrame.

    Everything except ``info`` is computed once, when the instance is created:

    shape      -- the frame's ``shape`` tuple
    types      -- the frame's ``dtypes``
    missing    -- per-column sum of ``isna()``
    duplicates -- sum of ``duplicated()`` over the rows
    info       -- the frame's bound ``info`` method (call it to print the report)
    """

    def __init__(self, df):
        # Structural facts
        self.shape = df.shape
        self.types = df.dtypes
        # Data-quality counts
        self.missing = df.isna().sum()
        self.duplicates = df.duplicated().sum()
        # Stored uncalled: this is the method itself, not its output
        self.info = df.info


In [10]:
# Building the summary helper for the labels DataFrame
dataund = DataUnderstanding(df)

In [11]:
# Showing the shape of the dataset, as captured when dataund was created
print(f"shape:{dataund.shape}")

shape:(23708, 3)


In [12]:
# Summarising the columns, non-null counts and dtypes.
# `dataund.info` is the DataFrame's bound `info` method: it prints the report itself
# and returns None, so it is called directly rather than wrapped in print(), which
# would add a stray "None" line to the output.
dataund.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23708 entries, 0 to 23707
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Ages     23708 non-null  object
 1   Races    23708 non-null  object
 2   Genders  23708 non-null  object
dtypes: object(3)
memory usage: 555.8+ KB
None


We can see that the dataset contains 23708 rows and 3 columns. All 3 columns have the `object` data type (the labels are stored as strings). We can also see that there are no null values.

In [13]:
# Per-column count of missing values, computed when dataund was created
dataund.missing

Ages       0
Races      0
Genders    0
dtype: int64

There are no missing values.

In [14]:
# Data type of each column, as recorded when dataund was created
dataund.types

Ages       object
Races      object
Genders    object
dtype: object

All 3 columns have the `object` data type.

In [15]:
# Reporting how many rows repeat an earlier row.
# `dataund.duplicates` already holds the summed count, so it is formatted directly.
print(f"Duplictes:{dataund.duplicates}")

Duplictes:22858


There are 22858 duplicate rows. This is attributed to the exclusion of the date-and-time column: without it, many images share the same age, race and gender labels. This will be handled in the Data Preparation stage.

## DATA PREPARATION

In [20]:
# Keeping only the rows whose race label is one of the codes "0" to "4"; this drops
# the rows where the race value is a file-name fragment instead of a code.
has_valid_race = df["Races"].isin([str(code) for code in range(5)])
df = df[has_valid_race]
dataund = DataUnderstanding(df)
print(f"Shape:{dataund.shape}")


Shape:(23705, 3)


After dropping the three rows we have 23705 rows.

## MODELING

## EVALUATION

## DEPLOYMENT