# Doc for cleaning and preparing data in Python.

In [None]:
# import the reader function from the csv module
from csv import reader

In [None]:
# use the python built-in function open() to open
# the children.csv file. the with statement closes
# the file for us, even if reading it fails part way.
# newline='' lets the csv module handle line endings
# itself, as the csv documentation recommends
with open('children.csv', newline='') as opened_file:
    # use csv.reader() to parse the data from
    # the opened file
    read_file = reader(opened_file)

    # use list() to convert the read file
    # into a list of lists format. this must happen
    # inside the with block, while the file is still open
    children = list(read_file)

In [None]:
# remove the first row of the data, which
# contains the column names. slicing from
# index 1 builds a new list without that row
children = children[1:]

Often when we're cleaning data, we need to replace parts of strings so our data is consistent.

For example, let's say we have the string "red is my favorite color", but we want to change it to "blue is my favorite color". To do that, we want to replace the "red" part of the string with "blue". When we want to refer to part of a string, we use the term substring.

In order to do this, we'll learn the str.replace() method. The str.replace() method is like a "find and replace" tool for strings.

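When we use str.replace(), we replace the str part with the name of the variable that holds the string we want to modify. Note that str.replace() returns a new string rather than changing the original, so we assign the result back to the variable. Let's look at an example in code: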

In [None]:
# the string we want to fix
fav_color = "red is my favorite color"
# str.replace() returns a new string, so we assign
# the result back to fav_color before printing it
fav_color = fav_color.replace("red", "blue")
print(fav_color)

In [4]:
nationalities = ['(American)', '(Spanish)', '(French)']

# remove both parentheses from each value by chaining
# two str.replace() calls, then print the cleaned value
for nationality in nationalities:
    print(nationality.replace("(", "").replace(")", ""))

American
Spanish
French


In [None]:
# example code for replacing a substring:
# remove every "(" and ")" from the nationality
# value stored at index 2 of each row
for row in moma:
    row[2] = row[2].replace("(", "").replace(")", "")

The str.title() method returns a copy of the string with the first letter of each word transformed to uppercase (also known as title case).

Let's look at an example of this method in action with a simple string:

In [5]:
my_string = "The cool thing about this string is that it has a CoMbInAtIoN of UPPERCASE and lowercase letters!"

my_string_title = my_string.title()
print(my_string_title)

The Cool Thing About This String Is That It Has A Combination Of Uppercase And Lowercase Letters!


In [None]:
# example code

for row in moma:
    # gender clean up: convert the value to title case.
    # an empty string is falsy, so "or" sets a
    # descriptive value when there is no gender
    row[5] = row[5].title() or "Gender Unknown/Other"

    # nationality clean up: same approach, with a
    # descriptive value when there is no nationality
    row[2] = row[2].title() or "Nationality Unknown"

In [None]:
strings = ["good!", "morn?ing", "good?!", "morniZZZZng"]
bad_chars = ["!", "?", "Z"]


# example function to remove multiple substrings
def strip_characters(string):
    """Return string with every substring listed in bad_chars removed."""
    cleaned = string
    for unwanted in bad_chars:
        # str.replace() returns a new string, so keep the result
        cleaned = cleaned.replace(unwanted, "")
    return cleaned


# applying the function to every string in the list
cleaned_strings = [strip_characters(text) for text in strings]

print(cleaned_strings)



In [None]:
# Example code
test_data = [
    "1912", "1929", "1913-1923", "(1951)", "1994",
    "1934", "c. 1915", "1995", "c. 1912", "(1988)",
    "2002", "1957-1959", "c. 1955.", "c. 1970's", "C. 1990-1999",
]

# every character listed here is removed wherever it
# appears in a string, including the letters c, C and s
bad_chars = ["(", ")", "c", "C", ".", "s", "'", " "]


def strip_characters(string):
    """Return string with every character listed in bad_chars removed."""
    cleaned = string
    for unwanted in bad_chars:
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

# hard-coded here: the values of test_data with the
# characters in bad_chars removed from every string
stripped_test_data = ['1912', '1929', '1913-1923',
                      '1951', '1994', '1934',
                      '1915', '1995', '1912',
                      '1988', '2002', '1957-1959',
                      '1955', '1970', '1990-1999']
def process_date(date):
    """Convert a cleaned date string into an integer year.

    A string containing "-" (for example "1913-1923") is treated as a
    range and becomes the rounded average of its first two parts. Any
    other string is converted directly with int().
    """
    if "-" not in date:
        return int(date)

    # only the first two "-"-separated parts are used
    first_year, second_year = date.split("-")[:2]

    # round() rounds halves to the nearest even integer,
    # so "1990-1999" (average 1994.5) gives 1994
    return round((int(first_year) + int(second_year)) / 2)

# convert every stripped date string into an integer
# by passing it through process_date()
processed_test_data = [process_date(raw) for raw in stripped_test_data]

# clean the date value stored at index 6 of each row:
# strip the bad characters first, then convert the
# remaining string with process_date()
for row in moma:
    row[6] = process_date(strip_characters(row[6]))

The str.format() method is a powerful tool that helps us write easy-to-read code while combining strings with other variables.

There are also extra things that str.format() can do with formatting numbers, but for now we'll focus on inserting values into strings.