# Urbanmetry Data Cleaning Challenge

In [185]:
import pandas as pd
import numpy as np

## Challenge 1: String Scrubbing
The paragraph given below contains special characters that are usually the 
    source of great evil that disrupts subsequent functions if left untreated.
    Write a method to cleanse the paragraph of non-alphanumeric characters,
    and make the result all lowercase and trimmed.

    paragraph = "Grant me the ’S3R3NITY’ to accept the things I cannot change –
                  The ’C0URAGE’ to change the things I can –
                    And the ’W1SD0M’ to know the difference"

    expected_answer = "grant me the s3r3nity to accept the things i cannot change the c0urage to change the things i can and the w1sd0m to know the difference"

In [54]:
# Sample text for Challenge 1. The backslash line continuations sit inside the
# string literal, so the leading indentation of the next two lines is part of
# the string value.
paragraph = "Grant me the ’S3R3NITY’ to accept the things I cannot change –\
                  The ’C0URAGE’ to change the things I can –\
                    And the ’W1SD0M’ to know the difference"

def sanitize_paragraph(paragraph):
    """Return ``paragraph`` lowercased, stripped of symbols and trimmed.

    Every character that is neither alphanumeric nor whitespace is dropped
    (not replaced by a space), so ``’S3R3NITY’`` becomes ``s3r3nity``. Runs of
    whitespace, including line breaks and indentation, collapse to a single
    space, and leading/trailing whitespace is removed.

    Args:
        paragraph: The text to clean.

    Returns:
        The cleaned, lowercase, single-spaced string.
    """
    # Filter by character class instead of a hard-coded list of symbols, so
    # any punctuation is removed, not only the two kinds found in the sample.
    kept = ''.join(ch for ch in paragraph if ch.isalnum() or ch.isspace())
    # split() with no argument both trims and collapses whitespace runs.
    return ' '.join(kept.lower().split())

# Demo call: when run as the main module, print the cleaned sample `paragraph`.
if __name__ == '__main__':
    print(sanitize_paragraph(paragraph))

grant me the s3r3nity to accept the things i cannot change the c0urage to change the things i can and the w1sd0m to know the difference


## Challenge 2: Substring Extraction
Given the following array of address strings, extract the state from each one. Your solution should accommodate all of the cases.

    addresses = [
      "SIBU - JALAN JERRWIT TIMUR, Jalan Jerrwit Timur, Sibu, Sarawak",
      "KAMPUNG KUBUR SHARIF, Bukit Rakit, Kuala Terengganu, Terengganu Darul Iman",
      "Persiaran Laksamana, Puteri Harbour, 79250, Johor",
      "LOT PT 6458, Kuala Berang, Hulu Terengganu, Terengganu",
      "PANGSAPURI CEMPAKA,Bandar Bukit Puchong, 47100 Puchong, Selangor Darul Ehsan",
      "OASIS ARA DAMANSARA, JALAN PJU 7A/1A, ARA DAMANSARA, 47301 PJ, SELANGOR, MALAYSIA",
    ]

    expected_answer = [
      "SARAWAK", "TERENGGANU", "JOHOR", "TERENGGANU", "SELANGOR", "SELANGOR"
    ]

In [55]:
# Challenge 2 fixture: free-form, comma-separated address strings. The state
# appears near the end, optionally followed by extra words or a country name.
addresses = [
    "SIBU - JALAN JERRWIT TIMUR, Jalan Jerrwit Timur, Sibu, Sarawak",
    "KAMPUNG KUBUR SHARIF, Bukit Rakit, Kuala Terengganu, Terengganu Darul Iman",
    "Persiaran Laksamana, Puteri Harbour, 79250, Johor",
    "LOT PT 6458, Kuala Berang, Hulu Terengganu, Terengganu",
    "PANGSAPURI CEMPAKA,Bandar Bukit Puchong, 47100 Puchong, Selangor Darul Ehsan",
    "OASIS ARA DAMANSARA, JALAN PJU 7A/1A, ARA DAMANSARA, 47301 PJ, SELANGOR, MALAYSIA",
]

In [184]:
def get_state(address):
    """Extract the upper-cased state name from each address string.

    Each address is split on commas. The state is taken from the last
    component, or from the one before it when the last component is the
    country name "Malaysia". An honorific suffix that starts with the word
    "Darul" (e.g. "Terengganu Darul Iman") is dropped, while multi-word state
    names such as "Negeri Sembilan" are kept whole.

    Args:
        address: An iterable of address strings, or a single address string.

    Returns:
        list[str]: One upper-cased state per input address, in input order.
        An address with no usable component yields an empty string.
    """
    # Accept a lone address string as well as a collection of them.
    if isinstance(address, str):
        address = [address]

    states = []
    for full_address in address:
        # Comma-separated components, ignoring blanks left by stray commas.
        parts = [part.strip() for part in full_address.split(',') if part.strip()]
        # A trailing country name is not the state; step back one component.
        if parts and parts[-1].lower() == 'malaysia':
            parts.pop()
        if not parts:
            states.append('')
            continue

        words = parts[-1].upper().split()
        # Cut the honorific ("DARUL ...") but never the first word, so the
        # result cannot become empty.
        if 'DARUL' in words[1:]:
            words = words[:words.index('DARUL', 1)]
        states.append(' '.join(words))
    return states

# Demo call: when run as the main module, print the states extracted from `addresses`.
if __name__ == '__main__':
    print(get_state(addresses))

['SARAWAK', 'TERENGGANU', 'JOHOR', 'TERENGGANU', 'SELANGOR', 'SELANGOR']


## Challenge 3: Parsing
Given the following array of coordinates, convert it into Well-Known Text (WKT) format.
    Wiki link on WKT and its format: https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry
    Hint: recursion might be a friend here.

    coordinate_pair = [[[[30,20], [45,40], [10,40], [30,20]]], [[[15,5], [40,10], [10,20], [5,10], [15,5]]]]
    expected_answer = "MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 20, 5 10, 15 5)))"

In [251]:
# Challenge 3 fixture: nested lists whose innermost items are [x, y] pairs.
coordinate_pair = [
    [[[30, 20], [45, 40], [10, 40], [30, 20]]],
    [[[15, 5], [40, 10], [10, 20], [5, 10], [15, 5]]],
]
# The exact WKT text the conversion is expected to produce.
expected_answer = (
    "MULTIPOLYGON ("
    "((30 20, 45 40, 10 40, 30 20)), "
    "((15 5, 40 10, 10 20, 5 10, 15 5))"
    ")"
)

In [256]:
# static answer
# Flatten the nested list: its text form minus brackets and commas leaves
# whitespace-separated numbers.
cp_str = str(coordinate_pair).translate(str.maketrans('', '', '][,'))
# Parse every token into an integer.
cp = [int(token) for token in cp_str.split()]
# Slot the 18 integers into the fixed two-polygon template.
output = (
    f'MULTIPOLYGON ((({cp[0]} {cp[1]}, {cp[2]} {cp[3]}, {cp[4]} {cp[5]}, {cp[6]} {cp[7]})), '
    f'(({cp[8]} {cp[9]}, {cp[10]} {cp[11]}, {cp[12]} {cp[13]}, {cp[14]} {cp[15]}, {cp[16]} {cp[17]})))'
)
output == expected_answer

True

In [330]:
# dynamic answer
def coord_to_wkt(coordinate_pair):
    """Convert nested multipolygon coordinates into a WKT string.

    The nesting is walked recursively, so any number of polygons, rings per
    polygon and points per ring is supported. A point (a sequence of numbers)
    renders as its values separated by spaces; every enclosing level renders
    as a parenthesised, comma-separated group of its children.

    Args:
        coordinate_pair: Nested lists/tuples shaped
            polygons -> rings -> points -> numbers, e.g.
            ``[[[[30, 20], [45, 40], [10, 40], [30, 20]]]]``.

    Returns:
        str: Text such as ``"MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)))"``,
        or ``"MULTIPOLYGON EMPTY"`` when there are no polygons.
    """
    def _render(node):
        # Base case: a point, i.e. a non-empty sequence whose items are numbers.
        if node and not isinstance(node[0], (list, tuple)):
            return ' '.join(str(value) for value in node)
        # Recursive case: wrap the rendered children in one pair of parentheses.
        return '(' + ', '.join(_render(child) for child in node) + ')'

    # WKT spells a geometry with no members as "<TYPE> EMPTY".
    if not coordinate_pair:
        return 'MULTIPOLYGON EMPTY'
    return 'MULTIPOLYGON ' + _render(coordinate_pair)

# Demo check: when run as the main module, print whether the conversion of
# `coordinate_pair` equals `expected_answer`.
if __name__ == '__main__':
    print(expected_answer == coord_to_wkt(coordinate_pair))

True


## Challenge 4: is_palindrome?
    A palindrome is a sequence of characters that reads the same backward as forward, e.g.
    KAYAK when spelled in reverse is still KAYAK; likewise MADAM, RACECAR, etc.
    Write a method that accepts a string & returns true or false depending on 
    whether the input is a palindrome or not.
    Bonus points for the elegant recursive solution!

In [342]:
# Sample word passed to the palindrome check below.
pal = "KAYAK"

In [344]:
def is_palindrome(word):
    """Return True when ``word`` reads the same forwards and backwards.

    The comparison is exact: it is case-sensitive, and spaces or punctuation
    count as characters. Empty and single-character inputs are palindromes.

    Args:
        word: The string to test.

    Returns:
        bool: True if ``word`` equals its own reverse, otherwise False.
    """
    return word[::-1] == word

if __name__ == '__main__':
    # is_palindrome returns a bool; print it so the cell shows the result.
    print(is_palindrome(pal))

True


---
<center><h1>END