# Week 12 Data Transformations


## I. Remove Duplicates

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [4]:
# Identify duplicated rows
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [6]:
# Drop duplicated rows
data.drop_duplicates()

# If you want this data frame instead of the original one, overwrite the data frame
# data = data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [7]:
# Drop duplicated values from column k2
data.drop_duplicates(['k2'])

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [8]:
# Drop duplicated values from column k1
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2
0,one,1
1,two,1


## II. Transform Data Using a Function or Mapping

In [9]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [10]:
# Suppose that we want to map the meat type to the kind of animal:
meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}

In [13]:
# To make matching simpler, change strings to lowercase first
lowercased = data['food'].str.lower()
# lowercased
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [15]:
x = 'bacon'
meat_to_animal[x]

'pig'

In [19]:
def get_animal(x):
    x = x.lower()
    return meat_to_animal[x]

In [20]:
get_animal('Pastrami')

'cow'

In [21]:
# We can also pass a function
data['food'].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [22]:
# The function can be defined by a lambda expression
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [23]:
# The lambda expression defines a function without name.
# We can also use apply() to apply this function.
data['food'].apply(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [24]:
# Lambda function: a simple way to write a short function
def square(x):
    return x * x

square(2)

4

In [25]:
df = pd.DataFrame({'Col1': [1, 2, 3, 4]})
df

Unnamed: 0,Col1
0,1
1,2
2,3
3,4


In [26]:
df['Col2'] = df['Col1'].apply(square)
df

Unnamed: 0,Col1,Col2
0,1,1
1,2,4
2,3,9
3,4,16


In [27]:
# The following statement squares the values without explictly defining a function
df['Col3'] = df['Col1'].apply(lambda x: x ** 2)
df

Unnamed: 0,Col1,Col2,Col3
0,1,1,1
1,2,4,4
2,3,9,9
3,4,16,16


In [28]:
# lambda expression with a branch
def abs_val(x):
    if x >= 0:
        return x
    else:
        return -x
    
abs_val(-100)

100

In [29]:
df['Col1'].apply(abs_val)

0    1
1    2
2    3
3    4
Name: Col1, dtype: int64

In [None]:
df['abs'] = df['Col1'].apply(lambda x: x if x >= 0 else -x)
df

In [31]:
# Ex: Determine whether a student passes the class.
grades = pd.DataFrame([["Alice", 98],
                        ["Bob", 87],
                        ["Charle", 76],
                        ["David", 65],
                        ["Eva", 54], 
                        ["Fred", 43],
                        ["Gill", 32]],
                       columns=["Name", "Grade"])
grades

Unnamed: 0,Name,Grade
0,Alice,98
1,Bob,87
2,Charle,76
3,David,65
4,Eva,54
5,Fred,43
6,Gill,32


In [35]:
# I want to create a column to indicate whether a student passes the class or not.

# Option 1: create a function to check a grade, and then apply the function to the grade column.
def grade_check(grade):
    
    if grade >= 60:
        return "Pass"
    else:
        return "Fail"
    
# grades['Grade'].apply(grade_check)
# grades['Grade'].map(grade_check)

# Option 2: apply a lambda expression to determine whether a value is pass or fail.
# grades['Grade'].apply(lambda y: 1 if y >= 60 else 0)
grades['Grade'].apply(lambda y: "Pass" if y >= 60 else "Fail")

0    Pass
1    Pass
2    Pass
3    Pass
4    Fail
5    Fail
6    Fail
Name: Grade, dtype: object

In [36]:
# What if the transformation requires more than one argument?
# Suppose we need to calculate the final grade using the following formula:
# Final = Homework * 0.2 + Test * 0.4 + Project * 0.4
grades = pd.DataFrame([["Alice", 100, 90, 80],
                       ["Bob", 98, 87, 76],
                       ["Charle", 57, 68, 80]],
                      columns=["Name", "Homework", "Test", "Project"])
grades

Unnamed: 0,Name,Homework,Test,Project
0,Alice,100,90,80
1,Bob,98,87,76
2,Charle,57,68,80


In [37]:
# The final grade can be calculated directly.
grades['Final'] = grades['Homework'] * 0.2 + grades['Test'] * 0.4 + grades['Project'] * 0.4
grades

Unnamed: 0,Name,Homework,Test,Project,Final
0,Alice,100,90,80,88.0
1,Bob,98,87,76,84.8
2,Charle,57,68,80,70.6


In [38]:
# Instead, we can define a function on a row to calculate the final grade
def cal_final(row):
    return row['Homework'] * 0.2 + row['Test'] * 0.4 + row['Project'] * 0.4

cal_final(grades.loc[0, :])

88.0

In [40]:
# Apply this function to the entire data frame.
grades.apply(cal_final, axis=1) # If the entire row is the input of a function, use axis=1.

0    88.0
1    84.8
2    70.6
dtype: float64

In [43]:
# What if I want to curve the grades? I want to drop the lowest grade from the calculation.
def cal_final(x):
    
    min_grade = x[['Homework', 'Test', 'Project']].min()
    
    if min_grade == x['Homework']:
        final_grade = x['Test'] * 0.5 + x['Project'] * 0.5
    elif min_grade == x['Test']:
        final_grade = x['Homework'] * 0.4 + x['Project'] * 0.6
    else:
        final_grade = x['Homework'] * 0.4 + x['Test'] * 0.6
        
    return final_grade

In [45]:
grades.apply(cal_final, axis=1)

0    94.0
1    91.4
2    74.0
dtype: float64

In [47]:
# Apply a function with additional arguments
# Let's use credit score and an approval threshold to determine whether to approve a loan application
applications = pd.DataFrame([["Alice", 500, 100_000],
                             ["Bob", 600, 200_000],
                             ["Charle", 700, 300_000],
                             ["David", 800, 400_000]],
                            columns=["Name", "CreditScore", "LoanAmount"])
applications

Unnamed: 0,Name,CreditScore,LoanAmount
0,Alice,500,100000
1,Bob,600,200000
2,Charle,700,300000
3,David,800,400000


In [49]:
def loan_approval(credit_score, threshold):
    if credit_score > threshold:
        return "Approve"
    else:
        return "Reject"
    
loan_approval(500, 350)

'Approve'

In [55]:
# If the current threshold is 550, what happens to the applications?
applications['CreditScore'].apply(loan_approval, args=(750,))

0     Reject
1     Reject
2     Reject
3    Approve
Name: CreditScore, dtype: object

## III. String Manipulation


In [56]:
# Use split() to separate a string
string = "a, b, c, d"
string.split(',')

['a', ' b', ' c', ' d']

In [58]:
# split() is often combined with strip to trim whitespace
string_pieces = string.split(',')
print(string_pieces)
string_pieces_cleaned = [x.strip() for x in string_pieces]
print(string_pieces_cleaned)

['a', ' b', ' c', ' d']
['a', 'b', 'c', 'd']


In [59]:
# Use + to concatenate strings
string = "I" + " " + "like" + " " + "pizza."
print(string)

I like pizza.


In [60]:
# Use join() to concatenate a list of strings with delimiter
names = ["Alex", "Brian", "Charlie", "Douglas"]
string = "\n".join(names)
print(string)

Alex
Brian
Charlie
Douglas


In [66]:
# Use index() and find() to detect a substring
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# print("DEF" in alphabet)
# print(alphabet.find("GHI")) # find() will return -1 if the substring does not exist
# print(alphabet.index("Alex")) # index() will return an error if the substring does not exist
print(alphabet.find("abc"))

-1


In [None]:
# Extract the substring from alphabet starting at index 10, ending at index 20
substring = alphabet[10:20]
print(substring)

In [68]:
# count() returns the number of occurences of a substring
print(alphabet.count("DEF"))
print(string.count("\n"))

1
3


In [69]:
# replace() is used to replace a substring for another
print(string.replace("Alex", "Alexander"))

Alexander
Brian
Charlie
Douglas


In [70]:
# replace() can also be used to delete a substring:
print(string.replace("\n", ""))

AlexBrianCharlieDouglas


## Regular Expressions
**Regular expressions** provide a flexible way to search or match complex string patterns in text.Python's built-in `re` module is responsible for applying regular expressions to string. Let's have a look at some examples.

In [72]:
import re
# Example 1: Split a string with a variable number of whitespace
string = "a  b    c    d \t e  \n  f   g"
print(string)
# string.split(' ') # This does not work
pieces = re.split('\s+', string) # \s represents the whitespace character, + means one or more.
print(pieces)

a  b    c    d 	 e  
  f   g
['a', 'b', 'c', 'd', 'e', 'f', 'g']


Useful `re` functions:
- findall()
- search()
- split()
- sub()

In [73]:
re.findall('\s+', string)

['  ', '    ', '    ', ' \t ', '  \n  ', '   ']

In [74]:
match = re.search('\s+', string)
# print(match)
print("Substring:", match.group())
print("Location:", match.span())
print("Start:", match.start())
print("End:", match.end())

Substring:   
Location: (1, 3)
Start: 1
End: 3


In [75]:
re.sub('\s+', ',', string)

'a,b,c,d,e,f,g'

**Construct a regular expression:**

[Reference](https://medium.com/factory-mind/regex-tutorial-a-simple-cheatsheet-by-examples-649dc1c3f285)

1. Anchors
    - ^The: **Starts with** The
    - day\$: **Ends with** day
2. Quantifiers:
    - ab\s\*: ab followed by **zero or more** whitespaces
    - ab\s+: ab followed by **one or more** whitespaces
    - ab\s?: ab followed by **zero or one** whitespaces
    - ab\s{2}: ab followed by **exactly 2** whitespaces
    - ab\s{2, 5}: ab followed by **2 - 5** whitespaces
    - ab\s{2, }: ab followed by **2 or more** whitespaces
3. OR operator
    - a(b|c): a followed by **b or c**
    - a[bc]: same as above
4. Character classes
    - \d: a single digit
    - \w: a single letter or underscore
    - \s: a single whitespace
    - .: any character
    - \D: a single non-digit
    - \W: a single character that is not a letter or underscore
    - \S: a single non-space
5. Bracket expression
    - [a-c]: a or b or c
    - [0-7]: a digit between 0 and 7
    - [^a-c]: a letter not a, b, or c
6. Greedy match
    - <*+{}>: any character included in <>, **expanding as far as possible**
7. Capturing:
    - a(bc): **capture** the group with value bc

In [None]:
# Example 1: Extract Social Security Number
string = "Is 123-45-6789 your SSN? Answer: No."
pattern = ".*(\d{3})-(\d{2})-(\d{4}).*"
regex = re.compile(pattern)
# print(regex)
match = regex.match(string)
# print(match)
print("".join(match.groups()))

In [1]:
# Example 2: Extract dollar amount
# Is there a dollar amount in the sentence?
sentence = "The price of a cup of coffee is $3.50."
pattern = ??

# What is the dollar amount?



SyntaxError: invalid syntax (<ipython-input-1-ddcd590df85a>, line 4)