In [1]:
import numpy as np
import pandas as pd
from pandas import Series
from pandas import DataFrame

### Exercise

True or False? 

In [2]:
ser = pd.Series([1, 5, -2], index=list("abc"))
ser.drop(['b'], inplace=True)

result = ser.iloc[1] == -2

In [3]:
print(result)

True


In [6]:
print(ser)

a    1
c   -2
dtype: int64


<font color="red">True</font>. The drop method modifies the Series with inplace=True. 

### Exercise

True or False? 

In [7]:
df = pd.DataFrame(np.arange(12).reshape(4,3))

df.iloc[1] *= 2
result = df.iloc[1,2] == 10

In [8]:
print(result)

True


In [9]:
df

Unnamed: 0,0,1,2
0,0,1,2
1,6,8,10
2,6,7,8
3,9,10,11


<font color="red">True</font>. The multiplication step multiplies every cell in the whole row by 2. 

### Exercise

True or False? 

In [10]:
df = pd.DataFrame(np.arange(12).reshape(4,3),
                 index=list("abcd"), columns=list("xyz"))

df.iloc[1] *= 2
result = df.iloc[1,2] == 10

In [11]:
result

True

Exactly the same as above. The index and column names provided
have nothing to do with iloc. 

### Exercise

True or False? 

In [16]:
df = pd.DataFrame(np.arange(12).reshape(4,3),
                 index=list("abcd"), columns=list("xyz"))

df["y"] *= 2
result = df.iloc[1,0] == 2

In [17]:
result

False

<font color="red">False</font>. Only the second column (`y`) was multiplied by 2. `df.iloc[1,0]` accesses the first column (`x`) of the second row, which still holds 3, so the comparison with 2 fails. 

#### <font color="brown">Working with NaNs</font>

#### Remove rows/columns with NaNs using dropna

In [19]:
from numpy import nan as NA
datf = DataFrame([[1, 3.8, 2.1],
                  [2, NA, NA],
                  [NA, NA, NA],
                  [NA, 4.8, 1.7]])
datf

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


**Drop rows that have an NaN in any column**

In [20]:
datf.dropna()

Unnamed: 0,0,1,2
0,1.0,3.8,2.1


In [21]:
datf

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


The original is not modified; use `inplace=True` to modify the DataFrame that `dropna` is called on.

In [22]:
# dropna(inplace=True) is applied to the copy datf1, so only datf1 is changed
datf1 = datf.copy()
datf1.dropna(inplace=True)
# NOTE(review): this echoes datf (the untouched original), not datf1 —
# echo datf1 instead to see the result of the in-place drop
datf

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


In [23]:
# to do the same with columns, pass axis=1
datf.dropna(axis=1)

0
1
2
3


**To drop only those rows/columns in which ALL values are NaN**

In [24]:
# drop rows that have NaNs in all columns
datf.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
3,,4.8,1.7


In [25]:
# drop columns that have NaNs in all rows (axis=1 switches from rows to columns)
datf.dropna(how='all',axis=1)  # none of the columns are entirely NAs

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


#### Filling NaNs with values

**Replace all NaNs with single value**

In [26]:
datf

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


In [27]:
datf.fillna(99999)

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,99999.0,99999.0
2,99999.0,99999.0,99999.0
3,99999.0,4.8,1.7


**Replace all NaNs in row or column using ffill (forward fill) or bfill (backward fill)**

In [28]:
datf

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


In [29]:
# column-wise backward fill: each NaN takes the next valid value below it in
# the same column; NaNs with no valid value below them stay NaN.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill').
datf.bfill()

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,4.8,1.7
2,,4.8,1.7
3,,4.8,1.7


In [30]:
datf   # original not modified

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


In [31]:
# row-wise forward fill: each NaN takes the last valid value to its left in
# the same row; NaNs with no valid value to their left stay NaN.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
datf.ffill(axis=1)

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,2.0,2.0
2,,,
3,,4.8,1.7


**Replace all NaNs in multiple columns using dictionary**

In [32]:
datf

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


In [33]:
datf.fillna({1: 2.5, 2: 1.5})

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,2.5,1.5
2,,2.5,1.5
3,,4.8,1.7


In [34]:
datf

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


**Treat column/row separately as Series, and use fillna**

In [35]:
# column as Series and fillna
datf[2].fillna(1.5)

0    2.1
1    1.5
2    1.5
3    1.7
Name: 2, dtype: float64

In [36]:
# row as Series: fill its NaNs, then assign the filled row back to the copy.
# Calling fillna(inplace=True) directly on datfc.loc[2] acts on a temporary
# Series and is not guaranteed to update datfc (e.g. under Copy-on-Write).
datfc = datf.copy()
datfc.loc[2] = datfc.loc[2].fillna(-1)
datfc

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,-1.0,-1.0,-1.0
3,,4.8,1.7


---
#### <font color="brown">One way to deal with missing numeric data is to replace with mean</font>

In [37]:
# read the CSV inside a with-block so the file handle is closed once loaded
with open("auto_mpg_original.csv") as mpgfile:
    mpgs = pd.read_csv(mpgfile)
mpgs

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0,chevrolet chevelle malibu
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0,buick skylark 320
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0,plymouth satellite
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0,amc rebel sst
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0,ford torino
...,...,...,...,...,...,...,...,...,...
401,27.0,4.0,140.0,86.0,2790.0,15.6,82.0,1.0,ford mustang gl
402,44.0,4.0,97.0,52.0,2130.0,24.6,82.0,2.0,vw pickup
403,32.0,4.0,135.0,84.0,2295.0,11.6,82.0,1.0,dodge rampage
404,28.0,4.0,120.0,79.0,2625.0,18.6,82.0,1.0,ford ranger


In [38]:
mpgs['mpg'].mean()

23.514572864321607

In [39]:
mpgs2 = mpgs.copy()

In [40]:
mpgs2[mpgs2['mpg'].isnull()]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
10,,4.0,133.0,115.0,3090.0,17.5,70.0,2.0,citroen ds-21 pallas
11,,8.0,350.0,165.0,4142.0,11.5,70.0,1.0,chevrolet chevelle concours (sw)
12,,8.0,351.0,153.0,4034.0,11.0,70.0,1.0,ford torino (sw)
13,,8.0,383.0,175.0,4166.0,10.5,70.0,1.0,plymouth satellite (sw)
14,,8.0,360.0,175.0,3850.0,11.0,70.0,1.0,amc rebel sst (sw)
17,,8.0,302.0,140.0,3353.0,8.0,70.0,1.0,ford mustang boss 302
39,,4.0,97.0,48.0,1978.0,20.0,71.0,2.0,volkswagen super beetle 117
367,,4.0,121.0,110.0,2800.0,15.4,81.0,2.0,saab 900s


##### **Use fillna method on relevant column (Series)**

In [41]:
mpgs2['mpg'] = mpgs2['mpg'].fillna(mpgs2['mpg'].mean())

In [42]:
mpgs2.loc[10:14]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
10,23.514573,4.0,133.0,115.0,3090.0,17.5,70.0,2.0,citroen ds-21 pallas
11,23.514573,8.0,350.0,165.0,4142.0,11.5,70.0,1.0,chevrolet chevelle concours (sw)
12,23.514573,8.0,351.0,153.0,4034.0,11.0,70.0,1.0,ford torino (sw)
13,23.514573,8.0,383.0,175.0,4166.0,10.5,70.0,1.0,plymouth satellite (sw)
14,23.514573,8.0,360.0,175.0,3850.0,11.0,70.0,1.0,amc rebel sst (sw)


In [43]:
mpgs[mpgs['horsepower'].isnull()]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
38,25.0,4.0,98.0,,2046.0,19.0,71.0,1.0,ford pinto
133,21.0,6.0,200.0,,2875.0,17.0,74.0,1.0,ford maverick
337,40.9,4.0,85.0,,1835.0,17.3,80.0,2.0,renault lecar deluxe
343,23.6,4.0,140.0,,2905.0,14.3,80.0,1.0,ford mustang cobra
361,34.5,4.0,100.0,,2320.0,15.8,81.0,2.0,renault 18i
382,23.0,4.0,151.0,,3035.0,20.5,82.0,1.0,amc concord dl


In [44]:
mpgs2['horsepower'] = mpgs2['horsepower'].fillna(mpgs2['horsepower'].mean())

In [45]:
mpgs2.loc[[38,133]]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
38,25.0,4.0,98.0,105.0825,2046.0,19.0,71.0,1.0,ford pinto
133,21.0,6.0,200.0,105.0825,2875.0,17.0,74.0,1.0,ford maverick


In [46]:
mpgs2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406 entries, 0 to 405
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           406 non-null    float64
 1   cylinders     406 non-null    float64
 2   displacement  406 non-null    float64
 3   horsepower    406 non-null    float64
 4   weight        406 non-null    float64
 5   acceleration  406 non-null    float64
 6   model year    406 non-null    float64
 7   origin        406 non-null    float64
 8   car name      406 non-null    object 
dtypes: float64(8), object(1)
memory usage: 28.7+ KB


### <font color="brown">Regular Expressions</font>

Tutorials can be found at the following sites

1. https://www.w3schools.com/python/python_regex.asp
2. https://developers.google.com/edu/python/regular-expressions#basic-patterns
3. https://docs.python.org/3/howto/regex.html?highlight=regular%20expressions

And the site https://regex101.com/ has a regular expression engine you can use to try things out.


#### <font color="brown">Import the re module</font>

In [51]:
import re

#### <font color="brown">Search for a pattern in a string using re.search function</font>

In [52]:
res = re.search('a','cat')  # search for pattern 'a' in target 'cat'
res

<re.Match object; span=(1, 2), match='a'>

**search returns a Match object: span(1,2) is the span from start index to end index (exclusive)<br>
of target string "cat" where the match is found, and match gives the actual match**

In [53]:
res = re.search('a','dog')
print(res)

None


**If you simply echo res, nothing will be echoed since res is None, see below**

In [54]:
res

**So it's good policy to print the return from search, in case the return was None**

In [55]:
print ('matched') if re.search('a','dog') else print('not matched')

res = re.search('a','dog')
if res:
    print('matched')
else:
    print('not matched')

not matched
not matched


**search returns the first occurrence of a match, in case there are multiple matches**

In [56]:
res = re.search('ar','barbaric')  
print(res)

<re.Match object; span=(1, 3), match='ar'>


In [1]:
# when searching, because failure is possible, use condition
def searchit(pattern,astr):
    """Report whether pattern is found anywhere in astr.

    Returns 'Matched' when re.search finds the pattern, 'No match' otherwise.
    """
    # re.search gives back None on failure, so test for that explicitly
    found = re.search(pattern, astr) is not None
    return 'Matched' if found else 'No match'

print(searchit('a','cat'))
print(searchit('a','dog'))
print(searchit('ar','barbaric'))

Matched
No match
Matched

**<font color="red">Matching literal strings is faster with string method</font>**

In [58]:
def findit(litstr,target):
    """Report whether the literal string litstr occurs in target.

    Uses str.find, which returns -1 when litstr is absent.
    Returns 'Matched' or 'No match'.
    """
    if target.find(litstr) == -1:
        return 'No match'
    else:
        return 'Matched'

# exercise findit itself (the string-method version), not the regex-based searchit
print(findit('a','cat'))
print(findit('a','dog'))
print(findit('ar','barbaric'))

Matched
No match
Matched


In [59]:
def findit(litstr,target):
    """Return 'Matched' if litstr occurs in target, else 'No match'.

    Written as a single conditional expression; str.find returns -1 when
    litstr is absent.
    """
    res = 'No match' if target.find(litstr) == -1 else 'Matched'
    return res

# exercise findit itself (the string-method version), not the regex-based searchit
print(findit('a','cat'))
print(findit('a','dog'))
print(findit('ar','barbaric'))

Matched
No match
Matched


#### <font color="brown">Writing regexp patterns with metacharacters</font>

**Metacharacter [ ] is used for a class of characters<br>
Metacharacter * means 0 or more of preceding character/class<br>
Metacharacter + means 1 or more of preceding character/class**

**Example 1**<br>
Search for any sequence of characters that starts with 'a', ends with 't', and has zero or more 'c's in between

In [63]:
test_strings = ["at", "act", "acccct", "attraction", "account"]
for s in test_strings:
    res = re.search('ac*t', s)  # uses metacharacter '*'
    print(f"'{s}' matches") if res else print(f"'{s}' does not match")

'at' matches
'act' matches
'acccct' matches
'attraction' matches
'account' does not match


**Example 2**<br>
Search for any sequence of characters that starts with 'a', ends with 't', and has AT LEAST one 'c' in between

In [64]:
test_strings = ["at", "act", "acccct", "attraction", "account"]
for s in test_strings:
    res = re.search('ac+t',s)  # uses metacharacter +
    print(f"'{s}' matches") if res else print(f"'{s}' does not match")

'at' does not match
'act' matches
'acccct' matches
'attraction' matches
'account' does not match


**Example 3**<br>
Search for any sequence that starts with a, ends with t, and has any number of digits (zero included) in between

In [66]:
test_strings = ["at", "act", "a98t", "cab54tr", "spa49", "bla64trx"]
for s in test_strings:
    res = re.search('a[0-9]*t',s)  # uses metacharacters [] and *
    print(f"'{s}' matches") if res else print(f"'{s}' does not match")

'at' matches
'act' does not match
'a98t' matches
'cab54tr' does not match
'spa49' does not match
'bla64trx' matches


**Example 4**<br>
Search for any sequence that starts with a, ends with t, and has any number of letters or digits (zero included) in between

In [67]:
test_strings = ["at", "act", "a98t", "cab54tr", "spa49", "bla64trx"]
for s in test_strings:
    res = re.search('a[a-zA-Z0-9]*t',s)  # uses metacharacters [] and *
    print(f"'{s}' matches") if res else print(f"'{s}' does not match")

'at' matches
'act' matches
'a98t' matches
'cab54tr' matches
'spa49' does not match
'bla64trx' matches


**Example 5**<br>
Search for any sequence that starts with a, ends with t, and has AT LEAST one letter and one digit between, in that order<br>
i.e. between a and t, all letters must precede all digits

In [68]:
test_strings = ["at", "act", "a98t", "cab54tr", "spa49", "bla64trx"]
for s in test_strings:
    res = re.search('a[a-zA-Z]+[0-9]+t',s)  # uses metacharacters [] and +
    print(f"'{s}' matches") if res else print(f"'{s}' does not match")

'at' does not match
'act' does not match
'a98t' does not match
'cab54tr' matches
'spa49' does not match
'bla64trx' does not match


**Metacharacter . matches any character**

**Example**<br>
Search for any sequence that starts with a, ends with t, and has exactly one character (of any kind) in between

In [69]:
test_strings = ["at", "act", "a98t", "cab54tr", "spa49", "bla64trx"]
for s in test_strings:
    # NOTE(review): the pattern has no '*', so exactly one character must sit
    # between 'a' and 't'; use 'a.*t' to allow any number of characters
    res = re.search('a.t',s)  # uses metacharacter . (one arbitrary character)
    print(f"'{s}' matches") if res else print(f"'{s}' does not match")

'at' does not match
'act' matches
'a98t' does not match
'cab54tr' does not match
'spa49' does not match
'bla64trx' does not match


**Metacharacter ? matches zero or one occurrence of the preceding character/class**

**Example**<br>
Search for the sequence 'act' or 'at' in any string

In [70]:
res = re.search('ac?t','at')
print(res)
res = re.search('ac?t','act')
print(res)
res = re.search('ac?t','tractor')
print(res)
res = re.search('ac?t','accct')
print(res)

<re.Match object; span=(0, 2), match='at'>
<re.Match object; span=(0, 3), match='act'>
<re.Match object; span=(2, 5), match='act'>
None


### Exercise 1

Write a regular expression to match any valid Rutgers netID. A netID should have at least one lowercase letter followed by at least one digit. 

```
test_strings = ["jwb163", "163jwb", "j16w3b", "jwb", "163"]
for s in test_strings: 
    res = re.search(pattern, s)
    print(f"'{s}' matches") if res else print(f"'{s}' does not match")

'jwb163' matches
'163jwb' does not match
'j16w3b' matches
'jwb' does not match
'163' does not match
```

In [71]:
# one or more lowercase letters immediately followed by one or more digits;
# an empty pattern would match every test string
pattern = "[a-z]+[0-9]+"

In [76]:
test_strings = ["jwb163", "163jwb", "j16w3b", "jwb", "163"]
for s in test_strings: 
    res = re.search(pattern, s)
    print(f"'{s}' matches") if res else print(f"'{s}' does not match")

'jwb163' matches
'163jwb' does not match
'j16w3b' matches
'jwb' does not match
'163' does not match


### Exercise 2

Write a regular expression that matches any word with two consecutive vowels. In other words, any sequence of capital or lowercase letters that contains a vowel (a,e,i,o,u) followed by another vowel. 

```
test_strings = ["teacher", "student", "quizzes", "BOOT100", "CoRE"]
for s in test_strings: 
    res = re.search(pattern, s)
    print(f"'{s}' matches") if res else print(f"'{s}' does not match")

'teacher' matches
'student' does not match
'quizzes' matches
'BOOT100' matches
'CoRE' does not match
```

In [80]:
test_strings = ["teacher", "student", "quizzes", "BOOT100", "CoRE"]
# two consecutive vowels, either case; an empty pattern would match every string
pattern = "[aeiouAEIOU][aeiouAEIOU]"

In [81]:
for s in test_strings: 
    res = re.search(pattern, s)
    print(f"'{s}' matches") if res else print(f"'{s}' does not match")

'teacher' matches
'student' does not match
'quizzes' matches
'BOOT100' matches
'CoRE' does not match
