# Chapter 8. Data Wrangling: Join, Combine, and Reshape
## 8.1 Hierarchical Indexing

In [1]:
import pandas as pd
import numpy as np
data = pd.Series(np.random.randn(9),
                 index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                        [1, 2, 3, 1, 3, 1, 2, 2, 3]])
data

a  1    0.678199
   2    2.037511
   3    0.001289
b  1   -0.919798
   3   -0.960764
c  1   -1.304534
   2   -0.406015
d  2    0.520970
   3    1.547000
dtype: float64

In [2]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])

In [3]:
data['b']

1   -0.919798
3   -0.960764
dtype: float64

In [4]:
data['b':'c']

b  1   -0.919798
   3   -0.960764
c  1   -1.304534
   2   -0.406015
dtype: float64

In [5]:
data.loc[['b', 'd']]

b  1   -0.919798
   3   -0.960764
d  2    0.520970
   3    1.547000
dtype: float64

In [6]:
data.loc[:, 2]

a    2.037511
c   -0.406015
d    0.520970
dtype: float64

In [7]:
data.unstack()

Unnamed: 0,1,2,3
a,0.678199,2.037511,0.001289
b,-0.919798,,-0.960764
c,-1.304534,-0.406015,
d,,0.52097,1.547


In [8]:
data.unstack().stack()

a  1    0.678199
   2    2.037511
   3    0.001289
b  1   -0.919798
   3   -0.960764
c  1   -1.304534
   2   -0.406015
d  2    0.520970
   3    1.547000
dtype: float64

In [9]:
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                     columns=[['Ohio', 'Ohio', 'Colorado'],
                              ['Green', 'Red', 'Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [10]:
frame.index.names = ['key1', 'key2']
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [11]:
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [12]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [13]:
pd.MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'],
                          ['Green', 'Red', 'Green']],
                          names=['state', 'color'])

MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],
           labels=[[1, 1, 0], [0, 1, 0]],
           names=['state', 'color'])

In [14]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [15]:
frame.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [16]:
frame.sort_index(level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [17]:
frame.swaplevel('key1', 'key2').sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [18]:
frame.swaplevel(0, 1).sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [19]:
# Add up the rows that share the same 'key2' label. The `level=` keyword of
# `DataFrame.sum` was deprecated and then removed in pandas 2.0; grouping on
# the index level is the supported spelling and produces the same table.
frame.groupby(level='key2').sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [20]:
# Add up the columns that share the same 'color' label. `DataFrame.sum` no
# longer accepts `level=` (removed in pandas 2.0) and column-wise groupby is
# itself deprecated, so group the transposed frame and transpose back.
frame.T.groupby(level='color').sum().T

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [21]:
frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
                      'c': ['one', 'one', 'one', 'two', 'two','two', 'two'],
                      'd': [0, 1, 2, 0, 1, 2, 3]})
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [22]:
frame2 = frame.set_index(['c', 'd'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [23]:
frame.set_index(['c', 'd'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [24]:
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


## 8.2 Combining and Merging Datasets
### Database-Style DataFrame Joins

In [25]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                    'data2': range(3)})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [26]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [27]:
pd.merge(df1, df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [28]:
pd.merge(df1, df2, on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [29]:
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                    'data2': range(3)})
pd.merge(df3, df4, left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [30]:
pd.merge(df1, df2, how='outer')

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


In [31]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                    'data2': range(5)})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [32]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [33]:
pd.merge(df1, df2, on='key', how='left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,b,5,1.0


In [34]:
pd.merge(df1, df2, how='inner')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,b,5,1
5,b,5,3
6,a,2,0
7,a,2,2
8,a,4,0
9,a,4,2


In [35]:
left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
                     'key2': ['one', 'two', 'one'],
                     'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                      'key2': ['one', 'one', 'one', 'two'],
                      'rval': [4, 5, 6, 7]})
pd.merge(left, right, on=['key1', 'key2'], how='outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


In [36]:
pd.merge(left, right, on='key1')

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [37]:
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


### Merging on Index

In [38]:
left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                      'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [39]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [40]:
pd.merge(left1, right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [41]:
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


In [42]:
lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio','Nevada', 'Nevada'],
                      'key2': [2000, 2001, 2002, 2001, 2002],
                      'data': np.arange(5.)})
righth = pd.DataFrame(np.arange(12).reshape((6, 2)),
                      index=[['Nevada', 'Nevada', 'Ohio', 'Ohio','Ohio', 'Ohio'],
                             [2001, 2000, 2000, 2000, 2001, 2002]],
                      columns=['event1', 'event2'])
lefth

Unnamed: 0,key1,key2,data
0,Ohio,2000,0.0
1,Ohio,2001,1.0
2,Ohio,2002,2.0
3,Nevada,2001,3.0
4,Nevada,2002,4.0


In [43]:
righth

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11


In [44]:
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)

Unnamed: 0,key1,key2,data,event1,event2
0,Ohio,2000,0.0,4,5
0,Ohio,2000,0.0,6,7
1,Ohio,2001,1.0,8,9
2,Ohio,2002,2.0,10,11
3,Nevada,2001,3.0,0,1


In [45]:
pd.merge(lefth, righth, left_on=['key1', 'key2'],
         right_index=True, how='outer')

Unnamed: 0,key1,key2,data,event1,event2
0,Ohio,2000,0.0,4.0,5.0
0,Ohio,2000,0.0,6.0,7.0
1,Ohio,2001,1.0,8.0,9.0
2,Ohio,2002,2.0,10.0,11.0
3,Nevada,2001,3.0,0.0,1.0
4,Nevada,2002,4.0,,
4,Nevada,2000,,2.0,3.0


In [46]:
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                     index=['a', 'c', 'e'],
                     columns=['Ohio', 'Nevada'])
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                      index=['b', 'c', 'd', 'e'],
                      columns=['Missouri', 'Alabama'])
left2

Unnamed: 0,Ohio,Nevada
a,1.0,2.0
c,3.0,4.0
e,5.0,6.0


In [47]:
right2

Unnamed: 0,Missouri,Alabama
b,7.0,8.0
c,9.0,10.0
d,11.0,12.0
e,13.0,14.0


In [48]:
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [49]:
left2.join(right2, how='outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [50]:
left1.join(right1, on='key')

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0
5,c,5,


In [51]:
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                       index=['a', 'c', 'e', 'f'],
                       columns=['New York', 'Oregon'])
another

Unnamed: 0,New York,Oregon
a,7.0,8.0
c,9.0,10.0
e,11.0,12.0
f,16.0,17.0


In [52]:
left2.join([right2, another])

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0


In [53]:
left2.join([right2, another], how='outer', sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  verify_integrity=True)


Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
b,,,7.0,8.0,,
c,3.0,4.0,9.0,10.0,9.0,10.0
d,,,11.0,12.0,,
e,5.0,6.0,13.0,14.0,11.0,12.0
f,,,,,16.0,17.0


# Chapter 7 problem set 2
## John

Find me Beer using this https://api.openbrewerydb.org/breweries?

1. Create a DataFrame containing every brewery in the database, using the API address above. You will need the `page` and `per_page` query parameters; the maximum value of `per_page` is 50.

In [54]:
import re
import requests

url_base = 'https://api.openbrewerydb.org/breweries?per_page=50&page='

# Walk the paginated endpoint until it answers with an empty page. The pages
# are collected in a list and concatenated once: `DataFrame.append` was removed
# in pandas 2.0, and re-copying the growing frame for every page is quadratic.
pages = []
page = 1
while True:
    response = requests.get(url_base + str(page), timeout=30)
    response.raise_for_status()  # stop on an HTTP error instead of parsing its body
    new = pd.DataFrame(response.json())
    if new.empty:
        break
    pages.append(new)
    page += 1

# Each page keeps its own row labels, exactly as the append-based loop did, so
# labels repeat across pages; select rows with boolean masks, not by label.
dat = pd.concat(pages) if pages else pd.DataFrame()
dat

Unnamed: 0,brewery_type,city,country,id,latitude,longitude,name,phone,postal_code,state,street,tag_list,updated_at,website_url
0,micro,Birmingham,United States,2,33.524521,-86.774322,Avondale Brewing Co,2057775456,35222-1932,Alabama,201 41st St S,[],2018-08-23T23:19:57.825Z,http://www.avondalebrewing.com
1,micro,Tuscaloosa,United States,4,33.1984907123707,-87.5621551272424,Band of Brothers Brewing Company,2052665137,35401-4653,Alabama,1605 23rd Ave,[],2018-08-23T23:19:59.462Z,http://www.bandofbrosbrewing.com
2,micro,Birmingham,United States,44,33.5128492349817,-86.7914000624146,Trim Tab Brewing,2057030536,35233-3401,Alabama,2721 5th Ave S,[],2018-08-23T23:20:31.423Z,http://www.trimtabbrewing.com
3,micro,Huntsville,United States,46,34.7277523,-86.5932014,Yellowhammer Brewery,2569755950,35805-3046,Alabama,2600 Clinton Ave W,[],2018-08-23T23:20:33.102Z,http://www.yellowhammerbrewery.com
4,micro,Wasilla,United States,55,61.5752695,-149.4127103,Bearpaw River Brewing Co,,99654-7679,Alaska,4605 E Palmer Wasilla Hwy,[],2018-08-23T23:20:40.743Z,http://bearpawriverbrewing.com
5,micro,Anchorage,United States,76,61.1384893547315,-149.879076042937,King Street Brewing Co,9073365464,99515,Alaska,9050 King Street,[],2018-08-23T23:20:57.179Z,http://www.kingstreetbrewing.com
6,micro,Tucson,United States,94,32.2467372722906,-110.992750525872,1912 Brewing,5202564851,85745-1444,Arizona,2045 N Forbes Blvd Ste 105,[],2018-08-23T23:21:11.302Z,http://www.1912brewing.com
7,contract,Scottsdale,United States,98,33.4972615652174,-111.924474347826,Bad Water Brewing,5207459175,85251-3914,Arizona,4216 N Brown Ave,[],2018-08-23T23:21:15.169Z,http://www.badwaterbrewing.com
8,brewpub,Chandler,United States,104,33.3053455,-111.911126,BJs Restaurant & Brewery - Chandler,4809170631,85226-5175,Arizona,3155 W Chandler Blvd,[],2018-08-23T23:21:21.165Z,http://www.bjsrestaurants.com
9,micro,Tucson,United States,107,32.201608314954,-110.821778571134,BlackRock Brewers,5202073203,85710-6767,Arizona,1664 S Research Loop Ste 200,[],2018-08-23T23:21:23.794Z,http://www.brb.beer


2. Filter this data set down to only micro breweries in states whose names begin and end with the same letter.


In [55]:
# A state name qualifies when its first character is repeated as its last one,
# ignoring case (e.g. "Alabama", "Ohio").
pattern = r'^(.).*\1$'
# Compiled form kept under its original name in case other cells reuse it.
regex = re.compile(pattern, flags=re.IGNORECASE)

is_micro = dat['brewery_type'] == 'micro'
# `str.match` replaces `str.contains`, which emits a UserWarning for patterns
# with capture groups. The pattern is anchored at both ends, so both select the
# same rows. `na=False` treats a missing state as "no match", which removes the
# need for the `== True` comparison.
same_first_last = dat['state'].str.match(pattern, flags=re.IGNORECASE, na=False)

# Copy so that later column assignments act on an independent frame rather
# than on a slice of the downloaded data.
dat = dat.loc[is_micro & same_first_last].copy()
dat

  """


Unnamed: 0,brewery_type,city,country,id,latitude,longitude,name,phone,postal_code,state,street,tag_list,updated_at,website_url
0,micro,Birmingham,United States,2,33.524521,-86.774322,Avondale Brewing Co,2057775456,35222-1932,Alabama,201 41st St S,[],2018-08-23T23:19:57.825Z,http://www.avondalebrewing.com
1,micro,Tuscaloosa,United States,4,33.1984907123707,-87.5621551272424,Band of Brothers Brewing Company,2052665137,35401-4653,Alabama,1605 23rd Ave,[],2018-08-23T23:19:59.462Z,http://www.bandofbrosbrewing.com
2,micro,Birmingham,United States,44,33.5128492349817,-86.7914000624146,Trim Tab Brewing,2057030536,35233-3401,Alabama,2721 5th Ave S,[],2018-08-23T23:20:31.423Z,http://www.trimtabbrewing.com
3,micro,Huntsville,United States,46,34.7277523,-86.5932014,Yellowhammer Brewery,2569755950,35805-3046,Alabama,2600 Clinton Ave W,[],2018-08-23T23:20:33.102Z,http://www.yellowhammerbrewery.com
4,micro,Wasilla,United States,55,61.5752695,-149.4127103,Bearpaw River Brewing Co,,99654-7679,Alaska,4605 E Palmer Wasilla Hwy,[],2018-08-23T23:20:40.743Z,http://bearpawriverbrewing.com
5,micro,Anchorage,United States,76,61.1384893547315,-149.879076042937,King Street Brewing Co,9073365464,99515,Alaska,9050 King Street,[],2018-08-23T23:20:57.179Z,http://www.kingstreetbrewing.com
6,micro,Tucson,United States,94,32.2467372722906,-110.992750525872,1912 Brewing,5202564851,85745-1444,Arizona,2045 N Forbes Blvd Ste 105,[],2018-08-23T23:21:11.302Z,http://www.1912brewing.com
9,micro,Tucson,United States,107,32.201608314954,-110.821778571134,BlackRock Brewers,5202073203,85710-6767,Arizona,1664 S Research Loop Ste 200,[],2018-08-23T23:21:23.794Z,http://www.brb.beer
10,micro,Tucson,United States,127,32.2504946147872,-111.005452051979,Dragoon Brewing Co,5203293606,85745-1214,Arizona,1859 W Grant Rd Ste 111,[],2018-08-23T23:21:40.563Z,http://www.dragoonbrewing.com
11,micro,Williams,United States,141,35.2500282,-112.1892168,Grand Canyon Brewing Company,8005132072,86046-2530,Arizona,233 W Route 66,[],2018-08-23T23:21:53.397Z,http://www.grandcanyonbrewingco.com


3. From the breweries found in part 2, find the farthest north, south, east, and west breweries. You may need to change the dtype of the columns

North

In [56]:
# Convert both coordinate columns to numeric dtype so max()/min() compare them
# as numbers. NOTE(review): presumably the API delivers them as strings (the
# exercise hints at a dtype change) -- confirm against the raw response.
dat[['longitude', 'latitude']] = dat[['longitude', 'latitude']].apply(pd.to_numeric)
# Northernmost brewery: the row(s) whose latitude equals the column maximum.
dat.loc[dat['latitude'] == dat['latitude'].max()]

Unnamed: 0,brewery_type,city,country,id,latitude,longitude,name,phone,postal_code,state,street,tag_list,updated_at,website_url
42,micro,Fox,United States,88,64.957086,-147.621976,Silver Gulch Brewing Co,9074522739,99712,Alaska,2195 Old Steese Highway,[],2018-08-23T23:21:07.021Z,http://www.silvergulch.com


South

In [57]:
dat.loc[dat['latitude'] == dat['latitude'].min()]

Unnamed: 0,brewery_type,city,country,id,latitude,longitude,name,phone,postal_code,state,street,tag_list,updated_at,website_url
18,micro,Gulf Shores,United States,6,30.278051,-87.683039,Big Beach Brewing Company,2519482337,36542-3104,Alabama,300 E 24th Ave,[],2018-08-23T23:20:01.170Z,http://www.bigbeachbrewing.com


East

In [58]:
dat.loc[dat['longitude'] == dat['longitude'].max()]

Unnamed: 0,brewery_type,city,country,id,latitude,longitude,name,phone,postal_code,state,street,tag_list,updated_at,website_url
6,micro,Columbiana,United States,5355,40.888487,-80.69353,BirdFish Brewing Co,3303339385,44408-1348,Ohio,16 S Main St,[],2018-08-24T15:43:26.570Z,http://www.birdfishbrew.com


West

In [59]:
dat.loc[dat['longitude'] == dat['longitude'].min()]

Unnamed: 0,brewery_type,city,country,id,latitude,longitude,name,phone,postal_code,state,street,tag_list,updated_at,website_url
36,micro,Kodiak,United States,78,57.790083,-152.407155,"Kodiak Island Brewing Co, LLC",9074862537,99615-6580,Alaska,117 Lower Mill Bay Rd,[],2018-08-23T23:20:58.860Z,http://www.kodiakbrewery.com


## Rie  

1. Read the CSV file named "sRNAalignmentToTE.ForChap7Assignment.013119.csv" from my repository and inspect the data. (https://github.com/UCD-pbio-rclub/python-data-analysis_RieU/blob/master/sRNAalignmentToTE.ForChap7Assignment.013119.csv)

In [60]:
url = 'https://raw.githubusercontent.com/UCD-pbio-rclub/python-data-analysis_RieU/master/sRNAalignmentToTE.ForChap7Assignment.013119.csv'
dat = pd.read_csv(url)
dat.head()

Unnamed: 0,totalReads,Aligned,notAligned,libName
0,1530841,37407 (2.44%),1493434 (97.56%),cot-EP-EP-1.qual.clipped.noRDNAorTDNA.Gmax_189...
1,526938,125711 (23.86%),401227 (76.14%),cot-EP-EP-1.qual.clipped.noRDNAorTDNA.Gmax_189...
2,2889626,816373 (28.25%),2073253 (71.75%),cot-EP-EP-1.qual.clipped.noRDNAorTDNA.Gmax_189...
3,1541795,40918 (2.65%),1500877 (97.35%),cot-EP-EP-2.qual.clipped.noRDNAorTDNA.Gmax_189...
4,629472,140798 (22.37%),488674 (77.63%),cot-EP-EP-2.qual.clipped.noRDNAorTDNA.Gmax_189...


2. Some columns contain a count followed by a percentage in parentheses. Remove the parentheses and move the percentage values into separate columns.

In [61]:
# "<count> (<percent>)": group 1 is the count, group 2 the text between the
# parentheses. Raw string so the backslashes reach the regex engine unchanged.
pattern = re.compile(r'(\d+)\s*\(([^)]+)\)')


def split_count_and_percent(frame, column):
    """Split a "<count> (<percent>)" text column into two columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding `column`.
    column : str
        Name of a text column whose values look like "37407 (2.44%)".

    Returns
    -------
    pandas.DataFrame
        A new frame in which `column` holds only the count and a new column
        `column + '_Perc'` holds the text that was inside the parentheses.
        Both remain strings; values that do not fit the pattern become NaN.
    """
    # Searching for the pattern, rather than splitting on whitespace and picking
    # tokens by position, gives the same result with or without leading blanks.
    parts = frame[column].str.extract(pattern.pattern)
    return frame.assign(**{column: parts[0], column + '_Perc': parts[1]})


for col in ('Aligned', 'notAligned'):
    dat = split_count_and_percent(dat, col)

# Put each percentage column next to the count it belongs to.
dat = dat[['totalReads','Aligned','Aligned_Perc',
           'notAligned','notAligned_Perc','libName']]
dat.head()

Unnamed: 0,totalReads,Aligned,Aligned_Perc,notAligned,notAligned_Perc,libName
0,1530841,37407,2.44%,1493434,97.56%,cot-EP-EP-1.qual.clipped.noRDNAorTDNA.Gmax_189...
1,526938,125711,23.86%,401227,76.14%,cot-EP-EP-1.qual.clipped.noRDNAorTDNA.Gmax_189...
2,2889626,816373,28.25%,2073253,71.75%,cot-EP-EP-1.qual.clipped.noRDNAorTDNA.Gmax_189...
3,1541795,40918,2.65%,1500877,97.35%,cot-EP-EP-2.qual.clipped.noRDNAorTDNA.Gmax_189...
4,629472,140798,22.37%,488674,77.63%,cot-EP-EP-2.qual.clipped.noRDNAorTDNA.Gmax_189...


### OR

In [62]:
dat = pd.read_csv(url)
dat.head()

Unnamed: 0,totalReads,Aligned,notAligned,libName
0,1530841,37407 (2.44%),1493434 (97.56%),cot-EP-EP-1.qual.clipped.noRDNAorTDNA.Gmax_189...
1,526938,125711 (23.86%),401227 (76.14%),cot-EP-EP-1.qual.clipped.noRDNAorTDNA.Gmax_189...
2,2889626,816373 (28.25%),2073253 (71.75%),cot-EP-EP-1.qual.clipped.noRDNAorTDNA.Gmax_189...
3,1541795,40918 (2.65%),1500877 (97.35%),cot-EP-EP-2.qual.clipped.noRDNAorTDNA.Gmax_189...
4,629472,140798 (22.37%),488674 (77.63%),cot-EP-EP-2.qual.clipped.noRDNAorTDNA.Gmax_189...


In [63]:
# Alternative approach: cut each value at the first literal " (".
# Tuple-unpacking the `.str` accessor and passing `n` positionally are not
# supported by current pandas, so the pieces are taken from the three-column
# frame returned by `str.partition` (text before | separator | text after).
for col in ('Aligned', 'notAligned'):
    parts = dat[col].str.partition(' (')
    # strip() guards against stray blanks around the count.
    # NOTE(review): the raw values may carry a leading space -- confirm.
    dat[col] = parts[0].str.strip()
    # What follows the separator is "<percent>)"; drop the closing parenthesis.
    dat[col + '_Perc'] = parts[2].str.replace(')', '', regex=False)

# Put each percentage column next to the count it belongs to.
dat = dat[['totalReads','Aligned','Aligned_Perc',
           'notAligned','notAligned_Perc','libName']]
dat.head()

Unnamed: 0,totalReads,Aligned,Aligned_Perc,notAligned,notAligned_Perc,libName
0,1530841,37407,2.44%,1493434,97.56%,cot-EP-EP-1.qual.clipped.noRDNAorTDNA.Gmax_189...
1,526938,125711,23.86%,401227,76.14%,cot-EP-EP-1.qual.clipped.noRDNAorTDNA.Gmax_189...
2,2889626,816373,28.25%,2073253,71.75%,cot-EP-EP-1.qual.clipped.noRDNAorTDNA.Gmax_189...
3,1541795,40918,2.65%,1500877,97.35%,cot-EP-EP-2.qual.clipped.noRDNAorTDNA.Gmax_189...
4,629472,140798,22.37%,488674,77.63%,cot-EP-EP-2.qual.clipped.noRDNAorTDNA.Gmax_189...


## Kae

Let's do some regex practice!

1. Create a random DNA sequence of length 30. Using regex, please convert the DNA sequence to RNA. 

In [64]:
import random
random.choices('ACGT', k=30)

['A',
 'G',
 'A',
 'A',
 'T',
 'C',
 'T',
 'C',
 'G',
 'G',
 'G',
 'A',
 'G',
 'T',
 'C',
 'G',
 'A',
 'T',
 'C',
 'A',
 'G',
 'T',
 'G',
 'A',
 'T',
 'G',
 'C',
 'G',
 'A',
 'A']

In [65]:
DNA = ''.join(random.choices('ACGT', k=30))
DNA

'GAGCAGCCCCCCAATATCAACAACGTCCGG'

In [66]:
RNA = re.sub(pattern='T',repl='U', string=DNA)
RNA

'GAGCAGCCCCCCAAUAUCAACAACGUCCGG'

2. I work for a telemarketing company and I've been given a list of phone numbers the branch needs to call today. Luckily for me, I'm only responsible for calling properly formatted (XXX)XXX-XXXX numbers with an area code of either 603 or 503. What regular expression can I use to pull these numbers from my list?
Try [this](https://pythex.org) if you're having trouble.

In [67]:
# Matches phone numbers whose parenthesised area code is 603 or 503.
# - Raw string: `\(` is not a valid Python string escape, so a plain literal
#   triggers an invalid-escape warning on recent interpreters.
# - `-?` accepts both "(603)123-4636" (the format named in the exercise) and
#   "(603)-123-4636" (the format used in the sample list below).
# - `(?!\d)` rejects numbers with extra trailing digits instead of matching
#   their first ten digits.
pat = re.compile(r'\((?:603|503)\)-?[0-9]{3}-[0-9]{4}(?!\d)')
text = '''
(408)-345-3462
(603)-123-4636
(503)-654-3462
534-325-1234
(435)3452345
'''
for number in pat.findall(text):
    print(number)

(603)-123-4636
(503)-654-3462


## Min-Yao

1. Use the same data as last week (import my RNA-Seq CPM data from the 'Expression Browser_CPM_practice.xlsx' file). Remove the genes that have no expression in any sample, and keep the remaining zero-expression values as "0". Check the data distribution in each sample.

In [68]:
dat = pd.read_excel('datasets/Expression Browser_CPM_practice.xlsx')
dat = dat.set_index('Name')
dat.head()

Unnamed: 0_level_0,6_c1,6_c2,6_c3,6_c4,6_c5,6_c6,6_c7,6_t1,6_t2,6_t3,...,3_t5,2_c1,2_c2,2_c3,2_c4,2_c5,2_t1,2_t2,2_t3,2_t4
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Solyc00g005000.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005005.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005040.3,0.0,0.136237,0.0,0.0,0.0,0.0,0.0,0.075741,0.3031,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005050.3,49.947944,50.680249,42.857629,46.142383,40.925485,49.050587,49.010294,47.224546,42.28247,50.970454,...,46.069768,39.626619,33.60176,38.84896,28.487619,42.516459,47.526986,47.021348,51.069134,44.480035
Solyc00g005055.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
dat.shape

(34879, 45)

In [70]:
from numpy import nan as NA  # retained: other cells may still refer to NA

# Keep only genes with a non-zero value in at least one sample.
# The earlier 0 -> NaN -> dropna -> 0 round trip gave the same rows but took
# three passes over the table and upcast integer columns to float. Missing
# readings are still turned into 0, as before.
# NOTE(review): confirm that a missing reading really means "no expression".
dat = dat.fillna(0)
expressed_somewhere = dat.ne(0).any(axis=1)
dat = dat.loc[expressed_somewhere]
dat.head()

Unnamed: 0_level_0,6_c1,6_c2,6_c3,6_c4,6_c5,6_c6,6_c7,6_t1,6_t2,6_t3,...,3_t5,2_c1,2_c2,2_c3,2_c4,2_c5,2_t1,2_t2,2_t3,2_t4
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Solyc00g005040.3,0.0,0.136237,0.0,0.0,0.0,0.0,0.0,0.075741,0.3031,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005050.3,49.947944,50.680249,42.857629,46.142383,40.925485,49.050587,49.010294,47.224546,42.28247,50.970454,...,46.069768,39.626619,33.60176,38.84896,28.487619,42.516459,47.526986,47.021348,51.069134,44.480035
Solyc00g005080.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.305098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005092.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.426326
Solyc00g005094.1,0.376966,0.408712,0.0,0.245438,0.0,0.0,0.414172,0.0,0.15155,0.175156,...,0.152549,0.0,0.143597,0.0,0.0,0.0,0.0,0.0,0.0,0.142109


In [71]:
dat.shape

(26530, 45)

In [72]:
dat.describe()

Unnamed: 0,6_c1,6_c2,6_c3,6_c4,6_c5,6_c6,6_c7,6_t1,6_t2,6_t3,...,3_t5,2_c1,2_c2,2_c3,2_c4,2_c5,2_t1,2_t2,2_t3,2_t4
count,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,...,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0
mean,36.598147,37.105485,37.612951,34.935191,35.230142,36.39223,38.527525,38.004096,35.577524,38.712518,...,42.557067,36.053277,33.909712,35.077746,37.022215,36.349321,39.992789,37.728212,37.273658,34.006642
std,175.86016,182.554396,198.361426,180.871834,187.530635,185.152845,200.491516,172.71791,169.968162,164.421427,...,205.227539,182.206055,177.271108,184.505609,185.257155,160.316945,179.388742,167.655179,172.473552,178.841637
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.136237,0.0,0.0,0.0,0.0,0.138057,0.151482,0.15155,0.175156,...,0.152549,0.144623,0.143597,0.184995,0.111716,0.0,0.129149,0.176772,0.123355,0.142109
50%,3.769656,4.223354,4.125333,4.172449,3.87133,4.265268,3.865601,4.582334,4.698052,4.554061,...,4.271369,4.194058,4.73871,4.624876,4.580362,4.332888,4.649379,4.596072,4.687505,4.831697
75%,24.502765,24.795176,24.293629,24.052944,23.781025,23.763638,24.436118,24.50223,24.096461,24.171556,...,23.339982,24.007367,24.124341,24.234351,24.242405,23.560076,24.280091,24.217762,24.301013,24.442703
max,12737.66825,13164.33092,14629.57779,14130.12291,15457.66627,13963.87941,16121.07341,12598.99373,12763.2438,10792.59963,...,12504.43352,13895.20397,13132.97515,13235.28582,11974.18394,11524.93924,11238.19488,10325.78196,15249.07061,14770.4979


#### 2. We want to filter outliers. Please find the genes whose expression is higher than 15000 in at least one sample.

In [73]:
# Genes with at least one sample whose magnitude exceeds 15000. `axis` is
# passed by keyword because `DataFrame.any` rejects positional arguments in
# pandas 2.x.
dat[(np.abs(dat) > 15000).any(axis=1)]

Unnamed: 0_level_0,6_c1,6_c2,6_c3,6_c4,6_c5,6_c6,6_c7,6_t1,6_t2,6_t3,...,3_t5,2_c1,2_c2,2_c3,2_c4,2_c5,2_t1,2_t2,2_t3,2_t4
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Solyc04g071070.2,101.026786,152.721933,129.489629,78.785664,111.162466,122.778797,164.011914,769.566954,196.105791,4508.870924,...,5902.574676,416.947231,391.876939,163.350629,550.425476,855.745289,7660.239411,1124.976912,2474.75594,173.372659
Solyc07g064160.3,7335.373974,9455.136137,11287.37027,10797.56296,10431.02124,9576.136875,9831.188897,5971.50024,7993.205342,2997.448067,...,3733.939481,11076.21839,11513.19799,11466.17816,8909.474766,5437.232249,3896.437931,8015.902431,5273.196437,12671.83607
Solyc09g010800.4,12737.66825,13164.33092,14629.57779,14130.12291,15457.66627,13963.87941,16121.07341,12598.99373,12763.2438,10792.59963,...,12504.43352,13895.20397,13132.97515,13235.28582,9783.430216,11524.93924,11238.19488,10325.78196,15249.07061,14770.4979


3. We would like to transform the outliers. Please change every expression level exceeding 15000 in absolute value to 15000, then check the new data distribution in each sample.


In [74]:
# Cap every value whose magnitude exceeds 15000 at 15000. This modifies `dat`
# in place, so re-running earlier cells will not restore the uncapped values.
# NOTE(review): a value below -15000 would also become +15000 (sign is not
# preserved); presumably expression levels are never negative -- confirm.
dat[np.abs(dat) > 15000] = 15000
# Re-check the per-sample summary after capping.
dat.describe()

Unnamed: 0,6_c1,6_c2,6_c3,6_c4,6_c5,6_c6,6_c7,6_t1,6_t2,6_t3,...,3_t5,2_c1,2_c2,2_c3,2_c4,2_c5,2_t1,2_t2,2_t3,2_t4
count,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,...,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0,26530.0
mean,36.598147,37.105485,37.612951,34.935191,35.212891,36.39223,38.485268,38.004096,35.577524,38.712518,...,42.557067,36.053277,33.909712,35.077746,37.022215,36.349321,39.992789,37.728212,37.264269,34.006642
std,175.86016,182.554396,198.361426,180.871834,186.127678,185.152845,197.192736,172.71791,169.968162,164.421427,...,205.227539,182.206055,177.271108,184.505609,185.257155,160.316945,179.388742,167.655179,171.650311,178.841637
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.136237,0.0,0.0,0.0,0.0,0.138057,0.151482,0.15155,0.175156,...,0.152549,0.144623,0.143597,0.184995,0.111716,0.0,0.129149,0.176772,0.123355,0.142109
50%,3.769656,4.223354,4.125333,4.172449,3.87133,4.265268,3.865601,4.582334,4.698052,4.554061,...,4.271369,4.194058,4.73871,4.624876,4.580362,4.332888,4.649379,4.596072,4.687505,4.831697
75%,24.502765,24.795176,24.293629,24.052944,23.781025,23.763638,24.436118,24.50223,24.096461,24.171556,...,23.339982,24.007367,24.124341,24.234351,24.242405,23.560076,24.280091,24.217762,24.301013,24.442703
max,12737.66825,13164.33092,14629.57779,14130.12291,15000.0,13963.87941,15000.0,12598.99373,12763.2438,10792.59963,...,12504.43352,13895.20397,13132.97515,13235.28582,11974.18394,11524.93924,11238.19488,10325.78196,15000.0,14770.4979


## Joel

1. Create a function that determines whether a string is a palindrome.

In [75]:
def isPalindrome(input_string):
    """Print whether `input_string` reads the same forwards and backwards.

    The comparison is case-insensitive and ignores every character other than
    the ASCII letters a-z and the digits 0-9, so spaces and punctuation do not
    count. The verdict is written to standard output, quoting the string as it
    was passed in; nothing is returned.
    """
    cleaned = re.sub(r'[^a-z0-9]', '', input_string.lower())
    # A palindrome is equal to its own reversal.
    if cleaned == cleaned[::-1]:
        verdict = '" is a palindrome'
    else:
        verdict = '" is not a palindrome'
    print('"', input_string, verdict, sep='')

In [76]:
isPalindrome('racecar')

"racecar" is a palindrome


In [77]:
isPalindrome('Was it a car or a cat I saw')

"Was it a car or a cat I saw" is a palindrome


In [78]:
isPalindrome('GCCATCCG')

"GCCATCCG" is not a palindrome


# HW Problem for next week

In [79]:
flights = pd.read_excel('datasets/nycflights13/flights.xlsx')
weather = pd.read_table('datasets/nycflights13/weather.tsv')
airlines = pd.read_csv('datasets/nycflights13/airlines.csv')

In [80]:
# Parse the hourly timestamp column into datetimes.
# NOTE(review): presumably needed so it compares equal to flights.time_hour
# in the merge below -- confirm the dtype read from the Excel file.
weather['time_hour'] = pd.to_datetime(weather['time_hour'])

# Drop the 'hour' column before merging. errors='ignore' keeps the cell
# re-runnable: without it a second run raises KeyError, because `weather`
# has already lost the column.
weather = weather.drop(columns='hour', errors='ignore')

# Merge airlines into flights on whatever columns the two tables share.
# NOTE(review): this looks like it joins on 'carrier' only -- confirm.
new = pd.merge(flights, airlines)

# Attach the weather observed at the departure airport in the departure hour.
# The key names are identical on both sides, so a single `on=` replaces the
# duplicated left_on/right_on lists.
new = pd.merge(new, weather,
               on=['origin','year','month','day','time_hour'])
print(new.head())
print(new.shape)

   year  month  day  dep_time  sched_dep_time  dep_delay  arr_time  \
0  2013      1    1     558.0             600       -2.0     924.0   
1  2013      1    1     611.0             600       11.0     945.0   
2  2013      1    1     628.0             630       -2.0    1137.0   
3  2013      1    1     656.0             659       -3.0     949.0   
4  2013      1    1     557.0             600       -3.0     838.0   

   sched_arr_time  arr_delay carrier  ...                     name   temp  \
0             917        7.0      UA  ...    United Air Lines Inc.  39.02   
1             931       14.0      UA  ...    United Air Lines Inc.  39.02   
2            1140       -3.0      AA  ...   American Airlines Inc.  39.02   
3             959      -10.0      AA  ...   American Airlines Inc.  39.02   
4             846       -8.0      B6  ...          JetBlue Airways  39.02   

    dewp  humid  wind_dir  wind_speed  wind_gust  precip pressure visib  
0  26.06  59.37     260.0    12.65858     

In [81]:
new2 = new.set_index(['year','month','day','origin'])
print(new2.head())
print(new2.shape)

                       dep_time  sched_dep_time  dep_delay  arr_time  \
year month day origin                                                  
2013 1     1   JFK        558.0             600       -2.0     924.0   
               JFK        611.0             600       11.0     945.0   
               JFK        628.0             630       -2.0    1137.0   
               JFK        656.0             659       -3.0     949.0   
               JFK        557.0             600       -3.0     838.0   

                       sched_arr_time  arr_delay carrier  flight tailnum dest  \
year month day origin                                                           
2013 1     1   JFK                917        7.0      UA     194  N29129  LAX   
               JFK                931       14.0      UA     303  N532UA  SFO   
               JFK               1140       -3.0      AA     413  N3BAAA  SJU   
               JFK                959      -10.0      AA    1815  N5FMAA  MCO   
         