## Concatenating Along an Axis

In [1]:
import numpy as np
import pandas as pd

In [2]:
# 3x4 grid of the integers 0..11, used as the NumPy concatenation example.
arr = np.arange(12).reshape(3, 4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [3]:
# Glue two copies of arr together along axis 1, doubling the number of columns.
np.concatenate([arr, arr], axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [4]:
# Three Series with non-overlapping labels.
s1 = pd.Series([0, 1], index=list('ab'))
s2 = pd.Series([2, 3, 4], index=list('cde'))
s3 = pd.Series([5, 6], index=list('fg'))
# Show each input, then the default (axis=0) concatenation as the cell result.
for series in (s1, s2, s3):
    print(series)
pd.concat([s1, s2, s3])

a    0
b    1
dtype: int64
c    2
d    3
e    4
dtype: int64
f    5
g    6
dtype: int64


a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [5]:
# With axis=1 each Series becomes its own column; labels a Series lacks show up as NaN.
pd.concat([s1, s2, s3], axis=1, sort=True)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [6]:
# s4 carries s1's labels ('a', 'b') followed by s3's ('f', 'g').
s4 = pd.concat([s1, s3])
s4

a    0
b    1
f    5
g    6
dtype: int64

In [7]:
# s1 only has 'a' and 'b', so its column is NaN in the 'f' and 'g' rows.
pd.concat([s1, s4], axis=1, sort=True)

Unnamed: 0,0,1
a,0.0,0
b,1.0,1
f,,5
g,,6


In [8]:
# join='inner' keeps only the labels present in both Series ('a' and 'b').
pd.concat([s1, s4], axis=1, join='inner')

Unnamed: 0,0,1
a,0,0
b,1,1


In [9]:
# join_axes was deprecated in pandas 0.25 and removed in 1.0; reindexing the
# concatenated result selects the same row labels in the same order, with NaN
# for labels ('c', 'e') that neither Series has.
pd.concat([s1, s4], axis=1, sort=True).reindex(['a', 'c', 'b', 'e'])

Unnamed: 0,0,1
a,0.0,0.0
c,,
b,1.0,1.0
e,,


In [10]:
# keys= labels each input piece, giving a two-level index of (key, original label).
result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
result

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

In [11]:
# Pivot the inner index level into columns; key/label pairs that do not exist become NaN.
result.unstack()

Unnamed: 0,a,b,f,g
one,0.0,1.0,,
two,0.0,1.0,,
three,,,5.0,6.0


In [12]:
# When concatenating along axis=1 the keys become the column headers.
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'], sort=True)

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [13]:
# 3x2 frame holding 0..5, rows labelled a-c.
df1 = pd.DataFrame(
    np.arange(6).reshape((3, 2)),
    index=list('abc'),
    columns=['one', 'two'],
)
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [14]:
# 2x2 frame holding 5..8; it has no 'b' row, unlike df1.
df2 = pd.DataFrame(
    np.arange(5, 9).reshape(2, 2),
    index=['a', 'c'],
    columns=['three', 'four'],
)
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [15]:
# keys= adds an outer column level recording which frame each column came from.
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],sort=True)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [16]:
# Passing a dict instead of a list: the dict keys supply the outer column level.
pd.concat({'level1': df1, 'level2': df2}, axis=1, sort=True)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [17]:
# names= labels the two column levels: 'upper' for the keys, 'lower' for the original columns.
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],names=['upper', 'lower'], sort=True)

upper,level1,level1,level2,level2
lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [18]:
# Seed NumPy's global generator so the random frames built from here on are
# reproducible when the notebook is re-run from a fresh kernel.
np.random.seed(12345)
# 3x4 frame of standard-normal noise with columns a-d.
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df1

Unnamed: 0,a,b,c,d
0,1.07344,0.140652,0.20664,-1.669721
1,0.096926,0.024758,1.016471,-3.018453
2,0.103581,-0.443821,0.401455,-0.125793


In [19]:
# 2x3 frame of standard-normal noise; its columns are a reordered subset of df1's.
df2 = pd.DataFrame(
    np.random.standard_normal((2, 3)),
    columns=['b', 'd', 'a'],
)
df2

Unnamed: 0,b,d,a
0,-1.411391,2.186646,0.19558
1,-0.91412,0.516333,2.739942


In [20]:
# Row-wise concat: columns are matched by name, so df2's missing 'c' becomes NaN.
# The original row labels are kept, hence the repeated 0 and 1.
pd.concat([df1, df2], sort=True)

Unnamed: 0,a,b,c,d
0,1.07344,0.140652,0.20664,-1.669721
1,0.096926,0.024758,1.016471,-3.018453
2,0.103581,-0.443821,0.401455,-0.125793
0,0.19558,-1.411391,,2.186646
1,2.739942,-0.91412,,0.516333


In [21]:
# ignore_index=True discards the original row labels and renumbers the rows from 0.
pd.concat([df1, df2], ignore_index=True, sort=True)

Unnamed: 0,a,b,c,d
0,1.07344,0.140652,0.20664,-1.669721
1,0.096926,0.024758,1.016471,-3.018453
2,0.103581,-0.443821,0.401455,-0.125793
3,0.19558,-1.411391,,2.186646
4,2.739942,-0.91412,,0.516333


## Combining Data with Overlap

In [22]:
# Series with missing values at both ends, labelled in reverse alphabetical order.
a = pd.Series(
    [np.nan, 2.5, 0.0, 3.5, 4.5, np.nan],
    index=list('fedcba'),
)
a

f    NaN
e    2.5
d    0.0
c    3.5
b    4.5
a    NaN
dtype: float64

In [23]:
# Second Series over the same labels (in alphabetical order) with its own gaps.
b = pd.Series(
    [0., np.nan, 2., np.nan, np.nan, 5.],
    index=list('abcdef'),
)
b

a    0.0
b    NaN
c    2.0
d    NaN
e    NaN
f    5.0
dtype: float64

In [24]:
# np.where pairs the values up by position, not by index label: a's first entry
# (label 'f') is filled from b's first entry (label 'a'), giving 0.0.
np.where(pd.isnull(a), b, a)
# If a is null use b, otherwise use a

array([0. , 2.5, 0. , 3.5, 4.5, 5. ])

In [25]:
# Label-aligned patching: keep b's values and fill b's NaN entries from a.
b.combine_first(a)

a    0.0
b    4.5
c    2.0
d    0.0
e    2.5
f    5.0
dtype: float64

In [26]:
# Four-row frame: 'a' and 'b' have alternating gaps, 'c' is complete.
df1 = pd.DataFrame(dict(
    a=[1., np.nan, 5., np.nan],
    b=[np.nan, 2., np.nan, 6.],
    c=list(range(2, 18, 4)),
))
df1

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


In [27]:
# Five-row frame with only 'a' and 'b'; it has one more row than df1.
df2 = pd.DataFrame(dict(
    a=[5., 4., np.nan, 3., 7.],
    b=[np.nan, 3., 4., 6., 8.],
))
df2

Unnamed: 0,a,b
0,5.0,
1,4.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


In [28]:
# Fill df1's missing values from df2, matching on row and column labels.
# Row 4 exists only in df2, so it is added (with NaN in 'c').
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


# 8.3 Reshaping and Pivoting
## Reshaping with Hierarchical Indexing

In [29]:
# Small frame whose row and column axes are both named, so the names can be
# followed through stack()/unstack().
state_index = pd.Index(['Ohio', 'Colorado'], name='state')
number_columns = pd.Index(['one', 'two', 'three'], name='number')
data = pd.DataFrame(
    np.arange(6).reshape(2, 3),
    index=state_index,
    columns=number_columns,
)
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [30]:
# stack() pivots the columns into an inner index level, giving a Series
# indexed by (state, number).
result = data.stack()
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [31]:
# unstack() reverses the stack: the innermost level ('number') returns to the columns.
result.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [32]:
# Unstack level 0 ('state') instead of the innermost level.
result.unstack(0)

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [33]:
# The level can also be given by name; this matches unstack(0).
result.unstack('state')

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [34]:
# Four values labelled a-d.
s1 = pd.Series(list(range(4)), index=list('abcd'))
s1

a    0
b    1
c    2
d    3
dtype: int64

In [35]:
# Three values labelled c-e, overlapping s1 only on 'c' and 'd'.
s2 = pd.Series(list(range(4, 7)), index=list('cde'))
s2

c    4
d    5
e    6
dtype: int64

In [36]:
# The two pieces carry different labels, so not every (key, label) pair exists.
data2 = pd.concat([s1, s2], keys=['one', 'two'])
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

In [37]:
# Unstacking introduces NaN for the key/label pairs that are missing.
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


In [38]:
# stack() drops the NaN entries by default, so the original labels come back
# (the values are now float because of the intermediate NaNs).
data2.unstack().stack()

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

In [39]:
# dropna=False keeps the NaN placeholders in the stacked result.
data2.unstack().stack(dropna=False)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

In [40]:
# Two-column frame built from the stacked Series and a shifted copy of it;
# the column axis is named 'side'.
df = pd.DataFrame({'left': result, 'right': result + 5},
                  columns=pd.Index(['left', 'right'], name='side'))
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [41]:
# Move the 'state' index level into the columns, beneath 'side'.
df.unstack('state')

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [42]:
# Then push the 'side' column level back down into the row index.
df.unstack('state').stack('side')

Unnamed: 0_level_0,state,Colorado,Ohio
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,3,0
one,right,8,5
two,left,4,1
two,right,9,6
three,left,5,2
three,right,10,7


## Pivoting “Long” to “Wide” Format

In [43]:
# Load the macroeconomic example data; the path is relative to the notebook's
# working directory. This rebinds `data`, replacing the small frame used earlier.
data = pd.read_csv('examples/macrodata.csv')
data.head()

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [44]:
# Combine the year and quarter columns into a quarterly PeriodIndex named 'date'.
# NOTE(review): recent pandas releases deprecate the year=/quarter= constructor
# keywords in favour of PeriodIndex.from_fields — confirm against the installed version.
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
periods

PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', name='date', length=203, freq='Q-DEC')

In [45]:
# The three columns to keep, as an Index named 'item' so the name carries
# through to the column axis of the reindexed frame.
columns = pd.Index(['realgdp', 'infl', 'unemp'], name='item')
columns

Index(['realgdp', 'infl', 'unemp'], dtype='object', name='item')

In [46]:
# Keep only the three selected columns. This rebinds `data`, so the full frame
# loaded from the CSV is no longer available under that name.
data = data.reindex(columns=columns)
data.head()

item,realgdp,infl,unemp
0,2710.349,0.0,5.8
1,2778.801,2.34,5.1
2,2775.488,2.74,5.3
3,2785.204,0.27,5.6
4,2847.699,2.31,5.2


In [47]:
# Replace the integer row labels with the end-of-quarter date of each period
# (assigned in place on `data`).
data.index = periods.to_timestamp('D', 'end')
data.head()

item,realgdp,infl,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31,2710.349,0.0,5.8
1959-06-30,2778.801,2.34,5.1
1959-09-30,2775.488,2.74,5.3
1959-12-31,2785.204,0.27,5.6
1960-03-31,2847.699,2.31,5.2


In [48]:
# Long format: one row per (date, item) pair. After reset_index the stacked
# values sit in a column labelled 0, which is renamed to 'value'.
ldata = data.stack().reset_index().rename(columns={0: 'value'})
ldata.head()

Unnamed: 0,date,item,value
0,1959-03-31,realgdp,2710.349
1,1959-03-31,infl,0.0
2,1959-03-31,unemp,5.8
3,1959-06-30,realgdp,2778.801
4,1959-06-30,infl,2.34


In [49]:
# First ten rows of the long-format table.
ldata.head(10)

Unnamed: 0,date,item,value
0,1959-03-31,realgdp,2710.349
1,1959-03-31,infl,0.0
2,1959-03-31,unemp,5.8
3,1959-06-30,realgdp,2778.801
4,1959-06-30,infl,2.34
5,1959-06-30,unemp,5.1
6,1959-09-30,realgdp,2775.488
7,1959-09-30,infl,2.74
8,1959-09-30,unemp,5.3
9,1959-12-31,realgdp,2785.204


In [50]:
# Wide format: one row per date, one column per item, cells taken from 'value'.
# Keyword arguments make the three roles explicit and keep working on pandas
# releases where pivot() no longer accepts positional arguments.
pivoted = ldata.pivot(index='date', columns='item', values='value')
pivoted

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31,0.00,2710.349,5.8
1959-06-30,2.34,2778.801,5.1
1959-09-30,2.74,2775.488,5.3
1959-12-31,0.27,2785.204,5.6
1960-03-31,2.31,2847.699,5.2
1960-06-30,0.14,2834.390,5.2
1960-09-30,2.70,2839.022,5.6
1960-12-31,1.21,2802.616,6.3
1961-03-31,-0.40,2819.264,6.8
1961-06-30,1.47,2872.005,7.0


In [51]:
# Add a second value column of noise drawn from NumPy's global random state.
# This mutates ldata in place.
ldata['value2'] = np.random.randn(len(ldata))
ldata[:10]

Unnamed: 0,date,item,value,value2
0,1959-03-31,realgdp,2710.349,-0.441312
1,1959-03-31,infl,0.0,0.337747
2,1959-03-31,unemp,5.8,2.095088
3,1959-06-30,realgdp,2778.801,1.181379
4,1959-06-30,infl,2.34,-0.777696
5,1959-06-30,unemp,5.1,2.12043
6,1959-09-30,realgdp,2775.488,0.515841
7,1959-09-30,infl,2.74,0.538785
8,1959-09-30,unemp,5.3,2.339774
9,1959-12-31,realgdp,2785.204,0.029368


In [52]:
# With no values= argument every remaining column is pivoted, giving
# hierarchical columns of (value column, item).
# Keywords are used because newer pandas rejects positional pivot() arguments.
pivoted = ldata.pivot(index='date', columns='item')
pivoted[:5]

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31,0.0,2710.349,5.8,0.337747,-0.441312,2.095088
1959-06-30,2.34,2778.801,5.1,-0.777696,1.181379,2.12043
1959-09-30,2.74,2775.488,5.3,0.538785,0.515841,2.339774
1959-12-31,0.27,2785.204,5.6,-0.770152,0.029368,-1.570475
1960-03-31,2.31,2847.699,5.2,1.621174,1.632525,-0.126511


In [53]:
# Selecting the outer column level 'value' returns the sub-frame of item columns.
pivoted['value'][:5]

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31,0.0,2710.349,5.8
1959-06-30,2.34,2778.801,5.1
1959-09-30,2.74,2775.488,5.3
1959-12-31,0.27,2785.204,5.6
1960-03-31,2.31,2847.699,5.2


In [54]:
# Same reshape as the pivot, spelled out: index by (date, item), then move
# 'item' into the columns.
unstacked = ldata.set_index(['date', 'item']).unstack('item')
unstacked[:7]

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31,0.0,2710.349,5.8,0.337747,-0.441312,2.095088
1959-06-30,2.34,2778.801,5.1,-0.777696,1.181379,2.12043
1959-09-30,2.74,2775.488,5.3,0.538785,0.515841,2.339774
1959-12-31,0.27,2785.204,5.6,-0.770152,0.029368,-1.570475
1960-03-31,2.31,2847.699,5.2,1.621174,1.632525,-0.126511
1960-06-30,0.14,2834.39,5.2,-0.363048,0.131994,0.50063
1960-09-30,2.7,2839.022,5.6,0.817735,-0.269541,0.143446


## Pivoting “Wide” to “Long” Format

In [55]:
# Wide example frame: an identifier column 'key' plus three value columns.
df = pd.DataFrame(dict(
    key=['foo', 'bar', 'baz'],
    A=[1, 2, 3],
    B=[4, 5, 6],
    C=[7, 8, 9],
))
df

Unnamed: 0,key,A,B,C
0,foo,1,4,7
1,bar,2,5,8
2,baz,3,6,9


In [56]:
# Melt every column except the identifier 'key' into (variable, value) pairs.
melted = pd.melt(df, id_vars=['key'])
melted

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
6,foo,C,7
7,bar,C,8
8,baz,C,9


In [57]:
# Pivot the melted frame back to wide form: one row per key, one column per variable.
# Keywords are used because newer pandas rejects positional pivot() arguments.
reshaped = melted.pivot(index='key', columns='variable', values='value')
reshaped

variable,A,B,C
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,2,5,8
baz,3,6,9
foo,1,4,7


In [58]:
# Turn the 'key' index back into an ordinary column.
reshaped.reset_index()

variable,key,A,B,C
0,bar,2,5,8
1,baz,3,6,9
2,foo,1,4,7


In [59]:
# value_vars restricts which columns are melted; 'C' is left out here.
pd.melt(df, id_vars=['key'], value_vars=['A', 'B'])

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6


In [60]:
# Without id_vars no identifier column is kept in the result.
pd.melt(df, value_vars=['A', 'B', 'C'])

Unnamed: 0,variable,value
0,A,1
1,A,2
2,A,3
3,B,4
4,B,5
5,B,6
6,C,7
7,C,8
8,C,9


In [61]:
# Any column can be melted, including 'key', which mixes strings and numbers in 'value'.
pd.melt(df, value_vars=['key', 'A', 'B'])

Unnamed: 0,variable,value
0,key,foo
1,key,bar
2,key,baz
3,A,1
4,A,2
5,A,3
6,B,4
7,B,5
8,B,6


## Homework Problems

## John

1. Read these 3 files into python and make them data frames: [flights.xlsx](https://github.com/UCD-pbio-rclub/python-data-analysis_JohnD/raw/master/datasets/nycflights13/flights.xlsx), [weather.tsv](https://github.com/UCD-pbio-rclub/python-data-analysis_JohnD/raw/master/datasets/nycflights13/weather.tsv), [airlines.csv](https://github.com/UCD-pbio-rclub/python-data-analysis_JohnD/raw/master/datasets/nycflights13/airlines.csv)

In [62]:
# Load the three tables, one per file format (Excel, tab-separated, comma-separated).
# Paths are relative to the notebook's working directory.
flights = pd.read_excel('datasets/nycflights13/flights.xlsx')
weather = pd.read_table('datasets/nycflights13/weather.tsv')
airlines = pd.read_csv('datasets/nycflights13/airlines.csv')

2. Remove the 'hour' column from the weather data frame

In [63]:
# Drop the 'hour' column from the weather table.
weather = weather.drop(columns=['hour'])

3. Inner join these three data frames. You will have to modify certain columns and be explicit about which columns to merge on. (should have 29 columns in the end)

In [64]:
# Convert weather's time_hour column to datetimes before using it as a join key.
weather['time_hour'] = pd.to_datetime(weather['time_hour'])
# The first join uses whatever columns flights and airlines have in common; the
# second names its keys explicitly (same column names on both sides).
new = flights.merge(airlines).merge(
    weather, on=['origin', 'year', 'month', 'day', 'time_hour'])
print(new.head())
print(new.shape)

   year  month  day  dep_time  sched_dep_time  dep_delay  arr_time  \
0  2013      1    1     558.0             600       -2.0     924.0   
1  2013      1    1     611.0             600       11.0     945.0   
2  2013      1    1     628.0             630       -2.0    1137.0   
3  2013      1    1     656.0             659       -3.0     949.0   
4  2013      1    1     557.0             600       -3.0     838.0   

   sched_arr_time  arr_delay carrier  ...                     name   temp  \
0             917        7.0      UA  ...    United Air Lines Inc.  39.02   
1             931       14.0      UA  ...    United Air Lines Inc.  39.02   
2            1140       -3.0      AA  ...   American Airlines Inc.  39.02   
3             959      -10.0      AA  ...   American Airlines Inc.  39.02   
4             846       -8.0      B6  ...          JetBlue Airways  39.02   

    dewp  humid  wind_dir  wind_speed  wind_gust  precip pressure visib  
0  26.06  59.37     260.0    12.65858     

4. Set year, month, day, and origin as the index

In [65]:
# Use year, month, day and origin as a four-level row index.
new2 = new.set_index(['year','month','day','origin'])
print(new2.head())
print(new2.shape)

                       dep_time  sched_dep_time  dep_delay  arr_time  \
year month day origin                                                  
2013 1     1   JFK        558.0             600       -2.0     924.0   
               JFK        611.0             600       11.0     945.0   
               JFK        628.0             630       -2.0    1137.0   
               JFK        656.0             659       -3.0     949.0   
               JFK        557.0             600       -3.0     838.0   

                       sched_arr_time  arr_delay carrier  flight tailnum dest  \
year month day origin                                                           
2013 1     1   JFK                917        7.0      UA     194  N29129  LAX   
               JFK                931       14.0      UA     303  N532UA  SFO   
               JFK               1140       -3.0      AA     413  N3BAAA  SJU   
               JFK                959      -10.0      AA    1815  N5FMAA  MCO   
         

## Min-Yao

### This question is related to Hierarchical Indexing.

1. Use the same data as last week: import my RNA-Seq CPM data from the 'Expression Browser_CPM_practice.xlsx' file ([Expression Browser_CPM_practice.xlsx](https://github.com/UCD-pbio-rclub/python-data-analysis_MinYaoJ/blob/master/Expression%20Browser_CPM_practice.xlsx)). In the column labels, the first number is the plant genotype, and the second part is one letter followed by a number, which give the treatment condition and the sample number. Please change them to multilevel hierarchical column labels, with the level names shown in front of the data. For example, '6_c1' becomes genotype = '6', treatments = 'c', sample_number = '1'.

In [66]:
# Download the CPM table from GitHub (needs network access) and index it by gene ID.
dat = pd.read_excel(
    'https://github.com/UCD-pbio-rclub/python-data-analysis_MinYaoJ/raw/master/Expression%20Browser_CPM_practice.xlsx'
).set_index('Name')
dat.head()

Unnamed: 0_level_0,6_c1,6_c2,6_c3,6_c4,6_c5,6_c6,6_c7,6_t1,6_t2,6_t3,...,3_t5,2_c1,2_c2,2_c3,2_c4,2_c5,2_t1,2_t2,2_t3,2_t4
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Solyc00g005000.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005005.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005040.3,0.0,0.136237,0.0,0.0,0.0,0.0,0.0,0.075741,0.3031,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005050.3,49.947944,50.680249,42.857629,46.142383,40.925485,49.050587,49.010294,47.224546,42.28247,50.970454,...,46.069768,39.626619,33.60176,38.84896,28.487619,42.516459,47.526986,47.021348,51.069134,44.480035
Solyc00g005055.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
# Split each column label such as '6_c1' into genotype, treatment letter and
# sample number. A regex is used instead of splitting character by character,
# so multi-digit genotypes or sample numbers (e.g. '10_c12') parse correctly.
label_parts = dat.columns.to_series().str.strip().str.extract(
    r'^(\d+)_([A-Za-z])(\d+)$', expand=True)
# Fail loudly if any label does not follow the <genotype>_<treatment><sample> pattern.
if label_parts.isnull().values.any():
    raise ValueError('unexpected column label format in dat')
# Work on a copy so `dat` keeps its flat column labels.
dat2 = dat.copy()
dat2.columns = pd.MultiIndex.from_arrays(
    [label_parts[i].values for i in range(3)],
    names=['genotype', 'treatment', 'sample_number'])
dat2.index.names = ['Name']
dat2.head()

genotype,6,6,6,6,6,6,6,6,6,6,...,3,2,2,2,2,2,2,2,2,2
treatment,c,c,c,c,c,c,c,t,t,t,...,t,c,c,c,c,c,t,t,t,t
sample_number,1,2,3,4,5,6,7,1,2,3,...,5,1,2,3,4,5,1,2,3,4
Name,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
Solyc00g005000.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005005.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005040.3,0.0,0.136237,0.0,0.0,0.0,0.0,0.0,0.075741,0.3031,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005050.3,49.947944,50.680249,42.857629,46.142383,40.925485,49.050587,49.010294,47.224546,42.28247,50.970454,...,46.069768,39.626619,33.60176,38.84896,28.487619,42.516459,47.526986,47.021348,51.069134,44.480035
Solyc00g005055.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


2. The row labels are Solyc IDs for tomato genes. "Solyc" is the 5-letter abbreviation of Solanum lycopersicum, the 2-digit number following 'Solyc' denotes the chromosome, 'g' denotes that the sequence is a gene, and the 6-digit number following the 'g' identifies the gene on the chromosome. The '.1' denotes the annotation version number of the locus. Please use hierarchical indexing to add another index level showing which chromosome each gene is on, and give that level the name 'chromosome'.

In [68]:
# Take the gene IDs from the 'Name' index level. get_level_values works on the
# single-level index and on the two-level index this cell produces, so the
# cell can be re-run without error.
name = pd.Series(dat2.index.get_level_values('Name'))
# Characters 5-6 of an ID such as 'Solyc00g005000.3' are the chromosome number.
# The vectorised .str slice replaces building a throw-away Series per gene.
chromosome = name.str[5:7]
dat2 = dat2.set_index([name, chromosome])
dat2.index.names = ['Name','chromosome']
dat2.head()

Unnamed: 0_level_0,genotype,6,6,6,6,6,6,6,6,6,6,...,3,2,2,2,2,2,2,2,2,2
Unnamed: 0_level_1,treatment,c,c,c,c,c,c,c,t,t,t,...,t,c,c,c,c,c,t,t,t,t
Unnamed: 0_level_2,sample_number,1,2,3,4,5,6,7,1,2,3,...,5,1,2,3,4,5,1,2,3,4
Name,chromosome,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3
Solyc00g005000.3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005005.1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005040.3,0,0.0,0.136237,0.0,0.0,0.0,0.0,0.0,0.075741,0.3031,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005050.3,0,49.947944,50.680249,42.857629,46.142383,40.925485,49.050587,49.010294,47.224546,42.28247,50.970454,...,46.069768,39.626619,33.60176,38.84896,28.487619,42.516459,47.526986,47.021348,51.069134,44.480035
Solyc00g005055.1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


3. In order to compare genotypes and treatments, you want to know the summary statistics of different genotypes and treatments. Please calculate the average expression level of each gene in different genotypes and treatments.

In [69]:
# Stack all three column levels into the row index, leaving one long Series of
# expression values, then summarise it per genotype.
# Note: these statistics pool every gene together; they are not per-gene values.
dat3 = dat2.stack([0,1,2])
dat3.groupby(['genotype']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,313911.0,27.671162,154.811283,0.0,0.0,0.73998,15.063526,15249.07061
3,348790.0,28.874672,160.694488,0.0,0.0,0.694779,15.169646,16068.80831
5,418548.0,30.166049,181.909613,0.0,0.0,0.653776,14.905083,20151.52348
6,488306.0,28.098021,156.406898,0.0,0.0,0.681186,14.947549,16121.07341


In [70]:
# Same summary split by treatment (all genes and genotypes pooled).
dat3.groupby(['treatment']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
c,767338.0,27.691415,166.687344,0.0,0.0,0.626216,15.033326,17197.27489
t,802217.0,29.736562,161.814741,0.0,0.0,0.725279,15.021953,20151.52348


In [71]:
# Summary for each genotype/treatment combination (all genes pooled).
dat3.groupby(['genotype','treatment']).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
genotype,treatment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,c,174395.0,27.141131,156.11703,0.0,0.0,0.73998,14.934116,13895.20397
2,t,139516.0,28.3337,153.161425,0.0,0.0,0.740132,15.202391,15249.07061
3,c,174395.0,27.711733,160.449708,0.0,0.0,0.635383,15.285141,16068.80831
3,t,174395.0,30.037611,160.930952,0.0,0.0,0.72931,15.107055,13720.27099
5,c,174395.0,27.984023,185.409603,0.0,0.0,0.651382,15.275602,17197.27489
5,t,244153.0,31.724639,179.351941,0.0,0.0,0.655992,14.749825,20151.52348
6,c,244153.0,27.860957,164.228163,0.0,0.0,0.553047,14.726292,16121.07341
6,t,244153.0,28.335086,148.173325,0.0,0.0,0.763341,15.155007,15127.03715


In [72]:
# describe() on every (Name, genotype, treatment) group was abandoned as too
# slow on the full data. The question only asks for the average, which
# groupby(...).mean() computes in a single aggregation.
gene_means = dat3.groupby(['Name','genotype','treatment']).mean()
gene_means.head(10)

In [73]:
# Per-gene summary, limited to the first 190 stacked rows (only the first few
# genes) because the full computation is too slow.
dat3.head(n=190).groupby(['Name','genotype','treatment']).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
Name,genotype,treatment,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Solyc00g005000.3,2,c,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005000.3,2,t,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005000.3,3,c,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005000.3,3,t,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005000.3,5,c,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005000.3,5,t,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005000.3,6,c,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005000.3,6,t,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005005.1,2,c,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005005.1,2,t,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Joel

These datasets contain information on US population by state/region over a period of time, as well as the state's area.

```python
population = pd.read_csv("https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv")
areas = pd.read_csv("https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv")
stateabrevs = pd.read_csv("https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-abbrevs.csv")
```

Calculate the population density for the year 2012 (using the _total_ population)

In [74]:
# Load the three tables straight from GitHub (requires network access).
population = pd.read_csv("https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv")
areas = pd.read_csv("https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv")
stateabrevs = pd.read_csv("https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-abbrevs.csv")

In [75]:
# Print the first rows of each table to see which columns can act as join keys.
for frame in (population, areas, stateabrevs):
    print(frame.head())

  state/region     ages  year  population
0           AL  under18  2012   1117489.0
1           AL    total  2012   4817528.0
2           AL  under18  2010   1130966.0
3           AL    total  2010   4785570.0
4           AL  under18  2011   1125763.0
        state  area (sq. mi)
0     Alabama          52423
1      Alaska         656425
2     Arizona         114006
3    Arkansas          53182
4  California         163707
        state abbreviation
0     Alabama           AL
1      Alaska           AK
2     Arizona           AZ
3    Arkansas           AR
4  California           CA


In [76]:
# Attach each state's area to its abbreviation (joined on their shared columns),
# then attach the population rows by matching abbreviation to state/region.
new = (
    stateabrevs
    .merge(areas)
    .merge(population, left_on='abbreviation', right_on='state/region')
)
new.head()

Unnamed: 0,state,abbreviation,area (sq. mi),state/region,ages,year,population
0,Alabama,AL,52423,AL,under18,2012,1117489.0
1,Alabama,AL,52423,AL,total,2012,4817528.0
2,Alabama,AL,52423,AL,under18,2010,1130966.0
3,Alabama,AL,52423,AL,total,2010,4785570.0
4,Alabama,AL,52423,AL,under18,2011,1125763.0


In [77]:
# Keep only the total-population rows for 2012, using one combined mask.
# .copy() makes new_2012 an independent frame, so adding a column to it later
# does not trigger pandas' SettingWithCopyWarning.
is_total_2012 = (new['ages'] == 'total') & (new['year'] == 2012)
new_2012 = new[is_total_2012].copy()
new_2012.head()

Unnamed: 0,state,abbreviation,area (sq. mi),state/region,ages,year,population
1,Alabama,AL,52423,AL,total,2012,4817528.0
95,Alaska,AK,656425,AK,total,2012,730307.0
97,Arizona,AZ,114006,AZ,total,2012,6551149.0
191,Arkansas,AR,53182,AR,total,2012,2949828.0
193,California,CA,163707,CA,total,2012,37999878.0


In [78]:
# Population density in people per square mile: population divided by area.
# The column is added to new_2012 in place.
new_2012['density'] = new_2012['population']/new_2012['area (sq. mi)']
new_2012.head()

Unnamed: 0,state,abbreviation,area (sq. mi),state/region,ages,year,population,density
1,Alabama,AL,52423,AL,total,2012,4817528.0,91.897221
95,Alaska,AK,656425,AK,total,2012,730307.0,1.112552
97,Arizona,AZ,114006,AZ,total,2012,6551149.0,57.463195
191,Arkansas,AR,53182,AR,total,2012,2949828.0,55.466662
193,California,CA,163707,CA,total,2012,37999878.0,232.121278


In [79]:
# Summary statistics of the density column.
new_2012['density'].describe()

count      51.000000
mean      347.542324
std      1297.331909
min         1.112552
25%        44.744232
50%        91.897221
75%       206.797246
max      9315.102941
Name: density, dtype: float64

## Kae 

Using these baby name datasets from [1996](https://raw.githubusercontent.com/wesm/pydata-book/2nd-edition/datasets/babynames/yob1996.txt) and [1998](https://raw.githubusercontent.com/wesm/pydata-book/2nd-edition/datasets/babynames/yob1998.txt), make dataframes using the columns for name and the number of babies given that name. Merge the lists and see if there were any names used in one year but not the other. (I'm having trouble with this -- still working on it -- it can be done, right?)

You might need to parse the lists down a bit for faster testing.

In [80]:
# The files have no header row, so the column names are supplied explicitly.
# Both are downloaded from GitHub (requires network access).
babyname_columns = ['Name', 'Sex', 'Count']
year_1996 = pd.read_csv(
    'https://raw.githubusercontent.com/wesm/pydata-book/2nd-edition/datasets/babynames/yob1996.txt',
    names=babyname_columns,
)
year_1998 = pd.read_csv(
    'https://raw.githubusercontent.com/wesm/pydata-book/2nd-edition/datasets/babynames/yob1998.txt',
    names=babyname_columns,
)
year_1996.head()

Unnamed: 0,Name,Sex,Count
0,Emily,F,25144
1,Jessica,F,24180
2,Ashley,F,23676
3,Sarah,F,21012
4,Samantha,F,20541


In [81]:
# Peek at the 1998 table for comparison.
year_1998.head()

Unnamed: 0,Name,Sex,Count
0,Emily,F,26174
1,Hannah,F,21361
2,Samantha,F,20187
3,Ashley,F,19865
4,Sarah,F,19859


In [82]:
# Outer merge on Name; indicator=True adds a '_merge' column saying whether a
# name was found in both years or only one. A name listed under both sexes
# yields one row per (1996 row, 1998 row) pair.
# NOTE: the variable name `all` shadows Python's builtin all() for the rest of the session.
all = pd.merge(year_1996,year_1998, on = 'Name', how = 'outer', indicator=True)
all.head()

Unnamed: 0,Name,Sex_x,Count_x,Sex_y,Count_y,_merge
0,Emily,F,25144.0,F,26174.0,both
1,Emily,F,25144.0,M,38.0,both
2,Emily,M,34.0,F,26174.0,both
3,Emily,M,34.0,M,38.0,both
4,Jessica,F,24180.0,F,18229.0,both


In [83]:
# Rows whose name occurs in only one of the two years
# (left_only = 1996 only, right_only = 1998 only).
all[all['_merge'] != 'both']

Unnamed: 0,Name,Sex_x,Count_x,Sex_y,Count_y,_merge
5979,Crisol,F,30.0,,,left_only
6254,Yamiles,F,29.0,,,left_only
6285,Chantay,F,28.0,,,left_only
7180,Maribi,F,23.0,,,left_only
7220,Rolonda,F,23.0,,,left_only
7439,Staphany,F,22.0,,,left_only
7453,Yilda,F,22.0,,,left_only
7796,Marili,F,20.0,,,left_only
7898,Accacia,F,19.0,,,left_only
7945,Chabeli,F,19.0,,,left_only


In [84]:
# The same comparison as a single expression: outer-merge on Name, keep the
# rows found in only one year, and return just the names. It reuses the
# year_1996 / year_1998 frames loaded earlier instead of downloading both
# files a second time.
(
    pd.merge(year_1996, year_1998, on='Name', how='outer', indicator=True)
    .query('_merge != "both"')['Name']
)

5979           Crisol
6254          Yamiles
6285          Chantay
7180           Maribi
7220          Rolonda
7439         Staphany
7453            Yilda
7796           Marili
7898          Accacia
7945          Chabeli
8142           Ronika
8324           Jahnai
8347          Jestine
8564            Aneta
8596           Briani
8733            Letha
8740         Lucerito
8820         Suzzette
8967           Damali
9222          Shanera
9235           Sindia
9432            Genny
9660            Shree
9733            Abrea
9832          Dajanay
9984        Magdaline
9990              Mao
10005        Meleigha
10028           Neysa
10032            Nomi
             ...     
36304         Vinayak
36305          Vinton
36306           Viren
36307           Waqas
36308            Wayd
36309          Welton
36310         Whitten
36311    Williamjames
36312          Woodie
36313           Wylee
36314          Yackov
36315           Yafet
36316          Yaniel
36317              Ye
36318     

## Rie

1. Read the csv file named [Rie_Chap8part1Data.021319.csv](https://github.com/UCD-pbio-rclub/python-data-analysis_RieU/blob/master/Rie_Chap8part1Data.021319.csv). Set a two-level index using the columns 'miR' and 'category'.

In [85]:
# Load the miRNA/target correlation table from GitHub (requires network access).
# This rebinds `dat`, which held the expression data earlier in the notebook.
dat = pd.read_csv('https://github.com/UCD-pbio-rclub/python-data-analysis_RieU/raw/master/Rie_Chap8part1Data.021319.csv')
dat.head()

Unnamed: 0,miR,Target,category,cor
0,"gma-miR156(k,n,o)",Glyma06g36140,known,-0.793254
1,"gma-miR156(k,n,o)",Glyma03g29901,known,-0.783733
2,"gma-miR156(a,h,u,v,w,x,y)",Glyma06g36140,known,-0.78194
3,"gma-miR156(c,d,i,j,l,m)",Glyma01g08056,known,-0.758367
4,"gma-miR396(b-5p,c,k-5p)",Glyma01g02880,known,-0.751431


In [86]:
# Move 'miR' and 'category' into a two-level row index.
dat = dat.set_index(['miR', 'category'])
dat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Target,cor
miR,category,Unnamed: 2_level_1,Unnamed: 3_level_1
"gma-miR156(k,n,o)",known,Glyma06g36140,-0.793254
"gma-miR156(k,n,o)",known,Glyma03g29901,-0.783733
"gma-miR156(a,h,u,v,w,x,y)",known,Glyma06g36140,-0.78194
"gma-miR156(c,d,i,j,l,m)",known,Glyma01g08056,-0.758367
"gma-miR396(b-5p,c,k-5p)",known,Glyma01g02880,-0.751431


2. Select the data based on the category (known and novel). Hint: I reshaped the data first.

In [87]:
# Keep the rows whose 'category' index level is either 'known' or 'novel'.
dat.query('category == "known" or category == "novel"')

Unnamed: 0_level_0,Unnamed: 1_level_0,Target,cor
miR,category,Unnamed: 2_level_1,Unnamed: 3_level_1
"gma-miR156(k,n,o)",known,Glyma06g36140,-0.793254
"gma-miR156(k,n,o)",known,Glyma03g29901,-0.783733
"gma-miR156(a,h,u,v,w,x,y)",known,Glyma06g36140,-0.781940
"gma-miR156(c,d,i,j,l,m)",known,Glyma01g08056,-0.758367
"gma-miR396(b-5p,c,k-5p)",known,Glyma01g02880,-0.751431
"gma-miR160(a-5p,f)",known,Glyma14g33730,-0.745421
"gma-miR160(a-5p,f)",known,Glyma13g02410,-0.743880
"gma-miR390(b-5p,d)",novel,Glyma20g25540,-0.736842
"gma-miR156(p,t)",known,Glyma18g36960,-0.719069
"gma-miR156(a,h,u,v,w,x,y)",known,Glyma12g27330,-0.715221
