# 7.2 Data Transformation
## Detecting and Filtering Outliers

In [356]:
import numpy as np
import pandas as pd

In [357]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.050974,0.014706,0.005708,-0.0362
std,0.996084,0.999874,0.994908,0.99138
min,-3.64586,-3.184377,-3.745356,-3.428254
25%,-0.600254,-0.617533,-0.65737,-0.73102
50%,0.056212,-0.026392,-0.012007,-0.080713
75%,0.760962,0.679396,0.680673,0.622384
max,2.653656,3.525865,2.735527,3.366626


In [358]:
col = data[2]
col

0      0.000369
1      1.129120
2     -1.397820
3     -1.263437
4      0.513393
5     -0.178355
6      0.156632
7     -0.486323
8     -0.360410
9     -0.093100
10     2.616101
11    -0.676853
12    -0.502827
13     0.680397
14     0.473505
15    -1.100570
16    -0.674560
17    -0.994480
18     0.543406
19    -0.313704
20    -0.900887
21     0.947681
22     0.090325
23     0.139845
24     0.604578
25    -0.124668
26    -1.596128
27    -0.319537
28    -0.571681
29     1.060941
         ...   
970    0.246231
971    0.054856
972    0.653339
973    1.854922
974    0.005182
975    0.668397
976   -1.512690
977    0.712390
978   -0.699837
979   -1.018323
980   -0.777942
981    0.892854
982    0.450832
983    1.416312
984    1.422611
985    0.705443
986    0.871587
987   -0.284841
988    0.720206
989   -0.096805
990   -1.163720
991   -0.527668
992   -2.121482
993   -0.987719
994    0.175361
995   -1.315935
996   -0.280699
997   -0.559406
998   -0.597871
999   -0.895591
Name: 2, Length: 1000, d

In [359]:
col[np.abs(col) > 3]

124   -3.399312
219   -3.745356
Name: 2, dtype: float64

In [360]:
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
124,0.457246,-0.025907,-3.399312,-0.974657
143,1.951312,3.260383,0.963301,1.201206
219,0.508391,-0.196713,-3.745356,-1.520113
318,-0.242459,-3.05699,1.918403,-0.578828
341,0.682841,0.326045,0.425384,-3.428254
405,1.179227,-3.184377,1.369891,-1.074833
627,-3.548824,1.553205,-2.186301,1.277104
718,-0.578093,0.193299,1.397822,3.366626
865,-0.207434,3.525865,0.28307,0.544635
886,-3.64586,0.255475,-0.549574,-1.907459


In [361]:
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.052169,0.014161,0.006852,-0.036138
std,0.992051,0.996599,0.991078,0.988811
min,-3.0,-3.0,-3.0,-3.0
25%,-0.600254,-0.617533,-0.65737,-0.73102
50%,0.056212,-0.026392,-0.012007,-0.080713
75%,0.760962,0.679396,0.680673,0.622384
max,2.653656,3.0,2.735527,3.0


In [362]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,1.0,1.0,1.0
1,-1.0,-1.0,1.0,1.0
2,-1.0,1.0,-1.0,1.0
3,-1.0,-1.0,-1.0,1.0
4,-1.0,-1.0,1.0,1.0


## Permutation and Random Sampling

In [363]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [364]:
sampler = np.random.permutation(5)
sampler

array([0, 2, 3, 4, 1])

In [365]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [366]:
df.take(sampler)

Unnamed: 0,0,1,2,3
0,0,1,2,3
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19
1,4,5,6,7


In [367]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
2,8,9,10,11


In [368]:
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws

1    7
0    5
3    6
3    6
3    6
0    5
1    7
3    6
0    5
2   -1
dtype: int64

## Computing Indicator/Dummy Variables

In [369]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [370]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [371]:
dummies = pd.get_dummies(df['key'], prefix='key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [372]:
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [373]:
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('datasets/movielens/movies.dat',
                       sep='::',
                       header=None,
                       names=mnames,
                       engine ='python')
movies[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [374]:
all_genres = []
for x in movies.genres:
    all_genres.extend(x.split('|'))
all_genres

['Animation',
 "Children's",
 'Comedy',
 'Adventure',
 "Children's",
 'Fantasy',
 'Comedy',
 'Romance',
 'Comedy',
 'Drama',
 'Comedy',
 'Action',
 'Crime',
 'Thriller',
 'Comedy',
 'Romance',
 'Adventure',
 "Children's",
 'Action',
 'Action',
 'Adventure',
 'Thriller',
 'Comedy',
 'Drama',
 'Romance',
 'Comedy',
 'Horror',
 'Animation',
 "Children's",
 'Drama',
 'Action',
 'Adventure',
 'Romance',
 'Drama',
 'Thriller',
 'Drama',
 'Romance',
 'Thriller',
 'Comedy',
 'Action',
 'Action',
 'Comedy',
 'Drama',
 'Crime',
 'Drama',
 'Thriller',
 'Thriller',
 'Drama',
 'Sci-Fi',
 'Drama',
 'Romance',
 'Drama',
 'Drama',
 'Romance',
 'Adventure',
 'Sci-Fi',
 'Drama',
 'Drama',
 'Drama',
 'Sci-Fi',
 'Adventure',
 'Romance',
 "Children's",
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Drama',
 'Documentary',
 'Comedy',
 'Comedy',
 'Romance',
 'Drama',
 'Drama',
 'War',
 'Action',
 'Crime',
 'Drama',
 'Drama',
 'Action',
 'Adventure',
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Crime',
 'Thrill

In [375]:
genres = pd.unique(all_genres)
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [376]:
zero_matrix = np.zeros((len(movies), len(genres)))
zero_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [377]:
dummies = pd.DataFrame(zero_matrix, columns=genres)
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [378]:
gen = movies.genres[0]
gen

"Animation|Children's|Comedy"

In [379]:
gen.split('|')

['Animation', "Children's", 'Comedy']

In [380]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2], dtype=int64)

In [381]:
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

In [382]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western       

In [383]:
np.random.seed(12345)
values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [384]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


# 7.3 String Manipulation
## String Object Methods

In [385]:
val = 'a,b,  guido'
val

'a,b,  guido'

In [386]:
val.split(',')

['a', 'b', '  guido']

In [387]:
pieces = [x.strip() for x in val.split(',')]
pieces

['a', 'b', 'guido']

In [388]:
first, second, third = pieces
first + '::' + second + '::' + third

'a::b::guido'

In [389]:
'::'.join(pieces)

'a::b::guido'

In [390]:
'guido' in val

True

In [391]:
val.index(',')

1

In [392]:
val.find(':')

-1

In [394]:
val.index(':')

ValueError: substring not found

In [395]:
val.count(',')

2

In [396]:
val.replace(',', '::')

'a::b::  guido'

In [397]:
val.replace(',', '')

'ab  guido'

## Regular Expressions

In [398]:
import re
text = "foo    bar\t baz  \tqux"
re.split('\s+', text)

['foo', 'bar', 'baz', 'qux']

In [399]:
regex = re.compile('\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [400]:
regex.findall(text)

['    ', '\t ', '  \t']

In [401]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern, flags=re.IGNORECASE)
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [402]:
m = regex.search(text)
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [403]:
text[m.start():m.end()]

'dave@google.com'

In [404]:
print(regex.match(text))

None


In [405]:
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [406]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups()

('wesm', 'bright', 'net')

In [407]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [408]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



## Vectorized String Functions in pandas

In [409]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [410]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [411]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [412]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [413]:
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]
matches

Dave     (dave, google, com)
Steve    (steve, gmail, com)
Rob        (rob, gmail, com)
Wes                      NaN
dtype: object

In [414]:
matches.str.get(1)

Dave     google
Steve     gmail
Rob       gmail
Wes         NaN
dtype: object

In [415]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object

In [418]:
data.str.extract(pattern, flags=re.IGNORECASE)

Unnamed: 0,0,1,2
Dave,dave,google,com
Steve,steve,gmail,com
Rob,rob,gmail,com
Wes,,,


# Ch. 7 pt. 1 HW
## John
Practice with missing data
1. Create a DataFrame using
```python
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randn(10, 10), columns= ['a','b','c','d','e','f','g','h','i','j'])
df.iloc[:7, 1:3] = NA
df.iloc[4:8, 5:8] = NA
df.iloc[:,9] = NA
df
```

In [419]:
from numpy import NAN as NA
df = pd.DataFrame(np.random.randn(10, 10), columns= ['a','b','c','d','e','f','g','h','i','j'])
df.iloc[:7, 1:3] = np.NAN
df.iloc[4:8, 5:8] = np.NAN
df.iloc[:,9] = np.NAN
df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,1.007189,,,0.228913,1.352917,0.886429,-2.001637,-0.371843,1.669025,
1,-0.539741,,,-1.021228,-0.577087,0.124121,0.302614,0.523772,0.00094,
2,-0.713544,,,-1.860761,-0.860757,0.560145,-1.265934,0.119827,-1.063512,
3,-2.359419,,,-0.970736,-1.30703,0.28635,0.377984,-0.753887,0.331286,
4,0.069877,,,1.004812,1.327195,,,,0.758363,
5,0.86258,,,0.670216,0.852965,,,,-0.652469,
6,-1.33261,,,0.690002,1.001543,,,,-0.726213,
7,0.051316,-1.157719,0.816707,0.43361,1.010737,,,,-0.131578,
8,0.188211,2.169461,-0.114928,2.003697,0.02961,0.795253,0.11811,-0.748532,0.58497,
9,-1.565657,-0.56254,-0.032664,-0.929006,-0.482573,-0.036264,1.09539,0.980928,-0.589488,


2. Remove all rows where at least half the columns have missing data

In [420]:
df.dropna(thresh = df.shape[0]/2)

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,1.007189,,,0.228913,1.352917,0.886429,-2.001637,-0.371843,1.669025,
1,-0.539741,,,-1.021228,-0.577087,0.124121,0.302614,0.523772,0.00094,
2,-0.713544,,,-1.860761,-0.860757,0.560145,-1.265934,0.119827,-1.063512,
3,-2.359419,,,-0.970736,-1.30703,0.28635,0.377984,-0.753887,0.331286,
7,0.051316,-1.157719,0.816707,0.43361,1.010737,,,,-0.131578,
8,0.188211,2.169461,-0.114928,2.003697,0.02961,0.795253,0.11811,-0.748532,0.58497,
9,-1.565657,-0.56254,-0.032664,-0.929006,-0.482573,-0.036264,1.09539,0.980928,-0.589488,


3. Remove all columns where at least half the rows have missing data

In [421]:
df.dropna(axis = 1, thresh = df.shape[1]/2)

Unnamed: 0,a,d,e,f,g,h,i
0,1.007189,0.228913,1.352917,0.886429,-2.001637,-0.371843,1.669025
1,-0.539741,-1.021228,-0.577087,0.124121,0.302614,0.523772,0.00094
2,-0.713544,-1.860761,-0.860757,0.560145,-1.265934,0.119827,-1.063512
3,-2.359419,-0.970736,-1.30703,0.28635,0.377984,-0.753887,0.331286
4,0.069877,1.004812,1.327195,,,,0.758363
5,0.86258,0.670216,0.852965,,,,-0.652469
6,-1.33261,0.690002,1.001543,,,,-0.726213
7,0.051316,0.43361,1.010737,,,,-0.131578
8,0.188211,2.003697,0.02961,0.795253,0.11811,-0.748532,0.58497
9,-1.565657,-0.929006,-0.482573,-0.036264,1.09539,0.980928,-0.589488


4. Combine problems 2 and 3. Does the order matter?

In [422]:
df.dropna(thresh = df.shape[0]/2).dropna(axis = 1, thresh = df.shape[1]/2)

Unnamed: 0,a,d,e,f,g,h,i
0,1.007189,0.228913,1.352917,0.886429,-2.001637,-0.371843,1.669025
1,-0.539741,-1.021228,-0.577087,0.124121,0.302614,0.523772,0.00094
2,-0.713544,-1.860761,-0.860757,0.560145,-1.265934,0.119827,-1.063512
3,-2.359419,-0.970736,-1.30703,0.28635,0.377984,-0.753887,0.331286
7,0.051316,0.43361,1.010737,,,,-0.131578
8,0.188211,2.003697,0.02961,0.795253,0.11811,-0.748532,0.58497
9,-1.565657,-0.929006,-0.482573,-0.036264,1.09539,0.980928,-0.589488


In [423]:
df.dropna(axis = 1, thresh = df.shape[1]/2).dropna(thresh = df.shape[0]/2)

Unnamed: 0,a,d,e,f,g,h,i
0,1.007189,0.228913,1.352917,0.886429,-2.001637,-0.371843,1.669025
1,-0.539741,-1.021228,-0.577087,0.124121,0.302614,0.523772,0.00094
2,-0.713544,-1.860761,-0.860757,0.560145,-1.265934,0.119827,-1.063512
3,-2.359419,-0.970736,-1.30703,0.28635,0.377984,-0.753887,0.331286
8,0.188211,2.003697,0.02961,0.795253,0.11811,-0.748532,0.58497
9,-1.565657,-0.929006,-0.482573,-0.036264,1.09539,0.980928,-0.589488


5. Remove all rows which have NA in either column 'b' or 'f'

In [424]:
df.dropna(subset=['b','f'])

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
8,0.188211,2.169461,-0.114928,2.003697,0.02961,0.795253,0.11811,-0.748532,0.58497,
9,-1.565657,-0.56254,-0.032664,-0.929006,-0.482573,-0.036264,1.09539,0.980928,-0.589488,


## Min-Yao
1. Import my RNA-Seq CPM data from 'Expression Browser_CPM_practice.xlsx' file. Please made the Itag number become the row index. How many genes in this data set?

In [425]:
dat = pd.ExcelFile('datasets/Expression Browser_CPM_practice.xlsx')
dat = pd.read_excel(dat, 'Expression Browser_CPM')
dat.head()

Unnamed: 0,Name,6_c1,6_c2,6_c3,6_c4,6_c5,6_c6,6_c7,6_t1,6_t2,...,3_t5,2_c1,2_c2,2_c3,2_c4,2_c5,2_t1,2_t2,2_t3,2_t4
0,Solyc00g005000.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Solyc00g005005.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Solyc00g005040.3,0.0,0.136237,0.0,0.0,0.0,0.0,0.0,0.075741,0.3031,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Solyc00g005050.3,49.947944,50.680249,42.857629,46.142383,40.925485,49.050587,49.010294,47.224546,42.28247,...,46.069768,39.626619,33.60176,38.84896,28.487619,42.516459,47.526986,47.021348,51.069134,44.480035
4,Solyc00g005055.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [426]:
dat = dat.set_index('Name')
dat.head()

Unnamed: 0_level_0,6_c1,6_c2,6_c3,6_c4,6_c5,6_c6,6_c7,6_t1,6_t2,6_t3,...,3_t5,2_c1,2_c2,2_c3,2_c4,2_c5,2_t1,2_t2,2_t3,2_t4
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Solyc00g005000.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005005.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005040.3,0.0,0.136237,0.0,0.0,0.0,0.0,0.0,0.075741,0.3031,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Solyc00g005050.3,49.947944,50.680249,42.857629,46.142383,40.925485,49.050587,49.010294,47.224546,42.28247,50.970454,...,46.069768,39.626619,33.60176,38.84896,28.487619,42.516459,47.526986,47.021348,51.069134,44.480035
Solyc00g005055.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


2. Please replace all 0 with NA. 

In [427]:
dat = dat.replace(0,NA)
dat.head()

Unnamed: 0_level_0,6_c1,6_c2,6_c3,6_c4,6_c5,6_c6,6_c7,6_t1,6_t2,6_t3,...,3_t5,2_c1,2_c2,2_c3,2_c4,2_c5,2_t1,2_t2,2_t3,2_t4
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Solyc00g005000.3,,,,,,,,,,,...,,,,,,,,,,
Solyc00g005005.1,,,,,,,,,,,...,,,,,,,,,,
Solyc00g005040.3,,0.136237,,,,,,0.075741,0.3031,,...,,,,,,,,,,
Solyc00g005050.3,49.947944,50.680249,42.857629,46.142383,40.925485,49.050587,49.010294,47.224546,42.28247,50.970454,...,46.069768,39.626619,33.60176,38.84896,28.487619,42.516459,47.526986,47.021348,51.069134,44.480035
Solyc00g005055.1,,,,,,,,,,,...,,,,,,,,,,


3. We want to remove the genes that have no expression in all samples. How many genes left after we remove these genes.

In [428]:
dat = dat.dropna(how='all')
dat.head()

Unnamed: 0_level_0,6_c1,6_c2,6_c3,6_c4,6_c5,6_c6,6_c7,6_t1,6_t2,6_t3,...,3_t5,2_c1,2_c2,2_c3,2_c4,2_c5,2_t1,2_t2,2_t3,2_t4
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Solyc00g005040.3,,0.136237,,,,,,0.075741,0.3031,,...,,,,,,,,,,
Solyc00g005050.3,49.947944,50.680249,42.857629,46.142383,40.925485,49.050587,49.010294,47.224546,42.28247,50.970454,...,46.069768,39.626619,33.60176,38.84896,28.487619,42.516459,47.526986,47.021348,51.069134,44.480035
Solyc00g005080.2,,,,,,,,,,,...,0.305098,,,,,,,,,
Solyc00g005092.1,,,,,,,,,,,...,,,,,,,,,,0.426326
Solyc00g005094.1,0.376966,0.408712,,0.245438,,,0.414172,,0.15155,0.175156,...,0.152549,,0.143597,,,,,,,0.142109


## Julin

In Chapter 7.1 we learned how to replace a missing value in a Series with the mean value of that series.  Now we work with data frames.  Create a data frame like so:

```python
import pandas as pd
import numpy as np
from numpy import nan as NA

data = pd.DataFrame(np.random.randn(5,5))

data.iloc[3] = NA

data.iloc[2:, 3] = NA

data
```

In [429]:
data = pd.DataFrame(np.random.randn(5,5))
data.iloc[3] = NA
data.iloc[2:, 3] = NA
data

Unnamed: 0,0,1,2,3,4
0,-0.528735,0.457002,0.929969,-1.569271,-1.022487
1,-0.402827,0.220487,-0.193401,0.669158,-1.648985
2,-2.252797,-1.166832,0.353607,,-0.274569
3,,,,,
4,-1.224145,-1.80084,1.634736,,0.45794


1. For each column of the data frame that has missing data, replace that data with the mean values of the data in that column.

In [430]:
data.fillna(data.mean())

Unnamed: 0,0,1,2,3,4
0,-0.528735,0.457002,0.929969,-1.569271,-1.022487
1,-0.402827,0.220487,-0.193401,0.669158,-1.648985
2,-2.252797,-1.166832,0.353607,-0.450056,-0.274569
3,-1.102126,-0.572546,0.681228,-0.450056,-0.622025
4,-1.224145,-1.80084,1.634736,-0.450056,0.45794


2. Do the same thing, but only if there are less than 3 missing values in the column; otherwise leave the NAs there.

In [432]:
dataNAs = pd.Series(len(data) - data.count() < 3)
dataNAs

0     True
1     True
2     True
3    False
4     True
dtype: bool

In [433]:
data[dataNAs.index[dataNAs]] = data[dataNAs.index[dataNAs]].fillna(data.mean())
data

Unnamed: 0,0,1,2,3,4
0,-0.528735,0.457002,0.929969,-1.569271,-1.022487
1,-0.402827,0.220487,-0.193401,0.669158,-1.648985
2,-2.252797,-1.166832,0.353607,,-0.274569
3,-1.102126,-0.572546,0.681228,,-0.622025
4,-1.224145,-1.80084,1.634736,,0.45794


## Rie
1. Read CSV file named "soybean_miR.phythonClub.012519.csv" in my directory(https://github.com/UCD-pbio-rclub/python-data-analysis_RieU/blob/master/soybean_miR.phythonClub.012519.csv). "num" means the number of miRNA in the category that I am interested. I want to see the ratio of each miRNA relative to the total.

In [434]:
dat = pd.read_csv('https://raw.githubusercontent.com/UCD-pbio-rclub/python-data-analysis_RieU/master/soybean_miR.phythonClub.012519.csv')
print(dat.head())
print(dat.shape)

               miR  num  total
0     gma-miR10419    0      1
1     gma-miR10440    0      1
2     gma-miR1507a    0      7
3     gma-miR1507b    0      7
4  gma-miR1507c-3p    0      9
(176, 3)


2. Eliminated the rows containing 0 because I am not interested in those miRNAs!

In [435]:
dat2 = dat.loc[dat['num'] > 0]
print(dat2.head())
print(dat2.shape)

                          miR  num  total
5          gma-MIR1508a-5pNew    2      2
18  gma-miR156(a,h,u,v,w,x,y)    7     20
19         gma-miR156(b,f)-NV    8     24
20    gma-miR156(c,d,i,j,l,m)    7     23
21          gma-miR156(k,n,o)   11     22
(43, 3)


In [436]:
dat2['ratio'] = round(dat2['num']/dat2['total'],2)
print(dat2.head())
print(dat2.shape)

                          miR  num  total  ratio
5          gma-MIR1508a-5pNew    2      2   1.00
18  gma-miR156(a,h,u,v,w,x,y)    7     20   0.35
19         gma-miR156(b,f)-NV    8     24   0.33
20    gma-miR156(c,d,i,j,l,m)    7     23   0.30
21          gma-miR156(k,n,o)   11     22   0.50
(43, 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Joel
Create a DataFrame with random numbers

`data = pd.DataFrame(np.random.randn(20),columns=["S1"])`

1. Bin data into quartiles (for bonus, label them according to their bin)

In [437]:
data = pd.DataFrame(np.random.randn(20),columns=['S1'])
data

Unnamed: 0,S1
0,0.555154
1,1.30672
2,-0.440554
3,-0.30135
4,0.498791
5,-0.823991
6,1.320566
7,0.507965
8,-0.653438
9,0.18698


In [438]:
pd.qcut(data['S1'],4)

0                   (0.0849, 0.575]
1                    (0.575, 1.321]
2     (-1.6789999999999998, -0.404]
3                  (-0.404, 0.0849]
4                   (0.0849, 0.575]
5     (-1.6789999999999998, -0.404]
6                    (0.575, 1.321]
7                   (0.0849, 0.575]
8     (-1.6789999999999998, -0.404]
9                   (0.0849, 0.575]
10                 (-0.404, 0.0849]
11                 (-0.404, 0.0849]
12                 (-0.404, 0.0849]
13                   (0.575, 1.321]
14                   (0.575, 1.321]
15    (-1.6789999999999998, -0.404]
16                   (0.575, 1.321]
17                 (-0.404, 0.0849]
18    (-1.6789999999999998, -0.404]
19                  (0.0849, 0.575]
Name: S1, dtype: category
Categories (4, interval[float64]): [(-1.6789999999999998, -0.404] < (-0.404, 0.0849] < (0.0849, 0.575] < (0.575, 1.321]]

In [439]:
pd.qcut(data['S1'],4, labels = ['25','50','75','100'])

0      75
1     100
2      25
3      50
4      75
5      25
6     100
7      75
8      25
9      75
10     50
11     50
12     50
13    100
14    100
15     25
16    100
17     50
18     25
19     75
Name: S1, dtype: category
Categories (4, object): [25 < 50 < 75 < 100]

2. Assign the corresponding bin label to the index of the data

In [440]:
bins = pd.qcut(data['S1'],4, labels = ['25','50','75','100'])
data.index = bins
data

Unnamed: 0_level_0,S1
S1,Unnamed: 1_level_1
75,0.555154
100,1.30672
25,-0.440554
50,-0.30135
75,0.498791
25,-0.823991
100,1.320566
75,0.507965
25,-0.653438
75,0.18698


3. Get the values for the first quartile in ascending order

In [441]:
data.sort_values('S1', ascending=True).loc['25']

Defaulting to column, but this will raise an ambiguity error in a future version
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,S1
S1,Unnamed: 1_level_1
25,-1.67779
25,-0.823991
25,-0.757177
25,-0.653438
25,-0.440554


# Ch. 7 pt. 2 HW Problem

## John
Find me Beer using this https://api.openbrewerydb.org/breweries?

1. Create a dataframe using the api address above with every brewery in the database. Will need to use page and per_page as parameters. per_page max is 50

In [442]:
import requests
url_base = 'https://api.openbrewerydb.org/breweries?per_page=50&page='
dat = pd.DataFrame(requests.get(url_base+'1').json())
page = 2
while True:
    new = requests.get(url_base+str(page))
    new = pd.DataFrame(new.json())
    if len(new) > 0:
        dat = dat.append(new)
        page += 1
    else:
        break
dat

Unnamed: 0,brewery_type,city,country,id,latitude,longitude,name,phone,postal_code,state,street,tag_list,updated_at,website_url
0,micro,Birmingham,United States,2,33.524521,-86.774322,Avondale Brewing Co,2057775456,35222-1932,Alabama,201 41st St S,[],2018-08-23T23:19:57.825Z,http://www.avondalebrewing.com
1,micro,Tuscaloosa,United States,4,33.1984907123707,-87.5621551272424,Band of Brothers Brewing Company,2052665137,35401-4653,Alabama,1605 23rd Ave,[],2018-08-23T23:19:59.462Z,http://www.bandofbrosbrewing.com
2,micro,Birmingham,United States,44,33.5128492349817,-86.7914000624146,Trim Tab Brewing,2057030536,35233-3401,Alabama,2721 5th Ave S,[],2018-08-23T23:20:31.423Z,http://www.trimtabbrewing.com
3,micro,Huntsville,United States,46,34.7277523,-86.5932014,Yellowhammer Brewery,2569755950,35805-3046,Alabama,2600 Clinton Ave W,[],2018-08-23T23:20:33.102Z,http://www.yellowhammerbrewery.com
4,micro,Wasilla,United States,55,61.5752695,-149.4127103,Bearpaw River Brewing Co,,99654-7679,Alaska,4605 E Palmer Wasilla Hwy,[],2018-08-23T23:20:40.743Z,http://bearpawriverbrewing.com
5,micro,Anchorage,United States,76,61.1384893547315,-149.879076042937,King Street Brewing Co,9073365464,99515,Alaska,9050 King Street,[],2018-08-23T23:20:57.179Z,http://www.kingstreetbrewing.com
6,micro,Tucson,United States,94,32.2467372722906,-110.992750525872,1912 Brewing,5202564851,85745-1444,Arizona,2045 N Forbes Blvd Ste 105,[],2018-08-23T23:21:11.302Z,http://www.1912brewing.com
7,contract,Scottsdale,United States,98,33.4972615652174,-111.924474347826,Bad Water Brewing,5207459175,85251-3914,Arizona,4216 N Brown Ave,[],2018-08-23T23:21:15.169Z,http://www.badwaterbrewing.com
8,brewpub,Chandler,United States,104,33.3053455,-111.911126,BJs Restaurant & Brewery - Chandler,4809170631,85226-5175,Arizona,3155 W Chandler Blvd,[],2018-08-23T23:21:21.165Z,http://www.bjsrestaurants.com
9,micro,Tucson,United States,107,32.201608314954,-110.821778571134,BlackRock Brewers,5202073203,85710-6767,Arizona,1664 S Research Loop Ste 200,[],2018-08-23T23:21:23.794Z,http://www.brb.beer


2. Filter this data set down to only micro breweries in states that with begin and end with the same letter

In [443]:
pattern = r'^(.).*\1$'
regex = re.compile(r'^(.).*\1$', flags=re.IGNORECASE)
#data[dataNAs.index[dataNAs]]
dat = dat.loc[dat['brewery_type'] == 'micro']
dat = dat[dat['state'].str.contains(regex) == True]
dat

  """


Unnamed: 0,brewery_type,city,country,id,latitude,longitude,name,phone,postal_code,state,street,tag_list,updated_at,website_url
0,micro,Birmingham,United States,2,33.524521,-86.774322,Avondale Brewing Co,2057775456,35222-1932,Alabama,201 41st St S,[],2018-08-23T23:19:57.825Z,http://www.avondalebrewing.com
1,micro,Tuscaloosa,United States,4,33.1984907123707,-87.5621551272424,Band of Brothers Brewing Company,2052665137,35401-4653,Alabama,1605 23rd Ave,[],2018-08-23T23:19:59.462Z,http://www.bandofbrosbrewing.com
2,micro,Birmingham,United States,44,33.5128492349817,-86.7914000624146,Trim Tab Brewing,2057030536,35233-3401,Alabama,2721 5th Ave S,[],2018-08-23T23:20:31.423Z,http://www.trimtabbrewing.com
3,micro,Huntsville,United States,46,34.7277523,-86.5932014,Yellowhammer Brewery,2569755950,35805-3046,Alabama,2600 Clinton Ave W,[],2018-08-23T23:20:33.102Z,http://www.yellowhammerbrewery.com
4,micro,Wasilla,United States,55,61.5752695,-149.4127103,Bearpaw River Brewing Co,,99654-7679,Alaska,4605 E Palmer Wasilla Hwy,[],2018-08-23T23:20:40.743Z,http://bearpawriverbrewing.com
5,micro,Anchorage,United States,76,61.1384893547315,-149.879076042937,King Street Brewing Co,9073365464,99515,Alaska,9050 King Street,[],2018-08-23T23:20:57.179Z,http://www.kingstreetbrewing.com
6,micro,Tucson,United States,94,32.2467372722906,-110.992750525872,1912 Brewing,5202564851,85745-1444,Arizona,2045 N Forbes Blvd Ste 105,[],2018-08-23T23:21:11.302Z,http://www.1912brewing.com
9,micro,Tucson,United States,107,32.201608314954,-110.821778571134,BlackRock Brewers,5202073203,85710-6767,Arizona,1664 S Research Loop Ste 200,[],2018-08-23T23:21:23.794Z,http://www.brb.beer
10,micro,Tucson,United States,127,32.2504946147872,-111.005452051979,Dragoon Brewing Co,5203293606,85745-1214,Arizona,1859 W Grant Rd Ste 111,[],2018-08-23T23:21:40.563Z,http://www.dragoonbrewing.com
11,micro,Williams,United States,141,35.2500282,-112.1892168,Grand Canyon Brewing Company,8005132072,86046-2530,Arizona,233 W Route 66,[],2018-08-23T23:21:53.397Z,http://www.grandcanyonbrewingco.com


3. From the breweries found in part 2, find the farthest north, south, east, and west breweries. You may need to change the dtype of the columns
    

In [444]:
dat[['longitude', 'latitude']] = dat[['longitude', 'latitude']].apply(pd.to_numeric)
dat

Unnamed: 0,brewery_type,city,country,id,latitude,longitude,name,phone,postal_code,state,street,tag_list,updated_at,website_url
0,micro,Birmingham,United States,2,33.524521,-86.774322,Avondale Brewing Co,2057775456,35222-1932,Alabama,201 41st St S,[],2018-08-23T23:19:57.825Z,http://www.avondalebrewing.com
1,micro,Tuscaloosa,United States,4,33.198491,-87.562155,Band of Brothers Brewing Company,2052665137,35401-4653,Alabama,1605 23rd Ave,[],2018-08-23T23:19:59.462Z,http://www.bandofbrosbrewing.com
2,micro,Birmingham,United States,44,33.512849,-86.791400,Trim Tab Brewing,2057030536,35233-3401,Alabama,2721 5th Ave S,[],2018-08-23T23:20:31.423Z,http://www.trimtabbrewing.com
3,micro,Huntsville,United States,46,34.727752,-86.593201,Yellowhammer Brewery,2569755950,35805-3046,Alabama,2600 Clinton Ave W,[],2018-08-23T23:20:33.102Z,http://www.yellowhammerbrewery.com
4,micro,Wasilla,United States,55,61.575269,-149.412710,Bearpaw River Brewing Co,,99654-7679,Alaska,4605 E Palmer Wasilla Hwy,[],2018-08-23T23:20:40.743Z,http://bearpawriverbrewing.com
5,micro,Anchorage,United States,76,61.138489,-149.879076,King Street Brewing Co,9073365464,99515,Alaska,9050 King Street,[],2018-08-23T23:20:57.179Z,http://www.kingstreetbrewing.com
6,micro,Tucson,United States,94,32.246737,-110.992751,1912 Brewing,5202564851,85745-1444,Arizona,2045 N Forbes Blvd Ste 105,[],2018-08-23T23:21:11.302Z,http://www.1912brewing.com
9,micro,Tucson,United States,107,32.201608,-110.821779,BlackRock Brewers,5202073203,85710-6767,Arizona,1664 S Research Loop Ste 200,[],2018-08-23T23:21:23.794Z,http://www.brb.beer
10,micro,Tucson,United States,127,32.250495,-111.005452,Dragoon Brewing Co,5203293606,85745-1214,Arizona,1859 W Grant Rd Ste 111,[],2018-08-23T23:21:40.563Z,http://www.dragoonbrewing.com
11,micro,Williams,United States,141,35.250028,-112.189217,Grand Canyon Brewing Company,8005132072,86046-2530,Arizona,233 W Route 66,[],2018-08-23T23:21:53.397Z,http://www.grandcanyonbrewingco.com


### North

In [445]:
dat.loc[dat['latitude'] == dat['latitude'].max()]

Unnamed: 0,brewery_type,city,country,id,latitude,longitude,name,phone,postal_code,state,street,tag_list,updated_at,website_url
42,micro,Fox,United States,88,64.957086,-147.621976,Silver Gulch Brewing Co,9074522739,99712,Alaska,2195 Old Steese Highway,[],2018-08-23T23:21:07.021Z,http://www.silvergulch.com


### South

In [446]:
dat.loc[dat['latitude'] == dat['latitude'].min()]

Unnamed: 0,brewery_type,city,country,id,latitude,longitude,name,phone,postal_code,state,street,tag_list,updated_at,website_url
18,micro,Gulf Shores,United States,6,30.278051,-87.683039,Big Beach Brewing Company,2519482337,36542-3104,Alabama,300 E 24th Ave,[],2018-08-23T23:20:01.170Z,http://www.bigbeachbrewing.com


### East

In [447]:
dat.loc[dat['longitude'] == dat['longitude'].max()]

Unnamed: 0,brewery_type,city,country,id,latitude,longitude,name,phone,postal_code,state,street,tag_list,updated_at,website_url
6,micro,Columbiana,United States,5355,40.888487,-80.69353,BirdFish Brewing Co,3303339385,44408-1348,Ohio,16 S Main St,[],2018-08-24T15:43:26.570Z,http://www.birdfishbrew.com


### West

In [448]:
dat.loc[dat['longitude'] == dat['longitude'].min()]

Unnamed: 0,brewery_type,city,country,id,latitude,longitude,name,phone,postal_code,state,street,tag_list,updated_at,website_url
36,micro,Kodiak,United States,78,57.790083,-152.407155,"Kodiak Island Brewing Co, LLC",9074862537,99615-6580,Alaska,117 Lower Mill Bay Rd,[],2018-08-23T23:20:58.860Z,http://www.kodiakbrewery.com
