##### homepage: http://pandas.pydata.org/index.html

# Indexing and Selecting Data

In [7]:
# https://pandas.pydata.org/pandas-docs/stable/indexing.html
# https://www.oreilly.com/learning/introducing-pandas-objects

# Creating DataFrame

In [8]:
import pandas as pd
import warnings
import numpy as np
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

# df1 = pd.read_csv("./trainset/pcdata_0611.csv") 
# read_sql read_html

### read_csv

In [9]:
# specify the dtype for column1 (must be passed via the dtype= keyword)
# df1 = pd.read_csv("./trainset/pcdata_0611.csv", dtype={"column1": str})

In [10]:
# specify dtype
# pd.read_csv("tprice_and_score.csv", index_col=0, dtype={"ticker_symbol": str})


### Specify values for each column.

In [11]:
pd.DataFrame({'A': [1, 2, 3],
              'B': [4, 5, 6]},  
                    index=list('abc'))

Unnamed: 0,A,B
a,1,4
b,2,5
c,3,6


In [12]:
# Column-oriented construction: every dict key becomes a column, rows are labelled a/b/c.
d = pd.DataFrame(data={'A': [1, 2, 3], 'B': [4, 5, 6]}, index=['a', 'b', 'c'])

In [13]:
# Single .loc[row_label, column_label] lookup instead of chained d['B']['a'] indexing.
d.loc['a', 'B']

4

### Specify values for each row.

In [14]:
pd.DataFrame([[1, 2, 3],
              [4, 5, 6]],
                    index=list('ab'), columns=['a','b','c'])

Unnamed: 0,a,b,c
a,1,2,3
b,4,5,6


In [15]:
# dfd = pd.DataFrame({'A': [1, 2, 3],
#                     'B': [4, 5, 6]},  
#                     index=list('abc'))
# dfd

### https://stackoverflow.com/questions/38917945/how-indexing-works-in-pandas

### read from local file

In [16]:
style = pd.read_csv('data_morning_start_style.csv')
style.head()

Unnamed: 0,SECURITY_ID_INT,raw_X,raw_Y,vertical,horizontal
0,1734,-106.51273,223.595615,large,value
1,1351,-95.753155,244.328818,large,value
2,2228,-74.231371,318.054976,large,value
3,2296,-57.797931,394.503697,large,value
4,1971,-50.873616,581.97042,large,value


### from dict

In [17]:
pd.DataFrame.from_dict({'A': [1, 2, 3],
                        'B': [4, 5, 6]})

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


# Growth

### .append

In [18]:
df = pd.DataFrame(columns=['taskID', 'datayes_tables', 'solid_tables', 'datayes_imgs', 'solid_imgs'])

In [19]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the portable way to add a row.
# columns=df.columns keeps the new row in the same column order as df.
new_row = pd.DataFrame([{'taskID': 'XXX', 'datayes_tables': 1, 'solid_tables': 2,
                         'datayes_imgs': 3, 'solid_imgs': 4}],
                       columns=df.columns)
df = pd.concat([df, new_row], ignore_index=True)
df

Unnamed: 0,taskID,datayes_tables,solid_tables,datayes_imgs,solid_imgs
0,XXX,1,2,3,4


# Reshaping Data

### .melt

In [20]:
dfd = pd.DataFrame({'A': [1, 2, 3],
              'B': [4, 5, 6]},  
                    index=list('abc'))
dfd

Unnamed: 0,A,B
a,1,4
b,2,5
c,3,6


In [21]:
pd.melt(dfd)

Unnamed: 0,variable,value
0,A,1
1,A,2
2,A,3
3,B,4
4,B,5
5,B,6


In [22]:
dfd = pd.DataFrame({'A': [1, 2, 3],
                    'B': [4, 5, 6],
                    'C': [7, 8, 9]
                   }, 
                    index=list('abc'))
dfd

Unnamed: 0,A,B,C
a,1,4,7
b,2,5,8
c,3,6,9


In [23]:
pd.melt(dfd)

Unnamed: 0,variable,value
0,A,1
1,A,2
2,A,3
3,B,4
4,B,5
5,B,6
6,C,7
7,C,8
8,C,9


### .concat
more efficient than append
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html?highlight=concat#pandas.concat

In [24]:
pd.concat([dfd, dfd])

Unnamed: 0,A,B,C
a,1,4,7
b,2,5,8
c,3,6,9
a,1,4,7
b,2,5,8
c,3,6,9


In [25]:
# from pandas import MultiIndex
# arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
# MultiIndex.from_arrays(arrays, names=('number', 'color'))

In [26]:
pd.concat([dfd, dfd], ignore_index=True)

Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9
3,1,4,7
4,2,5,8
5,3,6,9


##### axis : {0/’index’, 1/’columns’}, default 0

    The axis to concatenate along


In [27]:
pd.concat([dfd, dfd], axis=1)

Unnamed: 0,A,B,C,A.1,B.1,C.1
a,1,4,7,1,4,7
b,2,5,8,2,5,8
c,3,6,9,3,6,9


In [29]:
pd.concat([dfd, dfd], sort=False) 

TypeError: concat() got an unexpected keyword argument 'sort'

### .pivot 

### .sort_values

In [None]:
dfd.sort_values("A")

In [None]:
dfd.sort_values("A", ascending=False)

### .rename

In [None]:
dfd.rename(columns={"A":"new_name"})
dfd

In [None]:
# inplace=True modifies the frame it is called on and returns None.
# Demonstrate it on a copy so that `dfd` keeps its "A" column for the cells that follow.
dfd_renamed = dfd.copy()
dfd_renamed.rename(columns={"A":"new_name"}, inplace=True)
dfd_renamed

### .sort_index

In [None]:
dfd.sort_index()

### .reset_index
reset index of DataFrame to row numbers, moving index to columns.

In [None]:
dfd.reset_index()

### .drop

In [None]:
# Drop a column by its name (axis=1 targets columns); returns a new frame.
dfd.drop(['A'], axis=1)

### .T
Transpose 

In [None]:
dfd.T

# Index
https://stackoverflow.com/questions/27238066/what-is-the-point-of-indexing-in-pandas

In [None]:
dfd[["A","B"]]

In [None]:
dfd[1:2]

In [None]:
dfd[::2]

### new column

In [None]:
# Copy first: `tdf = dfd` would only create a second name for the same object,
# so adding/overwriting columns on tdf would silently change dfd as well.
tdf = dfd.copy()
tdf["C"] = tdf["A"] + tdf["B"]
tdf

In [None]:
tdf.loc[:,'d'] = tdf["C"]
tdf

### new row

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row the same way.
# Columns missing from the new row are filled with NaN.
pd.concat([tdf, pd.DataFrame([[2,4,6]], columns=["A","B","C"])])

In [None]:
# tdf2.reset_index()

In [None]:
# Stack the frame on top of itself (pd.concat replaces the removed DataFrame.append);
# the original index labels are kept, so they repeat.
pd.concat([tdf, tdf])

# Subset Observations

In [None]:
dfd

In [None]:
dfd[dfd.A>1]

In [None]:
dfd.A>1

In [None]:
# Keep the boolean result in a variable; named `mask` so the builtin filter() is not shadowed.
mask = dfd.A>1
type(mask)

In [None]:
# dfd[dfd.C > dfd.A] 

### .isin

In [None]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
df

In [None]:
df.isin({"A": [1,3,5]})

In [None]:
df.A.isin([1,3,5])

In [None]:
df[df.isin({"A": [1,3,5]})]

In [None]:
df[df.A.isin([1,3,5])]

In [None]:
df[df.isin({"A": [1,3,5], "B": ['b']})]

In [None]:
# dates = pd.date_range('1/1/2000', periods=8)

# df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
# df

### .loc
df.loc[row_indexer,column_indexer]

In [None]:
dates = pd.date_range('1/1/2000', periods=8)

# Seeded generator so the "random" values are identical on every Restart & Run All.
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

In [None]:
# df.loc[:]

In [None]:
df[df.columns[3]] # Series: select an entire column via its position in df.columns

In [None]:
# note the difference between with and without the brackets
df1 = df.loc[:, ['A']]
print type(df1)
df2 = df.loc[:, 'A']
print type(df2)

In [None]:
# The row indexer must be an actual index label, which is not necessarily 0, 1, 2, 3;
# in this example the index labels are dates.
df.loc['2000-01-08', ['A']]

In [None]:
# The row indexer must be an actual index label, which is not necessarily 0, 1, 2, 3;
# in this example the index labels are dates.
df.loc['2000-01-08']

In [None]:
# The row indexer must be an actual index label, which is not necessarily 0, 1, 2, 3;
# in this example the index labels are dates.
df.loc['2000-01-08',['A','B']]

In [None]:
df.loc[:, ["A"]]

In [None]:
df.loc[:, ["A"]]

##### String likes in slicing can be convertible to the type of the index and lead to natural slicing.

In [None]:
 df.loc['20000101':'20000105', :]

In [None]:
# df.loc[:3]

### .iloc

In [None]:
df.iloc[:3]

In [None]:
df.iloc[:3, 2:4]

### .index

In [None]:
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
   'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
pd.DataFrame(d)

In [None]:
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
   'two' : pd.Series([1., 2., 3., 4.], index=['e', 'f', 'g', 'h'])}
pd.DataFrame(d)

In [None]:
df.set_index("A").index

In [None]:
df.index

In [None]:
df.index[[0, 2]]

In [None]:
df.loc[df.index[[0, 2]],['A']]

In [None]:
# Conditional (boolean) indexing: keep the rows where column A is positive.
df[df.A > 0]

In [None]:
filters = df.index>'2000-01-05'
filters
df.loc[filters, :]

In [None]:
# bad case
# df.loc[[0,2],['A']] 
df.iloc[filters, :]

In [None]:
# df.index>'2000-01-05'
type(df.index>'2000-01-05')

### .get_loc

In [None]:
df.columns.get_loc('A')

In [None]:
df.iloc[[0, 2], [df.columns.get_loc('A')]]

### .get_indexer
For getting multiple indexers

In [None]:
df.columns.get_indexer(['A', 'B'])

In [None]:
df.iloc[[0, 2], df.columns.get_indexer(['A', 'B'])]

### .loc vs .iloc
https://stackoverflow.com/questions/31593201/pandas-iloc-vs-ix-vs-loc-explanation

In [None]:
df2 = df.reset_index(drop=True)
df2 = df2.sort_values("A")
df2

In [None]:
df2.loc[:3]

In [None]:
# df2.loc[:1, ['A','C']]

In [None]:
df2.iloc[:3]

In [None]:
# df2.iloc[:1, :3]

### .ix 
Starting from pandas 0.20.0, the .ix indexer is deprecated in favor of the stricter .iloc and .loc indexers (it was removed entirely in pandas 1.0).

### .ix vs .iloc vs .loc
https://stackoverflow.com/questions/31593201/pandas-iloc-vs-ix-vs-loc-explanation-how-are-they-different

### Different Choices for Indexing
http://pandas.pydata.org/pandas-docs/stable/indexing.html#different-choices-for-indexing

##### what else?

In [None]:
df2.loc[:3]

In [None]:
df2.iloc[:3]

# Summarize Data

In [None]:
dates = pd.date_range('1/1/2000', periods=8)
# Seeded generator so the summary statistics below are reproducible across runs.
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

### .value_counts

In [None]:
df["A"].value_counts()

### len

In [None]:
len(df)

### .nunique

In [None]:
df["B"].nunique()

### .unique

In [None]:
df.B.unique()

### .describe

In [None]:
df.describe()

### .sum

In [None]:
df.sum()

In [None]:
df.A.sum()

### .count

In [None]:
df.count()

In [None]:
df.A.count()

### .median

In [None]:
df.median()

In [None]:
df.A.median()

### .quantile

In [None]:
df.quantile([0.25,0.75]) 

### .apply

In [None]:
def add_one(input):
    """Return ``input`` incremented by one.

    The ``+`` operator is applied as-is, so this works for plain numbers and,
    element-wise, for the pandas Series that ``DataFrame.apply`` passes in.
    """
    incremented = input + 1
    return incremented
df.apply(add_one)

In [None]:
df

In [None]:
df.min()

In [None]:
df.max()

In [None]:
df.mean()

In [None]:
df.var() # variance of each column

In [None]:
df.std() # standard deviation of each column

# Group Data

### .groupby

In [None]:
import pandas as pd
style = pd.read_csv('data_morning_start_style.csv')
style.info()

In [None]:
style.head()

In [None]:
# to calculate the mean of vertical group
dfgroup = style[['raw_X','raw_Y','vertical']].groupby(['vertical']).agg(['mean', 'std', 'sum', 'count'])
# dfgroup = style.groupby(['vertical']).agg(['mean', 'std'])

dfgroup

In [None]:
dfgroup.columns

In [None]:
dg = style[['raw_X','raw_Y','vertical']].groupby(['vertical'])
dg.filter(lambda x: len(x)>1000, dropna=True)

### .droplevel

In [None]:
dfgroup.columns.droplevel(0)
# dfgroup.columns.droplevel(1)

In [None]:
# Work on a copy: assigning .columns through an alias would also flatten dfgroup's
# own two-level header, and re-running this cell would then fail.
tdf = dfgroup.copy()
tdf.columns = dfgroup.columns.droplevel(0)
tdf

# Handling Missing Data

### .dropna()

In [None]:
# Build a small frame to inject NaNs into. Take an independent copy of the first rows
# so the assignments in the next cells cannot leak into `style`.
tdf = style.head().copy()

In [None]:
tdf.iloc[0, 0] = np.nan

In [None]:
tdf.iloc[0, 1] = np.nan
tdf.iloc[0, 2] = np.nan
tdf.iloc[1, 1] = np.nan
tdf.iloc[2, 2] = np.nan

In [None]:
tdf.head()

In [None]:
tdf.dropna()

In [None]:
tdf.dropna(subset=['raw_X', 'raw_Y'] , how="all") # 

In [None]:
# Drop only the rows whose SECURITY_ID_INT is missing.
# (The previous `last_df` / 'id' names are not defined anywhere in this notebook.)
tdf.dropna(subset=['SECURITY_ID_INT'])

### .fillna

In [None]:
tdf.fillna(0)

# Combine Data Sets

### .merge

In [None]:
# Two frames that share only the keys "foo" and "bar", used by the join examples.
dfa = pd.DataFrame(dict(key=["foo", "bar", "baz"], v1=[4, 5, 6]))
dfb = pd.DataFrame(dict(key=["foo", "bar", "qux"], v2=[1, 2, 3]))

# A one-row frame whose key matches nothing in dfa.
dfm = pd.DataFrame(dict(key=["fff"], v2=[11]))

In [None]:
dfa

In [None]:
dfb

In [None]:
# Right join: every key of dfb is kept; keys that are absent from dfa get NaN in v1.
df = dfa.merge(dfb, on='key', how='right')
# NaN never compares equal to anything (including itself), so `df.v1 == np.nan`
# is always False; select the unmatched rows with isnull() instead.
df[df.v1.isnull()]

In [None]:
dfa.merge(dfb, on="key")

In [None]:
dfa.merge(dfb[["key"]], on="key")

In [None]:
dfa.merge(dfm, on="key").shape

In [None]:
pd.merge(dfa,dfb)

##### _merge
indicator

##### Q: How to get rows that appear in dfa but not dfb ? 

In [None]:
pd.merge(dfa, dfb, how='outer',indicator=True)

# Method Chaining

In [None]:
pd.merge(dfa, dfb, how='outer',indicator=True).query('_merge=="left_only"').drop("_merge", axis=1)

# Example

### .read_csv
fund style 20171229 

In [None]:
style = pd.read_csv('20171229.csv')
style.info()

In [None]:
style.head(5)

In [None]:
style.vertical.unique()

In [None]:
style.horizontal.unique()

### .sort_values
ascending : default True
inplace : bool, default False

    if True, perform operation in-place


In [None]:
style.sort_values("SECURITY_ID_INT", inplace=True)
style.head(10)

In [None]:
style.tail()

In [None]:
# .sort_index()

In [None]:
style.sort_index().head()

In [None]:
# style.head()

### .reset_index

In [None]:
# df

In [None]:
df.reset_index()

##### Q: how to avoid the old index being added as a column?

In [None]:
# df.reset_index()

In [None]:
df2 = df.reset_index(drop=True)
df2

### .set_index
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.set_index.html?highlight=set_index

In [None]:
tdf = df.reset_index()
tdf.set_index("index")

In [None]:
tdf = df.reset_index()
tdf.set_index("A")

In [None]:
# style.reset_index(inplace=True)
style.reset_index(inplace=True, drop=True )

style.head()
# style

### .index

In [None]:
filter = style[style.horizontal=='value'].index
# .indexs()
filter
# type(filter)

### .drop
axis : {0 or ‘index’, 1 or ‘columns’}, default 0
Whether to drop labels from the index (0 or ‘index’) or columns (1 or ‘columns’).
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html?highlight=drop#pandas.DataFrame.drop

In [None]:
style.drop(filter).head()

In [None]:
# Returns None: with inplace=True the rows are removed from `style` itself.
style.drop(filter, inplace=True)

In [None]:
style.info() 

In [None]:
# style[style.horizontal=='value']
# style[style.horizontal=='value']

In [None]:
style.head(5)

# reshape
https://pandas.pydata.org/pandas-docs/stable/reshaping.html?highlight=reshape

In [None]:
df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                    'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                    'baz': [1, 2, 3, 4, 5, 6],
                    'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
df

### .melt

In [None]:
pd.melt(df)

### .stack

In [None]:
df.stack()
# type(df.stack())

In [None]:
df.unstack()
# type(df.stack())

In [None]:
df.groupby("bar")['foo'].value_counts()

In [None]:
df.groupby("bar")['foo'].value_counts().unstack()

### .pivot
Return reshaped DataFrame organized by given index / column values

In [None]:
df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                    'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                    'baz': [1, 2, 3, 4, 5, 6],
                    'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
df

In [None]:
df.pivot(index='foo', columns='bar', values='baz')

In [None]:
df.pivot(index='foo', columns='bar')

In [None]:
style.head()

In [None]:
tdf = style[['raw_X','vertical', "horizontal"]].groupby(['vertical', "horizontal"]).agg(['mean', 'std'])
tdf

In [None]:
tdf.columns = tdf.columns.droplevel(0)
tdf

In [None]:
tdf = tdf.reset_index()
tdf

In [None]:
tdf.pivot(index="vertical", columns="horizontal")

# Challenge

### .shift
Shift index by desired number of periods with an optional time freq

In [None]:
# Copy the slice so that the shifted column assigned in the next cell
# modifies tdf only, not the underlying `style` frame.
tdf = style[:5].copy()
tdf

In [None]:
tdf.raw_X = tdf.raw_X.shift(1)
tdf

### .idxmax

In [None]:
# Small labelled frame: the row-wise maximum sits in A for row a and in B for rows b and c.
df = pd.DataFrame(data=dict(A=[7, 2, 3], B=[4, 5, 6]), index=['a', 'b', 'c'])

In [None]:
df.head()

In [None]:
df["max"] = df[["A", "B"]].idxmax(axis=1)

In [None]:
df.head()

In [None]:
# .cumsum

In [None]:
# Rebuild the foo/bar/baz frame here: `df` was last reassigned to the A/B idxmax
# example, which has no "bar" or "baz" column.
df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                   'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'baz': [1, 2, 3, 4, 5, 6],
                   'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
# Running total of baz within each bar group, aligned back to the original rows.
df['cumsum']=df.groupby("bar")["baz"].cumsum()
df

In [None]:
from pandas_datareader import data, wb
# https://pandas-datareader.readthedocs.io/en/latest/
# DAX = web.DataReader(name='^GDAXI', data_source='yahoo', start='2017-1-1')

In [None]:
import pandas_datareader as pdr
# pdr.get_data_fred('GS10')

In [None]:
# import pandas_datareader.data as web
# import datetime
# start = datetime.datetime(2017, 1, 1)
# end = datetime.datetime(2017, 1, 27)
# f = web.DataReader('F', 'morningstar', start, end)
# f

In [None]:
from pandas_datareader import data, wb
import numpy as np

from datetime import datetime

# start = datetime.datetime(2010, 1, 1)
# end = datetime.datetime(2017, 1, 1)
# df = data.get_data_yahoo('MS', start, end)

# type(datetime.today().year)
# for y in reversed(range(2003, 2020)):
#     print y

dates = pd.date_range('1/1/2000', periods=8)
dates
# df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
# df


# pd.date_range(start='2019-01-01', end='2019-03-01')


In [None]:
import numpy as np

In [None]:
df.head()

### compare two columns

In [None]:
# The element-wise comparison already yields booleans, so wrapping it in
# np.where(..., True, False) is unnecessary.
# NOTE(review): assumes `df` holds price data with High/Low columns
# (e.g. loaded via pandas_datareader) — confirm which cell is meant to provide it.
df["equal"] = df.High == df.Low
df.head()

# .to_csv

In [None]:
df.to_csv("qa_err_items.csv", encoding='utf-8')

# numpy 

In [None]:
a_np = np.arange(10000)
# a_py = range(a)

In [None]:
a_np

In [None]:
import numexpr as ne

In [None]:
def f6(a):
    """Evaluate ``abs(cos(a))**0.5 + sin(2+3*a)`` with numexpr.

    The expression string refers to the argument by the name ``a``, so the
    parameter must keep that name. numexpr's thread count is set to 4 before
    the expression is evaluated.
    """
    ne.set_num_threads(4)
    expression = 'abs(cos(a))**0.5 + sin(2+3*a)'
    return ne.evaluate(expression)

In [None]:
r = f6(a_np)

In [None]:
r

In [None]:
# Changing column dtypes: pd.to_numeric() and .astype()
# https://stackoverflow.com/questions/15891038/change-data-type-of-columns-in-pandas/16134561
    

In [None]:


    When introducing NAs into an existing Series or DataFrame via reindex or some other means, boolean and integer types will be promoted to a different dtype in order to store the NAs. These are summarized by this table:

| Typeclass | Promotion dtype for storing NAs |
|-----------|---------------------------------|
| floating  | no change                       |
| object    | no change                       |
| integer   | cast to float64                 |
| boolean   | cast to object                  |