In [1]:
 """
 ------> axis = 1
 |
 |
 |
 v
axis = 0
""";

## Reindexing

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Float Series whose labels are deliberately not in alphabetical order,
# so that the reindex call below visibly rearranges them.
obj = pd.Series(
    data=[4.5, 7.2, -5.3, 3.6],
    index=list("dbac"),
)

In [4]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [5]:
# Rearrange obj to follow the new label order; labels that obj does not have
# (here 'e') are introduced with NaN.
obj2 = obj.reindex(list("abcde"))

In [6]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

For ordered data like time series, it may be desirable to do some interpolation or filling
of values when reindexing. The method option allows us to do this, using a
method such as **ffill**, which forward-fills the values:

In [7]:
# String Series on a gappy integer index (0, 2, 4) -- the gaps are what the
# forward-fill reindex below will populate.
obj3 = pd.Series(
    ["blue", "purple", "yellow"],
    index=[0, 2, 4],
)

In [8]:
obj3

0      blue
2    purple
4    yellow
dtype: object

In [9]:
# Reindex onto 0..5; method="ffill" carries the last seen value forward into
# the newly created labels (1, 3, 5).
obj3.reindex(index=range(6), method="ffill")

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [10]:
# 3x3 integer frame; row label 'b' is intentionally absent so that the row
# reindex below has something to fill in.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=["a", "c", "d"],
    columns=["Ohio", "Texas", "California"],
)

In [11]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [12]:
frame.reindex(["a","b","c","d"])

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [13]:
# The columns can be reindexed with the columns keyword:

In [14]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [15]:
states = ['Texas', 'Utah', 'California']

In [16]:
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [17]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


## Dropping Entries from an Axis

In [18]:
# Integer Series 0..4 labelled 'a'..'e', used for the drop examples.
obj = pd.Series(np.arange(5), index=list("abcde"))

In [19]:
obj

a    0
b    1
c    2
d    3
e    4
dtype: int32

In [20]:
new_obj = obj.drop("c")

In [21]:
new_obj

a    0
b    1
d    3
e    4
dtype: int32

In [22]:
obj

a    0
b    1
c    2
d    3
e    4
dtype: int32

In [23]:
obj.drop(["d","c"])

a    0
b    1
e    4
dtype: int32

In [24]:
# 4x4 integer frame with state names as row labels and number words as
# column labels.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)

In [25]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [26]:
data.drop(["Colorado","Ohio"])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [27]:
# Drop a column by label; returns a new frame and leaves `data` untouched.
data.drop(columns="two")

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


Many functions, like drop, which modify the size or shape of a Series or DataFrame,
can manipulate an object **in-place** (by passing `inplace=True`) without returning a new object.

In [28]:
obj

a    0
b    1
c    2
d    3
e    4
dtype: int32

In [29]:
# In-place drop: mutates `obj` itself and returns None.
# NOTE: not idempotent -- running this cell a second time raises KeyError,
# because the label 'c' has already been removed.
obj.drop(labels="c", inplace=True)

In [30]:
obj

a    0
b    1
d    3
e    4
dtype: int32

# Indexing, Selection and Filtering

**df[val]** Select single column or sequence of columns from the DataFrame; special case 
conveniences: boolean array (filter rows), slice (slice rows), or boolean DataFrame
(set values based on some criterion)<br>
**df.loc[val]** Selects single row or subset of rows from the DataFrame by label <br>
**df.loc[:, val]** Selects single column or subset of columns by label <br>
**df.loc[val1, val2]** Select both rows and columns by label <br>
**df.iloc[where]** Selects single row or subset of rows from the DataFrame by integer position <br>

**df.iloc[:, where]** Selects single column or subset of columns by integer position <br>
**df.iloc[where_i, where_j]** Select both rows and columns by integer position <br>
**df.at[label_i, label_j]** Select a single scalar value by row and column label <br>
**df.iat[i, j]** Select a single scalar value by row and column position (integers)<br>
**reindex method** Select either rows or columns by labels <br>
**get_value, set_value** methods Select single value by row and column label (deprecated and removed in pandas 1.0; use **df.at** / **df.iat** instead)

### Series

In [31]:
obj = pd.Series(np.arange(4), index = "a b c d".split())

In [32]:
obj

a    0
b    1
c    2
d    3
dtype: int32

In [33]:
obj["b"]

1

In [34]:
# Positional access, like indexing a list. Use .iloc explicitly: letting an
# integer key fall back to a position in obj[1] on a string-labelled Series
# is deprecated (pandas >= 2.1) -- plain [] is meant for labels.
obj.iloc[1]

1

In [35]:
# Select several elements by position. .iloc makes the positional intent
# explicit; obj[[1, 3]] on a string-labelled Series relies on the deprecated
# integer-as-position fallback of plain [] (pandas >= 2.1).
obj.iloc[[1, 3]]

b    1
d    3
dtype: int32

In [36]:
obj[obj < 2]

a    0
b    1
dtype: int32

In [37]:
# Label slices include BOTH endpoints, so this overwrites 'b' and 'c'.
obj.loc["b":"c"] = 100

In [38]:
obj

a      0
b    100
c    100
d      3
dtype: int32

## DataFrame

In [39]:
# 4x4 integer frame (state names x number words) for the selection examples.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=["Ohio", "Colorado", "Utah", "New_York"],
    columns=["one", "two", "three", "four"],
)

In [40]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New_York,12,13,14,15


In [41]:
data["two"]

Ohio         1
Colorado     5
Utah         9
New_York    13
Name: two, dtype: int32

In [42]:
data[["three","one"]]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New_York,14,12


In [43]:
# A bare integer slice selects rows (not columns): the first two rows here.
data.iloc[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [44]:
data["three"] > 5

Ohio        False
Colorado     True
Utah         True
New_York     True
Name: three, dtype: bool

In [45]:
# Keep only the rows whose 'three' value exceeds 5 (boolean-mask row filter).
data.loc[data["three"] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New_York,12,13,14,15


In [46]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New_York,False,False,False,False


In [47]:
# Overwrite, in place, every cell whose value is below 5 with zero.
data[data.lt(5)] = 0

In [48]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New_York,12,13,14,15


## Selection with loc and iloc

In [49]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New_York,12,13,14,15


In [50]:
# Important

In [51]:
data.loc["Colorado",["two","three"]] 

two      5
three    6
Name: Colorado, dtype: int32

In [52]:
data.iloc[2,[3,0,1]]

four    11
one      8
two      9
Name: Utah, dtype: int32

In [53]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [54]:
data.iloc[[1,2],[3,0,1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [55]:
# Rows where 'three' > 5, restricted to the first three columns -- done in a
# single .loc call instead of chaining two indexing operations.
data.loc[data["three"] > 5, data.columns[:3]]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New_York,12,13,14


### Integer Indexes

In [56]:
ser = pd.Series(np.arange(3))

In [57]:
ser

0    0
1    1
2    2
dtype: int32

In [58]:
## !!! ser[-1] raises a KeyError here: with an integer index, -1 is looked up as a label (pandas cannot tell whether a label or a position is meant), and no label -1 exists.

In [59]:
## On the other hand, with a non-integer index, there is no potential for ambiguity:

In [60]:
ser2 = pd.Series(np.arange(3),index = ["a","b","c"])

In [61]:
ser2

a    0
b    1
c    2
dtype: int32

In [62]:
# With a non-integer index, -1 can only mean "last position". Spell that out
# with .iloc: the integer-as-position fallback of plain ser2[-1] is
# deprecated (pandas >= 2.1), since [] is reserved for label lookup.
ser2.iloc[-1]

2

To keep things consistent, if you have an axis index containing integers, data selection
will always be label-oriented. For more precise handling, use loc (for labels) or iloc
(for integers):

In [63]:
ser[:1]

0    0
dtype: int32

In [64]:
ser.loc[:1]

0    0
1    1
dtype: int32

In [65]:
ser.iloc[:1]

0    0
dtype: int32

## Arithmetic and Data Alignment

In [66]:
# Two float Series with only partially overlapping labels ('a', 'c', 'e' are
# shared), used to show label alignment in arithmetic.
s1 = pd.Series(
    data=[7.3, -2.5, 3.4, 1.5],
    index=list("acde"),
)
s2 = pd.Series(
    data=[-2.1, 3.6, -1.5, 4, 3.1],
    index=list("acefg"),
)

In [67]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [68]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [69]:
# Operands are aligned on their labels before adding; any label present in
# only one of the two Series yields NaN in the result.
s1.add(s2)

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [70]:
df1 = pd.DataFrame(np.arange(9).reshape((3,3)), columns = list("bcd"), index = ["Ohio", "Texas", "Colorado"])

In [71]:
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [72]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


Since the 'c' and 'e' columns are not found in both DataFrame objects, they appear
as all missing in the result. The same holds for the rows whose labels are not common
to both objects.

#### Arithmetic methods with fill values

In [73]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
    columns=list('abcd'))

In [74]:
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
    columns=list('abcde'))

In [75]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [76]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [77]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [78]:
# fill values

In [79]:
df1.add(df2,fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


#### Operations between DataFrame and Series

In [80]:
arr = np.arange(12).reshape((3,4))

In [81]:
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [82]:
arr[0]

array([0, 1, 2, 3])

In [83]:
arr - arr[0]

array([[0, 0, 0, 0],
       [4, 4, 4, 4],
       [8, 8, 8, 8]])

In [84]:
# 4x3 integer frame (states x columns b/d/e) for the DataFrame-vs-Series
# arithmetic examples.
frame = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=["Utah", "Ohio", "Texas", "Oregon"],
    columns=["b", "d", "e"],
)

In [85]:
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [86]:
series = frame.iloc[0]

In [87]:
series

b    0
d    1
e    2
Name: Utah, dtype: int32

In [88]:
frame - series

Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


In [89]:
series2 = pd.Series(range(3), index = list("bef"))

In [90]:
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [91]:
series2

b    0
e    1
f    2
dtype: int64

In [92]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [93]:
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [94]:
series3 = frame["d"]

In [95]:
series3

Utah       1
Ohio       4
Texas      7
Oregon    10
Name: d, dtype: int32

In [96]:
frame - series3

Unnamed: 0,Ohio,Oregon,Texas,Utah,b,d,e
Utah,,,,,,,
Ohio,,,,,,,
Texas,,,,,,,
Oregon,,,,,,,


In [97]:
# But if we use df.sub(..., axis=0), the Series index is matched against the row labels instead:

In [98]:
frame.sub(series3,axis=0)

Unnamed: 0,b,d,e
Utah,-1,0,1
Ohio,-1,0,1
Texas,-1,0,1
Oregon,-1,0,1


In [99]:
frame.sub(series3,axis=1)

Unnamed: 0,Ohio,Oregon,Texas,Utah,b,d,e
Utah,,,,,,,
Ohio,,,,,,,
Texas,,,,,,,
Oregon,,,,,,,


In [100]:
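# By default, DataFrame-Series add/sub matches the Series index against the DataFrame's columns and broadcasts down the rows; pass axis=0 to match on the row labels instead.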

## Function Application and Mapping

In [101]:
# 4x3 frame of standard-normal samples for the function-application examples.
# A fixed seed makes the values (and every output derived from them below)
# reproducible across kernel restarts; the seed value itself is arbitrary.
frame = pd.DataFrame(
    np.random.default_rng(42).standard_normal((4, 3)),
    columns=list("bde"),
    index=["Utah", "Ohio", "Texas", "Oregon"],
)

In [102]:
frame

Unnamed: 0,b,d,e
Utah,-0.669752,-0.755303,-0.839172
Ohio,-0.721374,-0.135876,1.031913
Texas,2.017322,0.299521,1.42449
Oregon,-0.349637,-0.130271,-0.858596


#### np.abs(frame)

In [103]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.669752,0.755303,0.839172
Ohio,0.721374,0.135876,1.031913
Texas,2.017322,0.299521,1.42449
Oregon,0.349637,0.130271,0.858596


In [104]:
frame

Unnamed: 0,b,d,e
Utah,-0.669752,-0.755303,-0.839172
Ohio,-0.721374,-0.135876,1.031913
Texas,2.017322,0.299521,1.42449
Oregon,-0.349637,-0.130271,-0.858596


#### frame.apply(function)

In [105]:
frame.max().max()

2.0173215531952016

In [106]:
frame.max()

b    2.017322
d    0.299521
e    1.424490
dtype: float64

In [107]:
frame.apply(lambda x: x.max() - x.min())

b    2.738695
d    1.054823
e    2.283086
dtype: float64

In [108]:
frame.max(axis = "columns")

Utah     -0.669752
Ohio      1.031913
Texas     2.017322
Oregon   -0.130271
dtype: float64

In [109]:
frame.apply(lambda x: x.max()-x.min(),axis = "columns")

Utah      0.169420
Ohio      1.753286
Texas     1.717801
Oregon    0.728325
dtype: float64

In [110]:
frame

Unnamed: 0,b,d,e
Utah,-0.669752,-0.755303,-0.839172
Ohio,-0.721374,-0.135876,1.031913
Texas,2.017322,0.299521,1.42449
Oregon,-0.349637,-0.130271,-0.858596


In [111]:
# The function passed to apply can also return a Series, giving one result row per label:

In [112]:
frame.apply(lambda x: pd.Series([x.min(),x.max()],index = ["min","max"]))

Unnamed: 0,b,d,e
min,-0.721374,-0.755303,-0.858596
max,2.017322,0.299521,1.42449


In [146]:
# Fresh 4x3 frame of standard-normal samples for the element-wise formatting
# example. Seeded so the displayed values are reproducible on every run; the
# seed value itself is arbitrary.
frame = pd.DataFrame(
    np.random.default_rng(146).standard_normal((4, 3)),
    columns=list("bde"),
    index=["Utah", "Ohio", "Texas", "Oregon"],
)

In [147]:
frame

Unnamed: 0,b,d,e
Utah,-0.269109,-0.627002,-1.857306
Ohio,-0.642211,1.488526,0.996939
Texas,-0.830369,-0.757233,0.798762
Oregon,0.704448,0.101086,1.818861


In [148]:
# Element-wise operation on a DataFrame: format every cell as a string with
# two decimals. DataFrame.applymap was renamed to DataFrame.map in
# pandas 2.1 (applymap is deprecated), so prefer .map when it exists and
# fall back to .applymap on older pandas versions.
(frame.map if hasattr(frame, "map") else frame.applymap)(lambda x: "%.2f" % x)

Unnamed: 0,b,d,e
Utah,-0.27,-0.63,-1.86
Ohio,-0.64,1.49,1.0
Texas,-0.83,-0.76,0.8
Oregon,0.7,0.1,1.82


In [149]:
# apply ---> Work on row or column

In [150]:
# applymap ---> Element wise

In [151]:
# map ---> Element-wise on a Series

## Sorting and Ranking

#### df.sort_index()

In [113]:
obj = pd.Series(range(4), index = list("dabc"))

In [114]:
obj

d    0
a    1
b    2
c    3
dtype: int64

In [115]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [116]:
frame = pd.DataFrame(np.arange(8).reshape((2,4)),
                    index = ["three","one"],
                    columns = ["d","a","b","c"])

In [117]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [118]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [119]:
frame.sort_index(axis = 1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [120]:
frame.sort_index(axis = 1, ascending = False )

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


#### df.sort_values()

In [121]:
obj =pd.Series([4, 7, -3, 2])

In [122]:
obj

0    4
1    7
2   -3
3    2
dtype: int64

In [123]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [124]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})

In [125]:
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [126]:
frame.sort_values(by = "b")

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [127]:
frame.sort_values(by = "a")

Unnamed: 0,b,a
0,4,0
2,-3,0
1,7,1
3,2,1


In [128]:
frame.sort_values(by = "b", ascending = False)

Unnamed: 0,b,a
1,7,1
0,4,0
3,2,1
2,-3,0


In [129]:
frame.sort_values(by = ["a","b"])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


#### rank()  Ranking

Ranking assigns ranks from one through the number of valid data points in an array.
The rank methods for Series and DataFrame are the place to look; by default rank
breaks ties by assigning each group the mean rank

In [130]:
obj = pd.Series([7,-5,7,4,2,0,4])

In [131]:
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [132]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [133]:
# Method of Rank

**'average'** Default: assign the average rank to each entry in the equal group <br>
**'min'** Use the minimum rank for the whole group <br>
**'max'** Use the maximum rank for the whole group <br>
**'first'** Assign ranks in the order the values appear in the data <br>
**'dense'** Like method='min', but ranks always increase by 1 in between groups rather than the number of equal
elements in a group <br>

In [134]:
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [135]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
    'c': [-2, 5, 8, -2.5]})

In [136]:
# Order the columns alphabetically (a, b, c). Rebind the name instead of
# using inplace=True: in-place mutation brings no performance benefit and
# makes notebook state harder to follow.
frame = frame.sort_index(axis=1)

In [137]:
frame

Unnamed: 0,a,b,c
0,0,4.3,-2.0
1,1,7.0,5.0
2,0,-3.0,8.0
3,1,2.0,-2.5


In [138]:
frame.rank(axis=1)

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


## Axis Indexes with Duplicate Labels

In [139]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])

In [140]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [141]:
obj["a"]

a    0
a    1
dtype: int64

In [142]:
obj.index.is_unique

False

In [143]:
# 4x3 frame of standard-normal samples whose row index contains duplicate
# labels ('a' twice, 'b' twice). Seeded so the displayed values are
# reproducible on every run; the seed value itself is arbitrary.
df = pd.DataFrame(
    np.random.default_rng(143).standard_normal((4, 3)),
    index=["a", "a", "b", "b"],
)

In [144]:
df

Unnamed: 0,0,1,2
a,0.52393,-0.482178,0.707551
a,-0.027988,0.183549,-1.050302
b,-1.464878,0.374412,-0.696223
b,-1.303087,-0.169837,-0.465133
