# Pandas Questions

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [4]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

#### How to convert the index of a series into a column of a dataframe?

In [4]:
pd.DataFrame(columns=ser.index)

Unnamed: 0,a,b,c,e,d,f,g,h,i,j,...,q,r,s,t,u,v,w,x,y,z


#### Using List operator for strings

In [5]:
['abc']

['abc']

In [6]:
list('abc')

['a', 'b', 'c']

#### How to create lags and leads of a column in a dataframe?

In [7]:
df = pd.DataFrame(np.random.randint(1, 100, 20).reshape(-1, 4), columns = list('abcd'))

In [8]:
df

Unnamed: 0,a,b,c,d
0,63,35,49,76
1,46,53,35,53
2,83,43,53,88
3,78,63,76,83
4,72,50,96,60


In [9]:
df.a.shift(1)

0     NaN
1    63.0
2    46.0
3    83.0
4    78.0
Name: a, dtype: float64

#### How to get the n’th largest value of a column when grouped by another column?

In [10]:
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                   'rating': np.random.rand(9),
                   'price': np.random.randint(0, 15, 9)})

In [11]:
df

Unnamed: 0,fruit,rating,price
0,apple,0.932899,2
1,banana,0.454045,3
2,orange,0.573506,10
3,apple,0.642715,4
4,banana,0.343851,6
5,orange,0.039015,0
6,apple,0.513154,1
7,banana,0.989769,6
8,orange,0.938055,13


In [13]:
df[df.groupby('fruit',).cumcount() == 1]

Unnamed: 0,fruit,rating,price
3,apple,0.642715,4
4,banana,0.343851,6
5,orange,0.039015,0


#### How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?

In [20]:
np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 5, [12]))

In [21]:
ser.value_counts()[:2]

4    5
1    4
dtype: int64

In [185]:
np.where(ser.isin(ser.value_counts().index[:2].tolist()),ser,"other")

array(['3', '1', 'other', '1', '3', '3', '1', '3', 'other', '1', 'other',
       'other'], dtype='<U21')

#### How to find the positions of numbers that are multiples of 3 from a series?

In [22]:
ser = pd.Series(np.random.randint(1, 10, 7))
ser

0    2
1    3
2    5
3    1
4    6
5    6
6    1
dtype: int64

In [27]:
np.where(ser%3 == 0)

(array([5, 6]),)

In [26]:
np.argwhere(ser % 3==0)

array([[5],
       [6]])

#### How to reshape a dataframe

- Reshape only works for numpy array

In [7]:
df = pd.DataFrame(np.random.randint(1,100, 100).reshape(10,-1))
df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,10,66,1,77,34,76,7,6,23,35
1,47,62,52,79,71,90,11,97,87,74


In [8]:
pd.DataFrame(df.values.reshape(5,20))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,10,66,1,77,34,76,7,6,23,35,47,62,52,79,71,90,11,97,87,74
1,82,76,56,8,43,67,14,93,53,41,77,17,81,17,86,56,86,96,6,24
2,7,21,54,79,22,6,80,66,11,43,19,50,11,34,29,62,80,26,63,16
3,93,59,64,90,56,1,44,79,53,52,96,41,14,62,87,53,50,20,75,12
4,44,12,73,74,99,23,68,52,13,15,4,85,79,93,27,33,43,7,35,6


#### How to replace both the diagonals of dataframe with 0

In [34]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,14,24,45,47,53,12,50,87,25,56
1,4,9,44,32,60,95,7,3,53,25
2,75,98,62,95,13,64,3,82,59,34
3,41,24,85,79,43,18,67,50,89,68
4,93,58,40,49,17,3,23,6,65,54
5,70,43,4,18,75,11,2,92,91,22
6,89,38,37,80,1,64,1,8,82,48
7,80,25,92,75,98,6,90,38,38,96
8,9,26,80,25,96,54,57,80,16,3
9,44,39,30,90,23,97,3,63,44,63


In [35]:
for x,y in list(zip(df.index,df.columns)):
    df.iloc[x,y] = 0
    
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,24,45,47,53,12,50,87,25,56
1,4,0,44,32,60,95,7,3,53,25
2,75,98,0,95,13,64,3,82,59,34
3,41,24,85,0,43,18,67,50,89,68
4,93,58,40,49,0,3,23,6,65,54
5,70,43,4,18,75,0,2,92,91,22
6,89,38,37,80,1,64,0,8,82,48
7,80,25,92,75,98,6,90,0,38,96
8,9,26,80,25,96,54,57,80,0,3
9,44,39,30,90,23,97,3,63,44,0


In [47]:
for x,y in list(zip(df.index,-df.columns-1)):
    df.iloc[x,y] = 0
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,24,45,47,53,12,50,87,25,0
1,4,0,44,32,60,95,7,3,0,0
2,75,98,0,95,13,64,3,0,0,34
3,41,24,85,0,43,18,0,0,89,68
4,93,58,40,49,0,0,0,6,65,54
5,70,43,4,18,0,0,2,92,91,22
6,89,38,37,0,0,64,0,8,82,48
7,80,25,0,0,98,6,90,0,38,96
8,9,0,0,25,96,54,57,80,0,3
9,0,0,30,90,23,97,3,63,44,0


In [44]:
for i in range(df.shape[0]):
    df.iat[i, i] = 0
    df.iat[df.shape[0]-i-1, i] = 0

#### How to create a column that contains the penultimate value in each row?

In [9]:
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,83,36,47,41,50,89,12,49,30,19
1,60,99,23,95,13,51,56,22,41,14
2,58,32,21,96,15,88,12,1,8,85
3,72,56,5,35,5,42,8,53,78,31
4,84,58,37,55,25,54,46,28,46,35
5,11,33,6,9,24,8,39,43,71,16
6,1,51,39,44,27,9,15,47,67,56
7,48,41,72,62,55,12,63,24,91,69


In [49]:
df.apply(lambda x:x.sort_values().iloc[-2],axis = 1)

0    86
1    67
2    84
3    80
4    68
5    95
6    50
7    54
dtype: int64

#### Which column contains the highest number of row-wise maximum values?

In [14]:
df = pd.DataFrame(np.random.randint(1,100, 40).reshape(10, -1))
df

Unnamed: 0,0,1,2,3
0,1,57,80,47
1,92,33,52,12
2,46,44,80,18
3,5,65,37,64
4,47,67,95,1
5,73,89,50,4
6,82,68,37,29
7,60,78,49,60
8,74,52,2,20
9,97,58,17,80


In [35]:
bool = df.rank(axis = 1) ==  df.shape[1]
bool.sum()

0    4
1    3
2    3
3    0
dtype: int64

In [36]:
df.apply(np.argmax,axis =1).value_counts()

0    4
2    3
1    3
dtype: int64

#### Which column contains the highest row-wise maximum values?

In [37]:
### np.argmax is a numpy function 
x = df[[1,2]]
np.argmax(x.values,axis=0)

array([5, 4], dtype=int64)

In [5]:
df.apply(np.argmax,axis =1)

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  return getattr(obj, method)(*args, **kwds)


0    3
1    2
2    0
3    3
4    2
5    0
6    3
7    1
8    0
9    3
dtype: int64

#### Which columns(not values) contains the 2nd highest row-wise maximum values?

In [39]:
np.random.seed(123)

df = pd.DataFrame(np.random.choice(100,50).reshape(5,10),columns=list('abcdefghij'))
df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,66,92,98,17,83,57,86,97,96,47
1,73,32,46,96,25,83,78,36,96,80
2,68,49,55,67,2,84,39,66,84,47
3,61,48,7,99,92,52,97,85,94,27
4,34,97,76,40,3,69,64,75,34,58


In [40]:
df.rank(method='first',axis =1)

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,4.0,7.0,10.0,1.0,5.0,3.0,6.0,9.0,8.0,2.0
1,5.0,2.0,4.0,9.0,1.0,8.0,6.0,3.0,10.0,7.0
2,8.0,4.0,5.0,7.0,1.0,9.0,2.0,6.0,10.0,3.0
3,5.0,3.0,1.0,10.0,7.0,4.0,9.0,6.0,8.0,2.0
4,2.0,10.0,9.0,4.0,1.0,7.0,6.0,8.0,3.0,5.0


In [45]:
df.columns[np.where(df.rank(method='first',axis =1) == 9)[1]]

Index(['h', 'd', 'f', 'g', 'c'], dtype='object')

In [43]:
# alternative
df.apply(lambda x:x.sort_values(ascending = False).iloc[[1]].index[0],axis = 1)

0    h
1    d
2    f
3    g
4    c
dtype: object

#### How to reverse the rows of a dataframe?

In [65]:
df = pd.DataFrame(np.arange(25).reshape(5, -1))
df

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [66]:
df.iloc[::-1, :]

Unnamed: 0,0,1,2,3,4
4,20,21,22,23,24
3,15,16,17,18,19
2,10,11,12,13,14
1,5,6,7,8,9
0,0,1,2,3,4


#### How to reshape a dataframe to the largest possible square after removing the negative values? ----- LEFT

In [82]:
df = pd.DataFrame(np.random.randint(-20, 50, 100).reshape(10,-1))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-19,17,35,14,-14,-20,-19,4,29,-2
1,44,46,29,9,40,28,43,-8,-12,47
2,-19,45,45,-15,5,-18,34,8,-8,-7
3,22,24,-1,-20,18,15,2,18,22,8
4,-3,16,4,-8,15,15,11,47,-14,-15
5,25,-11,20,30,2,21,41,31,33,1
6,-1,-1,45,-9,39,27,49,24,7,-1
7,11,-16,-5,26,23,37,47,34,37,4
8,18,12,-7,19,25,19,41,-1,-4,39
9,-1,0,-17,45,47,1,-2,49,6,-9


- flatten makes a 2d array to 1d array

In [84]:
 df[df > 0].values

array([[nan, 17., 35., 14., nan, nan, nan,  4., 29., nan],
       [44., 46., 29.,  9., 40., 28., 43., nan, nan, 47.],
       [nan, 45., 45., nan,  5., nan, 34.,  8., nan, nan],
       [22., 24., nan, nan, 18., 15.,  2., 18., 22.,  8.],
       [nan, 16.,  4., nan, 15., 15., 11., 47., nan, nan],
       [25., nan, 20., 30.,  2., 21., 41., 31., 33.,  1.],
       [nan, nan, 45., nan, 39., 27., 49., 24.,  7., nan],
       [11., nan, nan, 26., 23., 37., 47., 34., 37.,  4.],
       [18., 12., nan, 19., 25., 19., 41., nan, nan, 39.],
       [nan, nan, nan, 45., 47.,  1., nan, 49.,  6., nan]])

In [81]:
arr = df[df > 0].values.flatten()
arr_qualified = arr[~np.isnan(arr)]
arr_qualified

array([28., 42., 15., 41., 48., 38.,  2., 18., 33., 28., 12., 48.,  8.,
       15., 41., 18., 12.,  3.,  1., 40., 49., 26., 29.,  3., 42., 29.,
       17.,  9.,  7., 13., 37., 14.,  9., 24.,  4., 17., 45.,  3., 47.,
       31., 11., 37., 44.,  7.,  3., 42., 20., 22., 27., 32., 20., 31.,
       42., 40., 29., 27., 16., 46., 23., 34., 25., 25., 13., 17., 19.,
       27.,  9., 23., 12., 18., 17.,  8.])

In [93]:
ser = pd.Series(np.logspace(-2, 2, 30))
ser

0       0.010000
1       0.013738
2       0.018874
3       0.025929
4       0.035622
5       0.048939
6       0.067234
7       0.092367
8       0.126896
9       0.174333
10      0.239503
11      0.329034
12      0.452035
13      0.621017
14      0.853168
15      1.172102
16      1.610262
17      2.212216
18      3.039195
19      4.175319
20      5.736153
21      7.880463
22     10.826367
23     14.873521
24     20.433597
25     28.072162
26     38.566204
27     52.983169
28     72.789538
29    100.000000
dtype: float64

#### How to get the last n rows of a dataframe with row sum > 100?

In [41]:
df = pd.DataFrame(np.random.randint(10, 40, 60).reshape(-1, 4))
df

Unnamed: 0,0,1,2,3
0,28,28,16,38
1,32,26,37,13
2,20,17,18,11
3,36,11,37,31
4,39,33,14,23
5,30,21,18,27
6,17,39,20,36
7,14,29,29,15
8,17,11,16,20
9,19,13,39,39


In [99]:
df[df.apply(sum,axis=1) > 100]

Unnamed: 0,0,1,2,3
0,39,32,30,37
1,30,29,35,33
3,36,29,10,35
4,25,18,20,39
6,36,19,32,32
12,36,19,38,35
13,30,20,32,19


#### How to find the position of the nth largest value greater than a given value?

In [101]:
ser = pd.Series(np.random.randint(1, 100, 15))
ser

0     68
1     10
2     17
3     32
4     14
5     64
6     74
7     86
8     42
9     93
10    65
11    73
12    48
13    54
14    66
dtype: int64

In [114]:
ser[ser > ser.mean()].sort_values().index[1]

5

In [107]:
np.argwhere(ser > ser.mean())[1]

array([5])

#### How to get the row number of the nth largest value in a column?

In [13]:
np.random.seed(123)
df = pd.DataFrame(np.random.randint(1, 30, 30).reshape(10,-1), columns=list('abc'))
df

Unnamed: 0,a,b,c
0,14,3,29
1,3,7,18
2,20,11,28
3,26,23,2
4,1,18,16
5,10,1,15
6,1,16,26
7,20,15,5
8,1,17,5
9,18,24,4


In [18]:
df.apply(np.argsort,axis = 1)

Unnamed: 0,a,b,c
0,1,0,2
1,0,1,2
2,1,0,2
3,2,1,0
4,0,2,1
5,1,0,2
6,0,1,2
7,2,1,0
8,0,2,1
9,2,0,1


In [51]:
### Argsort() -- imp
df[df.a.argsort() ==5]

Unnamed: 0,a,b,c
4,1,18,16


In [47]:
df.apply(lambda x:x.sort_values(ascending = False).iloc[[1]].index[0],axis = 0)

a    2
b    0
c    4
d    1
e    0
f    1
g    0
h    3
i    0
j    4
dtype: int64

In [53]:
df.a.argsort()[::-1][5]

0

#### How to create a primary key index by combining relevant columns?

In [139]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv', usecols=[0,1,2,3,5])
df.head(2)

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Max.Price
0,Acura,Integra,Small,12.9,18.8
1,,Legend,Midsize,29.2,38.7


In [142]:
df[['Manufacturer', 'Model', 'Type']] = df[['Manufacturer', 'Model', 'Type']].fillna('missing')
df.index = df.Manufacturer + '_' + df.Model + '_' + df.Type
df.head(2)

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Max.Price
Acura_Integra_Small,Acura,Integra,Small,12.9,18.8
missing_Legend_Midsize,missing,Legend,Midsize,29.2,38.7


#### How to filter every nth row in a dataframe?

In [43]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
df.head(2)


Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25.0,31.0,,Front,...,5.0,177.0,102.0,68.0,37.0,26.5,,2705.0,non-USA,Acura Integra
1,,Legend,Midsize,29.2,33.9,38.7,18.0,25.0,Driver & Passenger,Front,...,5.0,195.0,115.0,71.0,38.0,30.0,15.0,3560.0,non-USA,Acura Legend


In [170]:
df.iloc[np.where(df.index%20 == 0)].head(2)

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25.0,31.0,,Front,...,5.0,177.0,102.0,68.0,37.0,26.5,,2705.0,non-USA,Acura Integra
20,Chrysler,LeBaron,Compact,14.5,15.8,17.1,23.0,28.0,Driver & Passenger,Front,...,6.0,183.0,104.0,68.0,41.0,30.5,14.0,3085.0,USA,Chrysler LeBaron


- using :: in pandas

https://stackoverflow.com/questions/3453085/what-is-double-colon-in-python-when-subscripting-sequences

s[i:j:k] is, according to the documentation, "slice of s from i to j with step k". When i and j are absent, the whole sequence is assumed and thus s[::k] means "every k-th item".

In [174]:
df.iloc[::-2,:]

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
92,,850,Midsize,24.8,26.7,28.5,20.0,28.0,Driver & Passenger,Front,...,5.0,184.0,105.0,69.0,38.0,30.0,15.0,3245.0,non-USA,Volvo 850
90,Volkswagen,Corrado,Sporty,22.9,23.3,23.7,18.0,25.0,,Front,...,4.0,159.0,97.0,66.0,36.0,26.0,15.0,2810.0,non-USA,Volkswagen Corrado
88,Volkswagen,Eurovan,Van,16.6,19.7,22.7,17.0,21.0,,Front,...,7.0,187.0,115.0,72.0,38.0,34.0,,3960.0,,Volkswagen Eurovan
86,Toyota,Previa,Van,,22.7,26.6,18.0,22.0,Driver only,4WD,...,7.0,187.0,113.0,71.0,41.0,35.0,,3785.0,non-USA,Toyota Previa
84,Toyota,Celica,Sporty,14.2,18.4,22.6,25.0,32.0,,Front,...,4.0,174.0,99.0,69.0,,23.0,13.0,2950.0,non-USA,Toyota Celica
82,Suzuki,Swift,,7.3,8.6,,39.0,43.0,,Front,...,4.0,161.0,93.0,,34.0,27.5,10.0,1965.0,non-USA,Suzuki Swift
80,Subaru,Loyale,Small,10.5,10.9,11.3,25.0,30.0,,4WD,...,5.0,175.0,97.0,65.0,35.0,27.5,15.0,2490.0,non-USA,Subaru Loyale
78,Saturn,SL,Small,9.2,,12.9,,38.0,Driver only,Front,...,5.0,176.0,102.0,68.0,40.0,26.5,,2495.0,USA,Saturn SL
76,Pontiac,Bonneville,Large,19.4,24.4,29.4,19.0,28.0,Driver & Passenger,Front,...,6.0,177.0,111.0,74.0,43.0,30.5,18.0,3495.0,USA,Pontiac Bonneville
74,Pontiac,Firebird,,14.0,17.7,21.4,19.0,28.0,Driver & Passenger,Rear,...,4.0,196.0,101.0,75.0,43.0,25.0,13.0,3240.0,USA,Pontiac Firebird


#### How to convert the first character of each element in a series to uppercase?

In [200]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])
ser

0     how
1      to
2    kick
3    ass?
dtype: object

In [203]:
ser.str.title()

0     How
1      To
2    Kick
3    Ass?
dtype: object

####  How to get the positions of items of series A in another series B?

In [48]:
# Input
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

# Solution 1
[np.where(i == ser1)[0].tolist()[0] for i in ser2]

# Solution 2
[pd.Index(ser1).get_loc(i) for i in ser2]

[5, 4, 0, 8]

In [80]:
for index,value in zip(ser1.index,ser1):
    if value in ser2.values:
        print(index,value)

0 10
4 3
5 1
8 13


In [77]:
for x in set(ser1) & set(ser2):
    print(x)
    #print(ser1[ser1 == x].index[0])

1
10
3
13


In [9]:
li = []
for x in ser2:
    t = pd.Index(ser1).get_loc(x)
    li.append(t)

In [10]:
li

[5, 4, 0, 8]

#### How to compute difference of differences between consequtive numbers of a series?

In [205]:
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])
ser

0     1
1     3
2     6
3    10
4    15
5    21
6    27
7    35
dtype: int64

In [176]:
print(ser.diff().tolist())

[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]


#### How to replace missing spaces in a string with the least frequent character?

In [211]:
y_str = 'dbc deb abed gade'
y_str

'dbc deb abed gade'

In [237]:
x = pd.Series(list(y_str))
y = np.where(x == ' ',x.value_counts().index[-1],x).tolist()
np.hstack(y)

array(['d', 'b', 'c', 'c', 'd', 'e', 'b', 'c', 'a', 'b', 'e', 'd', 'c',
       'g', 'a', 'd', 'e'], dtype='<U1')

#### How to import only specified columns from a csv file?

In [27]:
f = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', usecols=['crim', 'medv'])
print(df.head())

   0  1  2  3
0  0  1  2  3


#### How to change the order of columns of a dataframe?

In [54]:
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
df

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [69]:
def switch_columns(df, col1=None, col2=None):
    colnames = df.columns.tolist()
    i1, i2 = colnames.index(col1), colnames.index(col2)
    colnames[i2], colnames[i1] = colnames[i1], colnames[i2]
    return df[colnames]

In [71]:
df1 = switch_columns(df, 'a', 'c')
df1

Unnamed: 0,c,b,a,d,e
0,2,1,0,3,4
1,7,6,5,8,9
2,12,11,10,13,14
3,17,16,15,18,19


#### How to swap two rows of a dataframe?

In [104]:
np.random.seed(123)
df = pd.DataFrame(np.arange(25).reshape(5, -1))

df

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [105]:
np.random.seed(123)
def swap_rows(df, i1, i2):
    a, b = df.iloc[i1, :].copy(), df.iloc[i2, :].copy()
    df.iloc[i1, :], df.iloc[i2, :] = b, a
    return df

swap_rows(df, 1, 2)

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,10,11,12,13,14
2,5,6,7,8,9
3,15,16,17,18,19
4,20,21,22,23,24


#### How to create a column that contains the penultimate value in each row?

In [83]:
np.random.seed(2)
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,41,16,73,23,44,83,76,8,35,50
1,96,76,86,48,64,32,91,21,38,40
2,68,5,43,52,39,34,59,68,70,89
3,69,47,71,96,84,32,67,81,53,77
4,51,5,91,64,80,50,40,47,9,51
5,16,9,18,23,74,58,91,63,84,97
6,44,33,27,9,77,11,41,35,61,10
7,71,87,71,20,57,83,2,69,41,82


In [92]:
# using sort_values & iloc(not loc)
df.apply(lambda x:x.sort_values().iloc[-2],axis= 1)

0    76
1    91
2    70
3    84
4    80
5    91
6    61
7    83
dtype: int64

In [89]:
# using np.sort
df.apply(lambda x:np.sort(x)[-2],axis= 1)

0    76
1    91
2    70
3    84
4    80
5    91
6    61
7    83
dtype: int64

In [43]:
out = df.apply(lambda x: x.sort_values().unique()[-2], axis=1)
df['penultimate'] = out
print(df)

    0   1   2   3   4   5   6   7   8   9  penultimate
0  41  16  73  23  44  83  76   8  35  50           76
1  96  76  86  48  64  32  91  21  38  40           91
2  68   5  43  52  39  34  59  68  70  89           70
3  69  47  71  96  84  32  67  81  53  77           84
4  51   5  91  64  80  50  40  47   9  51           80
5  16   9  18  23  74  58  91  63  84  97           91
6  44  33  27   9  77  11  41  35  61  10           61
7  71  87  71  20  57  83   2  69  41  82           83


#### How to get the positions where values of two columns match?

In [44]:
df = pd.DataFrame({'fruit1': np.random.choice(['apple', 'orange', 'banana'], 10),
                    'fruit2': np.random.choice(['apple', 'orange', 'banana'], 10)})

# Solution
np.where(df.fruit1 == df.fruit2)

(array([], dtype=int64),)

In [130]:
np.argwhere(df.fruit1 == df.fruit2)

array([[5],
       [7],
       [8],
       [9]])

####  How to get the frequency of unique values in the entire dataframe?

In [132]:
df = pd.DataFrame(np.random.randint(1, 10, 20).reshape(-1, 4), columns = list('abcd'))
print(df)
# Solution
pd.value_counts(df.values.ravel())

   a  b  c  d
0  9  9  5  5
1  1  2  1  8
2  6  5  8  3
3  4  6  6  5
4  3  7  8  5


5    5
8    3
6    3
9    2
3    2
1    2
7    1
4    1
2    1
dtype: int64

# 100 pandas puzzles

https://github.com/ajcr/100-pandas-puzzles/blob/master/100-pandas-puzzles.ipynb

#### 27. A DataFrame has a column of groups 'grps' and and column of numbers 'vals'. For example:

In [204]:
df = pd.DataFrame({'grps': list('aaabbcaabcccbbc'), 
                   'vals': [12,345,3,1,45,14,4,52,54,23,235,21,57,3,87]})
df

Unnamed: 0,grps,vals
0,a,12
1,a,345
2,a,3
3,b,1
4,b,45
5,c,14
6,a,4
7,a,52
8,b,54
9,c,23


In [203]:
grp = df.sort_values(['grps','vals']).groupby('grps').cumcount() 
df[grp > 1].groupby('grps').sum()

  


Unnamed: 0_level_0,vals
grps,Unnamed: 1_level_1
a,409
b,156
c,345


#### 30. Consider a DataFrame containing rows and columns of purely numerical data. Create a list of the row-column index locations of the 3 largest values.

In [79]:
np.random.seed(2)
df = pd.DataFrame(np.random.randint(1,500,50).reshape(10,5))
df

Unnamed: 0,0,1,2,3,4
0,169,16,494,73,23
1,300,467,76,361,264
2,163,434,96,76,214
3,48,320,288,219,405
4,125,117,294,168,196
5,103,261,427,364,52
6,360,39,491,34,59
7,381,256,324,326,89
8,197,303,455,352,212
9,122,32,451,81,53


In [241]:
## column * row

df.unstack().sort_values()

1  0     16
4  0     23
1  9     32
3  6     34
1  6     39
0  3     48
4  5     52
   9     53
   6     59
3  0     73
   2     76
2  1     76
3  9     81
4  7     89
2  2     96
0  5    103
1  4    117
0  9    122
   4    125
   2    163
3  4    168
0  0    169
4  4    196
0  8    197
4  8    212
   2    214
3  3    219
1  7    256
   5    261
4  1    264
2  3    288
   4    294
0  1    300
1  8    303
   3    320
2  7    324
3  7    326
   8    352
0  6    360
3  1    361
   5    364
0  7    381
4  3    405
2  5    427
1  2    434
2  9    451
   8    455
1  1    467
2  6    491
   0    494
dtype: int64

In [146]:
df.unstack().sort_values()[-3:].index.tolist()

[(1, 1), (2, 6), (2, 0)]

#### 32. Implement a rolling mean over groups with window size 3, which ignores NaN value. For example consider the following DataFrame:

In [97]:
df = pd.DataFrame({'group_': list('aabbabbbabab'),
                       'value': [1, 2, 3, np.nan, 2, 3, 
                                 np.nan, 1, 7, 3, np.nan, 8]})

In [98]:
df

Unnamed: 0,group_,value
0,a,1.0
1,a,2.0
2,b,3.0
3,b,
4,a,2.0
5,b,3.0
6,b,
7,b,1.0
8,a,7.0
9,b,3.0


In [104]:
df.fillna(method='ffill').value.rolling(window = 3).sum()

0      NaN
1      NaN
2      6.0
3      8.0
4      8.0
5      8.0
6      8.0
7      7.0
8     11.0
9     11.0
10    13.0
11    14.0
Name: value, dtype: float64

In [291]:
pd.DataFrame(np.where(df.group_ == 'a',df.value.fillna(3),np.where(df.group_ == 'b',df.value.fillna(3.6),"")),columns=['filler']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
filler,1.0,2.0,3.0,3.6,2.0,3.0,3.6,1.0,7.0,3.0,3.0,8.0


#### 42. In the Airline column, you can see some extra puctuation and symbols have appeared around the airline names. Pull out just the airline name. E.g. '(British Airways. )' should become 'British Airways'.

In [292]:
df = pd.DataFrame({'From_To': ['LoNDon_paris', 'MAdrid_miLAN', 'londON_StockhOlm', 
                               'Budapest_PaRis', 'Brussels_londOn'],
              'FlightNumber': [10045, np.nan, 10065, np.nan, 10085],
              'RecentDelays': [[23, 47], [], [24, 43, 87], [13], [67, 32]],
                   'Airline': ['KLM(!)', '<Air France> (12)', '(British Airways. )', 
                               '12. Air France', '"Swiss Air"']})

In [293]:
df

Unnamed: 0,From_To,FlightNumber,RecentDelays,Airline
0,LoNDon_paris,10045.0,"[23, 47]",KLM(!)
1,MAdrid_miLAN,,[],<Air France> (12)
2,londON_StockhOlm,10065.0,"[24, 43, 87]",(British Airways. )
3,Budapest_PaRis,,[13],12. Air France
4,Brussels_londOn,10085.0,"[67, 32]","""Swiss Air"""


## ---------------------------------------- OTHER ---------------------------------------------------------------

#### Constructing pandas DataFrame from values in variables gives “ValueError: If using all scalar values, you must pass an index”

In [105]:
df = pd.DataFrame({'A': [2], 'B': [1]})
df

Unnamed: 0,A,B
0,2,1


-  Must pass index when passing values as scaler

In [108]:
df = pd.DataFrame({'A': 2, 'B': 1}, index=[0])
df

Unnamed: 0,A,B
0,2,1


#### Re-ordering columns in pandas dataframe based on column name

In [109]:
df

Unnamed: 0,A,B
0,2,1


In [128]:
df = pd.DataFrame(data = 1,index = [0],columns= list('eafvghtj'))
df

Unnamed: 0,e,a,f,v,g,h,t,j
0,1,1,1,1,1,1,1,1


In [129]:
df.sort_index(axis=1)

Unnamed: 0,a,e,f,g,h,j,t,v
0,1,1,1,1,1,1,1,1


#### Using partioning SQL concept in Python

In [148]:
df1 = pd.DataFrame( { 
    "Name" : ["Alice", "Bob", "Mallory", "Mallory", "Bob" , "Mallory"] , 
    "City" : ["Seattle", "Seattle", "Portland", "Seattle", "Seattle", "Portland"] } )

df1

Unnamed: 0,Name,City
0,Alice,Seattle
1,Bob,Seattle
2,Mallory,Portland
3,Mallory,Seattle
4,Bob,Seattle
5,Mallory,Portland


In [149]:
df1.groupby(['Name','City']).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,City
Name,City,Unnamed: 2_level_1,Unnamed: 3_level_1
Alice,Seattle,1,1
Bob,Seattle,1,1
Mallory,Portland,1,1
Mallory,Seattle,1,1


In [150]:
pd.DataFrame({'count' : df1.groupby( [ "Name", "City"] ).size()}).reset_index()

Unnamed: 0,Name,City,count
0,Alice,Seattle,1
1,Bob,Seattle,2
2,Mallory,Portland,2
3,Mallory,Seattle,1


#### concatinating based on column values

In [64]:
df1['concat'] = df1['Name'] + "-" + df1['City']
df1

Unnamed: 0,Name,City,concat
0,Alice,Seattle,Alice-Seattle
1,Bob,Seattle,Bob-Seattle
2,Mallory,Portland,Mallory-Portland
3,Mallory,Seattle,Mallory-Seattle
4,Bob,Seattle,Bob-Seattle
5,Mallory,Portland,Mallory-Portland


#### How to view all columns in a pandas dataframe

In [42]:
import pandas as pd
#pd.set_option('display.max_rows',25)
pd.set_option('display.max_columns',100 )
pd.set_option('display.width', 25)

In [45]:
pd.DataFrame(dict(zip(np.arange(100),list('a'*100))),index = np.arange(5))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a
1,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a
2,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a
3,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a
4,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a


#### Dropping rows with NULL vlaues

In [131]:
df = pd.DataFrame({'grps': list('aaabbcaabcccbbc'), 
                   'vals': [12,345,3,1,45,np.nan,4,52,np.nan,23,235,21,57,3,87]})
df

Unnamed: 0,grps,vals
0,a,12.0
1,a,345.0
2,a,3.0
3,b,1.0
4,b,45.0
5,c,
6,a,4.0
7,a,52.0
8,b,
9,c,23.0


In [152]:
df[df.isnull().any(axis = 1)]

Unnamed: 0,grps,vals
5,c,
8,b,


#### np.where` vs apply(lamdba if else)

In [None]:
ser = np.where(cust2.age> 50 , 'senior', np.where(cust2.age>30, 'young','teen'))
ser1=  pd.Series(ser)


cust2['dummy_np.where'] = ser

t = cust2.age.apply(lambda x: 'senior' if x > 50 else ( 'young' if x > 30 else 'teen'))
#rint(t)
cust2['dummy_lambda']  = t

cust2.head()

#### Mapped Aggregation

In [8]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns = ['key', 'data1', 'data2'])
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [10]:
df2 = df.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
df2.groupby(mapping).sum()

Unnamed: 0,data1,data2
consonant,12,19
vowel,3,8


#### AND vs &

In [32]:
True and False # work for only 1 argument at a time

False

In [53]:
ser  = pd.Series([1,0,1])
ser
(ser > 1) & (ser < 2) # works for an array of booleans

0    False
1    False
2    False
dtype: bool

In [52]:
try:
    (ser > 1) and (ser < 2)
except:
    print('error')

error


#### Creating a mapper for each inflection value

In [6]:
df = pd.DataFrame({'col':list('aaaaabbbbaaaabbbb')})
df

Unnamed: 0,col
0,a
1,a
2,a
3,a
4,a
5,b
6,b
7,b
8,b
9,a


In [9]:
df['mapper'] = df.ne(df.shift(1)).cumsum()

In [10]:
df

Unnamed: 0,col,mapper
0,a,1
1,a,1
2,a,1
3,a,1
4,a,1
5,b,2
6,b,2
7,b,2
8,b,2
9,a,3


#### Bootstraping in Pandas

In [140]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns = ['key', 'data1', 'data2'])
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [147]:
df.sample(n=10,replace=True)

Unnamed: 0,key,data1,data2
5,C,5,9
3,A,3,3
1,B,1,0
5,C,5,9
2,C,2,3
3,A,3,3
4,B,4,7
3,A,3,3
3,A,3,3
1,B,1,0


#### Iterating over rows - iterrows

In [14]:
df.iterrows()

<generator object DataFrame.iterrows at 0x7f2b9d3a6d58>

In [148]:
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [153]:
for x in df.iterrows():
    print(x)

(0, key      A
data1    0
data2    5
Name: 0, dtype: object)
(1, key      B
data1    1
data2    0
Name: 1, dtype: object)
(2, key      C
data1    2
data2    3
Name: 2, dtype: object)
(3, key      A
data1    3
data2    3
Name: 3, dtype: object)
(4, key      B
data1    4
data2    7
Name: 4, dtype: object)
(5, key      C
data1    5
data2    9
Name: 5, dtype: object)


In [149]:
for x,y in df.iterrows():
    print(df.data1[x],df.data1[y])

0 0
A    NaN
0    0.0
5    5.0
Name: data1, dtype: float64
1 1
B    NaN
1    1.0
0    0.0
Name: data1, dtype: float64
2 2
C    NaN
2    2.0
3    3.0
Name: data1, dtype: float64
3 3
A    NaN
3    3.0
3    3.0
Name: data1, dtype: float64
4 4
B    NaN
4    4.0
7    NaN
Name: data1, dtype: float64
5 5
C    NaN
5    5.0
9    NaN
Name: data1, dtype: float64


#### Rolling mean with a window of 3

In [162]:
rng = np.random.RandomState(0)

df = pd.DataFrame({'key': list('A'*3 + 'B'*4 + 'C'*3),
                   'data1': np.arange(10)})
df

Unnamed: 0,key,data1
0,A,0
1,A,1
2,A,2
3,B,3
4,B,4
5,B,5
6,B,6
7,C,7
8,C,8
9,C,9


In [168]:
df['data1'].rolling(window= 3).mean()

0    NaN
1    NaN
2    1.0
3    2.0
4    3.0
5    4.0
6    5.0
7    6.0
8    7.0
9    8.0
Name: data1, dtype: float64

#### Ranking

In [172]:
df.rank()

Unnamed: 0,key,data1
0,2.0,1.0
1,2.0,2.0
2,2.0,3.0
3,5.5,4.0
4,5.5,5.0
5,5.5,6.0
6,5.5,7.0
7,9.0,8.0
8,9.0,9.0
9,9.0,10.0


#### Converting a Dataframe to a dictionary/Json

In [3]:
rng = np.random.RandomState(0)

df = pd.DataFrame({'key': list('A'*3 + 'B'*4 + 'C'*3),
                   'data1': np.arange(10)})
df

Unnamed: 0,key,data1
0,A,0
1,A,1
2,A,2
3,B,3
4,B,4
5,B,5
6,B,6
7,C,7
8,C,8
9,C,9


In [5]:
df.to_dict()

{'key': {0: 'A',
  1: 'A',
  2: 'A',
  3: 'B',
  4: 'B',
  5: 'B',
  6: 'B',
  7: 'C',
  8: 'C',
  9: 'C'},
 'data1': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9}}

In [6]:
d = df.to_dict()
for x,y in d.items():
    print(x)

key
data1


In [7]:
d['key']

{0: 'A',
 1: 'A',
 2: 'A',
 3: 'B',
 4: 'B',
 5: 'B',
 6: 'B',
 7: 'C',
 8: 'C',
 9: 'C'}

#### Imputing multiple columns @ once

In [18]:
df = pd.DataFrame({'A':[1,2,np.nan],'B':[1,4,np.nan],'C':[1,5,np.nan]})
df

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,2.0,4.0,5.0
2,,,


In [17]:
df.mean()

A    1.5
B    2.5
C    3.0
dtype: float64

In [16]:
df.fillna(df.mean())

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,2.0,4.0,5.0
2,1.5,2.5,3.0


#### Numpy -  Append / Concatenate / vstack / hstack / column_stack

- Append takes in argument directly as array or list

In [16]:
np.append([[1, 2, 3], [4, 5, 6]], [[7, 8, 9]], axis=0)

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [17]:
a = [[1, 2, 3], [4, 5, 6]]
b= [[7, 8, 9]]

- Takes argument as tuple

In [58]:
np.vstack((a,b))

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [60]:
np.hstack(([1,2,3],[5,6,7]))

array([1, 2, 3, 5, 6, 7])

In [56]:
np.column_stack(([1,2,3],[5,6,7]))

array([[1, 5],
       [2, 6],
       [3, 7]])

#### Equals vs ==

In [10]:
df1 = pd.DataFrame({'A':[1,2,np.nan]})
df2 = pd.DataFrame({'B':[1,2,np.nan]})

In [13]:
try:
    df1 == df2
except:
    print('please use df1.equals(df2)')

please use df1.equals(df2)


In [11]:
df1.equals(df2)

False

#### Enumerate

In [137]:
for x in enumerate([3,45,78]):
    print(x)

(0, 3)
(1, 45)
(2, 78)


#### Stack & Unstack

In [195]:
rng = np.random.RandomState(0)

df = pd.DataFrame({'data2': np.arange(10)*2,
                   'data1': np.arange(10)},index = list('abcdefghij'))
df.head(2)

Unnamed: 0,data2,data1
a,0,0
b,2,1


In [187]:
df.unstack()

data2  a     0
       b     2
       c     4
       d     6
       e     8
       f    10
       g    12
       h    14
       i    16
       j    18
data1  a     0
       b     1
       c     2
       d     3
       e     4
       f     5
       g     6
       h     7
       i     8
       j     9
dtype: int32

In [188]:
df.stack()

a  data2     0
   data1     0
b  data2     2
   data1     1
c  data2     4
   data1     2
d  data2     6
   data1     3
e  data2     8
   data1     4
f  data2    10
   data1     5
g  data2    12
   data1     6
h  data2    14
   data1     7
i  data2    16
   data1     8
j  data2    18
   data1     9
dtype: int32

#### 17. Expand a Series of lists into a DataFrame - Kevin Markham trick

In [193]:
df = pd.DataFrame({'col_one':['a', 'b', 'c'], 'col_two':[[10, 40], [20, 50], [30, 60]]})
df

Unnamed: 0,col_one,col_two
0,a,"[10, 40]"
1,b,"[20, 50]"
2,c,"[30, 60]"


In [194]:
df_new = df.col_two.apply(pd.Series)
df_new

Unnamed: 0,0,1
0,10,40
1,20,50
2,30,60


In [205]:
a = pd.DataFrame(np.random.rand(6,4))
a

Unnamed: 0,0,1,2,3
0,0.18559,0.132092,0.009972,0.373041
1,0.010801,0.848462,0.243167,0.680027
2,0.67714,0.885827,0.153566,0.946694
3,0.293744,0.736601,0.475539,0.164867
4,0.827453,0.539698,0.354172,0.841998
5,0.031805,0.067735,0.628181,0.634944


In [206]:
a['k'] = list('abacab')

In [207]:
a

Unnamed: 0,0,1,2,3,k
0,0.18559,0.132092,0.009972,0.373041,a
1,0.010801,0.848462,0.243167,0.680027,b
2,0.67714,0.885827,0.153566,0.946694,a
3,0.293744,0.736601,0.475539,0.164867,c
4,0.827453,0.539698,0.354172,0.841998,a
5,0.031805,0.067735,0.628181,0.634944,b


In [211]:
a.groupby('k')[0].transform('sum') # works like partitiion

0    1.690183
1    0.042606
2    1.690183
3    0.293744
4    1.690183
5    0.042606
Name: 0, dtype: float64

#### Stack vs Unstack

In [217]:
df = pd.DataFrame(np.random.rand(4,5),columns=list('abcde'))
df

Unnamed: 0,a,b,c,d,e
0,0.242338,0.206619,0.424409,0.717935,0.851672
1,0.350724,0.363295,0.336293,0.406178,0.060308
2,0.128845,0.518525,0.883522,0.163339,0.728063
3,0.217907,0.834966,0.100676,0.365478,0.589097


In [222]:
df.corr()

Unnamed: 0,a,b,c,d,e
a,1.0,-0.349383,-0.594344,0.458142,-0.776067
b,-0.349383,1.0,-0.329541,-0.591253,-0.016446
c,-0.594344,-0.329541,1.0,-0.43734,0.326526
d,0.458142,-0.591253,-0.43734,1.0,0.204662
e,-0.776067,-0.016446,0.326526,0.204662,1.0


In [223]:
df1 = df.corr().stack()
df1

a  a    1.000000
   b   -0.349383
   c   -0.594344
   d    0.458142
   e   -0.776067
b  a   -0.349383
   b    1.000000
   c   -0.329541
   d   -0.591253
   e   -0.016446
c  a   -0.594344
   b   -0.329541
   c    1.000000
   d   -0.437340
   e    0.326526
d  a    0.458142
   b   -0.591253
   c   -0.437340
   d    1.000000
   e    0.204662
e  a   -0.776067
   b   -0.016446
   c    0.326526
   d    0.204662
   e    1.000000
dtype: float64

In [224]:
df1.unstack()

Unnamed: 0,a,b,c,d,e
a,1.0,-0.349383,-0.594344,0.458142,-0.776067
b,-0.349383,1.0,-0.329541,-0.591253,-0.016446
c,-0.594344,-0.329541,1.0,-0.43734,0.326526
d,0.458142,-0.591253,-0.43734,1.0,0.204662
e,-0.776067,-0.016446,0.326526,0.204662,1.0


#### Collections --> Counter

In [2]:
from collections import OrderedDict,Counter

Counter([1,2,3,2])

Counter({1: 1, 2: 2, 3: 1})

In [4]:
Counter('rohana')

Counter({'r': 1, 'o': 1, 'h': 1, 'a': 2, 'n': 1})