In [2]:
import numpy as np
from pandas import Series,DataFrame
import pandas as pd

In [3]:
# Build the first frame: a 'key' column with repeated labels plus a 0..5 counter
dframe1 = DataFrame(
    {
        'key': list('XZYZXX'),
        'data_set_1': np.arange(6),
    }
)

# Display it
dframe1

Unnamed: 0,key,data_set_1
0,X,0
1,Z,1
2,Y,2
3,Z,3
4,X,4
5,X,5


# merge data sets by linking rows by keys.

In [4]:
# Second frame: every key label appears exactly once
dframe2 = DataFrame(
    {
        'key': list('QYZ'),
        'data_set_2': [1, 2, 3],
    }
)

# Display it
dframe2

Unnamed: 0,key,data_set_2
0,Q,1
1,Y,2
2,Z,3


In [5]:
# Now we can merge the DataFrames; this is a "many-to-one" situation

# Merge will automatically choose overlapping columns to merge on
pd.merge(dframe1,dframe2)

#Note no overlapping 'X's

Unnamed: 0,key,data_set_1,data_set_2
0,Z,1,3
1,Z,3,3
2,Y,2,2


In [6]:
# We could have also specified which column to merge on
pd.merge(dframe1,dframe2,on='key')

Unnamed: 0,key,data_set_1,data_set_2
0,Z,1,3
1,Z,3,3
2,Y,2,2


In [7]:
# We can choose which DataFrame's keys to use, 
pd.merge(dframe1,dframe2,on='key',how='left')   #this will choose left (dframe1)

Unnamed: 0,key,data_set_1,data_set_2
0,X,0,
1,Z,1,3.0
2,Y,2,2.0
3,Z,3,3.0
4,X,4,
5,X,5,


In [8]:
# Choosing the one on the right (dframe2)
pd.merge(dframe1,dframe2,on='key',how='right')

Unnamed: 0,key,data_set_1,data_set_2
0,Z,1.0,3
1,Z,3.0,3
2,Y,2.0,2
3,Q,,1


In [11]:
#Choosing the "outer" method selects the union of both keys
pd.merge(dframe1,dframe2,on='key',how='outer')

Unnamed: 0,key,data_set_1,data_set_2
0,X,0.0,
1,X,4.0,
2,X,5.0,
3,Z,1.0,3.0
4,Z,3.0,3.0
5,Y,2.0,2.0
6,Q,,1.0


In [12]:
# Many-to-many merge: the same key value occurs more than once in BOTH frames,
# so every matching pair of rows appears in the result.

dframe3 = DataFrame(
    {'key': list('XXXYZZ'), 'data_set_3': range(6)}
)
dframe4 = DataFrame(
    {'key': list('YYXXZ'), 'data_set_4': range(5)}
)

# Display the merged result
pd.merge(dframe3, dframe4)

Unnamed: 0,key,data_set_3,data_set_4
0,X,0,2
1,X,0,3
2,X,1,2
3,X,1,3
4,X,2,2
5,X,2,3
6,Y,3,0
7,Y,3,1
8,Z,4,4
9,Z,5,4


In [3]:
# Two frames for merging a column against an index:
# df_left keeps its keys in a column, df_right keeps them in its index.

df_left = DataFrame(
    {'key': list('XYZXY'), 'data': range(5)}
)
df_right = DataFrame(
    {'group_data': [10, 20]},
    index=list('XY'),
)

In [5]:
#Show
df_left

Unnamed: 0,key,data
0,X,0
1,Y,1
2,Z,2
3,X,3
4,Y,4


In [6]:
#Show
df_right

Unnamed: 0,group_data
X,10
Y,20


In [4]:
#Now merge, we'll use the key for the left Dframe, and the index for the right
pd.merge(df_left,df_right,left_on='key',right_index=True)

Unnamed: 0,key,data,group_data
0,X,0,10
3,X,3,10
1,Y,1,20
4,Y,4,20


In [13]:
# We can also get a union by using outer
# NOTE(review): the output saved with this cell is `KeyError: 'key'`, although the
# inner merge with the same left_on/right_index arguments succeeds earlier in the
# notebook. The cause is not visible here (possibly stale kernel state, since the
# execution counts are out of order) -- re-run from a fresh kernel to confirm.
pd.merge(df_left,df_right,left_on='key',right_index=True,how='outer')

KeyError: 'key'

In [8]:
# A more involved case: the right-hand frame uses a hierarchical (two-level) index.
df_left_hr = DataFrame(
    {
        'key1': ['SF'] * 3 + ['LA'] * 2,
        'key2': [10, 20, 30, 20, 30],
        'data_set': np.arange(5.),
    }
)
df_right_hr = DataFrame(
    np.arange(10).reshape(5, 2),
    index=[
        ['LA'] * 2 + ['SF'] * 3,
        [20, 10, 10, 10, 20],
    ],
    columns=['col_1', 'col_2'],
)

In [9]:
df_left_hr


Unnamed: 0,key1,key2,data_set
0,SF,10,0.0
1,SF,20,1.0
2,SF,30,2.0
3,LA,20,3.0
4,LA,30,4.0


In [10]:
df_right_hr

Unnamed: 0,Unnamed: 1,col_1,col_2
LA,20,0,1
LA,10,2,3
SF,10,4,5
SF,10,6,7
SF,20,8,9


In [14]:
# Now we can merge the left by using keys and the right by its index
pd.merge(df_left_hr,df_right_hr,left_on=['key1','key2'],right_index=True)

Unnamed: 0,key1,key2,data_set,col_1,col_2
0,SF,10,0.0,4,5
0,SF,10,0.0,6,7
1,SF,20,1.0,8,9
3,LA,20,3.0,0,1


In [15]:
# We can also use .join()

# Shown on our first two DataFrames
# NOTE: the output below has NaN in every group_data row -- none of df_left's
# index labels (0-4) match df_right's index labels ('X', 'Y').
# NOTE(review): presumably the intent was to match on the key column, i.e.
# df_left.join(df_right, on='key') -- confirm.
df_left.join(df_right)

Unnamed: 0,key,data,group_data
0,X,0,
1,Y,1,
2,Z,2,
3,X,3,
4,Y,4,


# Concatenating along an axis

In [27]:
# Create a matrix 
arr1 = np.arange(9).reshape((3,3))

In [25]:
# Show
arr1

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [18]:
# Concatenate along axis 1
np.concatenate([arr1,arr1],axis=1)

array([[0, 1, 2, 0, 1, 2],
       [3, 4, 5, 3, 4, 5],
       [6, 7, 8, 6, 7, 8]])

In [30]:
# Concatenate along axis 0 (the default when no axis is given)
np.concatenate([arr1,arr1])

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8],
       [0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [31]:
# Two Series whose index labels do not overlap
ser1 = Series([0, 1, 2], index=list('TUV'))
ser2 = Series([3, 4], index=list('XY'))

# Stack them end to end; concat works along axis=0 unless told otherwise
pd.concat([ser1, ser2])

T    0
U    1
V    2
X    3
Y    4
dtype: int64

In [33]:
# Now passing along another axis will produce a DataFrame
pd.concat([ser1,ser2],axis=1,sort=False)

Unnamed: 0,0,1
T,0.0,
U,1.0,
V,2.0,
X,,3.0
Y,,4.0


In [34]:
# We can keep only specific row labels in the result.
# concat's `join_axes` argument was deprecated in pandas 0.25 and removed in 1.0;
# reindexing the concatenated frame selects the same labels in the same order.
pd.concat([ser1,ser2],axis=1,sort=False).reindex(['U','V','Y'])

Unnamed: 0,0,1
U,1.0,
V,2.0,
Y,,4.0


In [35]:
# Let's say we wanted to label each input so the pieces can be told apart in the concatenation result

# We can do this with a hierarchical index, built from the keys argument
pd.concat([ser1,ser2],keys=['cat1','cat2'])

cat1  T    0
      U    1
      V    2
cat2  X    3
      Y    4
dtype: int64

In [3]:
#Lastly, everything works similarly in DataFrames

dframe1 = DataFrame(np.random.randn(4,3), columns=['X', 'Y', 'Z'])
dframe2 = DataFrame(np.random.randn(3, 3), columns=['Y', 'Q', 'X'])

In [4]:
#Concat on DataFrame
pd.concat([dframe1,dframe2],sort=True)

Unnamed: 0,Q,X,Y,Z
0,,0.450764,0.875644,0.552891
1,,0.953899,0.887655,-1.112611
2,,-1.290645,0.456734,-1.243554
3,,-1.574505,-0.709658,-2.040342
0,-1.255287,0.075055,0.210953,
1,-0.633567,-1.019815,-0.56915,
2,-2.617753,-1.696945,0.272811,


In [5]:

#If we don't care about the index info and just want to make a complete DataFrame, just use ignore_index
pd.concat([dframe1,dframe2],sort=False,ignore_index=True)

Unnamed: 0,X,Y,Z,Q
0,0.450764,0.875644,0.552891,
1,0.953899,0.887655,-1.112611,
2,-1.290645,0.456734,-1.243554,
3,-1.574505,-0.709658,-2.040342,
4,0.075055,0.210953,,-1.255287
5,-1.019815,-0.56915,,-0.633567
6,-1.696945,0.272811,,-2.617753


# Combining dataframe

In [6]:
# Let's make some Series to work with

# First Series: every other value is missing
ser1 = Series([2,np.nan,4,np.nan,6,np.nan],
           index=['Q','R','S','T','U','V'])

# Second Series: 0.0 .. 5.0 on the same labels as ser1
ser2 = Series(np.arange(len(ser1), dtype=np.float64),
           index=['Q','R','S','T','U','V'])

# Blank out the last element. Use .iloc for positional access: plain ser2[-1] on a
# string-labelled Series relies on the integer-as-position fallback, which newer
# pandas deprecates.
ser2.iloc[-1] = np.nan

In [7]:
ser1

Q    2.0
R    NaN
S    4.0
T    NaN
U    6.0
V    NaN
dtype: float64

In [8]:
ser2

Q    0.0
R    1.0
S    2.0
T    3.0
U    4.0
V    NaN
dtype: float64

In [9]:
# Now let's get a series that takes the value from ser2 wherever ser1 is NaN, and otherwise keeps the value from ser1
Series(np.where(pd.isnull(ser1),ser2,ser1),index=ser1.index)

Q    2.0
R    1.0
S    4.0
T    3.0
U    6.0
V    NaN
dtype: float64

In [10]:
#Now we can do the same thing simply by using combine_first with pandas
ser1.combine_first(ser2) #This combines the Series values, choosing the values of the calling Series first, unless its a NAN

Q    2.0
R    1.0
S    4.0
T    3.0
U    6.0
V    NaN
dtype: float64

In [11]:
# Two frames with gaps: odd values in one, even values in the other.
# dframe_evens has one more row than dframe_odds and no 'Z' column.
dframe_odds = DataFrame(
    {
        'X': [1.0, np.nan, 3.0, np.nan],
        'Y': [np.nan, 5.0, np.nan, 7.0],
        'Z': [np.nan, 9.0, np.nan, 11.0],
    }
)
dframe_evens = DataFrame(
    {
        'X': [2.0, 4.0, np.nan, 6.0, 8.0],
        'Y': [np.nan, 10.0, 12.0, 14.0, 16.0],
    }
)

In [12]:
#Now lets combine using odds values first, unless theres a NAN, then put the evens values
dframe_odds.combine_first(dframe_evens)

Unnamed: 0,X,Y,Z
0,1.0,,
1,4.0,5.0,9.0
2,3.0,12.0,
3,6.0,7.0,11.0
4,8.0,16.0,


# Reshaping DataFrames!


In [13]:
#Let's see how stack and unstack work

# Create DataFrame
dframe1 = DataFrame(np.arange(8).reshape((2, 4)),
                 index=pd.Index(['LA', 'SF'], name='city'),
                 columns=pd.Index(['A', 'B', 'C','D'], name='letter'))
#Show
dframe1

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LA,0,1,2,3
SF,4,5,6,7


In [14]:
# Use stack to pivot the columns into the rows
dframe_st = dframe1.stack()

#Show
dframe_st

city  letter
LA    A         0
      B         1
      C         2
      D         3
SF    A         4
      B         5
      C         6
      D         7
dtype: int32

In [15]:
#We can always rearrange back into a DataFrame
dframe_st.unstack()


letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LA,0,1,2,3
SF,4,5,6,7


In [16]:
#We can choose which level to unstack by
dframe_st.unstack(0)

city,LA,SF
letter,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,4
B,1,5
C,2,6
D,3,7


In [17]:
# Also by which name to unstack by
dframe_st.unstack('letter')

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LA,0,1,2,3
SF,4,5,6,7


In [18]:
# How do stack and unstack deal with NaN?

# Two Series whose labels only partly overlap
ser1 = Series([0, 1, 2], index=list('QXY'))
ser2 = Series([4, 5, 6], index=list('XYZ'))

# Concatenate them under the outer keys 'Alpha' and 'Beta'.
# Despite its name, `dframe` is a Series with a two-level index at this point.
dframe = pd.concat(
    [ser1, ser2],
    keys=['Alpha', 'Beta'],
)

In [19]:
dframe

Alpha  Q    0
       X    1
       Y    2
Beta   X    4
       Y    5
       Z    6
dtype: int64

In [20]:
# Unstack the concatenated Series into a DataFrame
dframe.unstack()

Unnamed: 0,Q,X,Y,Z
Alpha,0.0,1.0,2.0,
Beta,,4.0,5.0,6.0


In [21]:
# Now stack will filter out NAN by default
dframe.unstack().stack()

Alpha  Q    0.0
       X    1.0
       Y    2.0
Beta   X    4.0
       Y    5.0
       Z    6.0
dtype: float64

In [22]:
# IF we dont want this we can set it to False
dframe.unstack().stack(dropna=False)

Alpha  Q    0.0
       X    1.0
       Y    2.0
       Z    NaN
Beta   Q    NaN
       X    4.0
       Y    5.0
       Z    6.0
dtype: float64

# Pivoting

In [23]:
# Let's create some data to play with:

# Note: It is not necessary to understand how this dataset was made to understand this Lecture.

def unpivot(frame):
    """Reshape a wide DataFrame into long ("stacked") form.

    Parameters
    ----------
    frame : DataFrame
        Wide table; each column becomes one ``variable``.

    Returns
    -------
    DataFrame
        Columns ``date`` (the index label), ``variable`` (the column label)
        and ``value``, with one row per cell of ``frame``. All rows of the
        first column come first, then the second column, and so on.
    """
    N, K = frame.shape

    data = {
        # Flatten column by column ('F' = column-major order)
        'value' : frame.values.ravel('F'),
        # Each column label repeated once per row
        'variable' : np.asarray(frame.columns).repeat(N),
        # The whole index repeated once per column
        'date' : np.tile(np.asarray(frame.index), K),
    }

    # Return the DataFrame with a fixed column order
    return DataFrame(data, columns=['date', 'variable', 'value'])

# Build a small wide frame: 3 business days x 4 variables of random values.
# (pandas.util.testing / makeTimeDataFrame, used here originally, are deprecated
# and no longer available in current pandas, so the frame is constructed directly.)
dframe = unpivot(
    DataFrame(np.random.randn(3, 4),
              index=pd.date_range('2000-01-03', periods=3, freq='B'),
              columns=['A', 'B', 'C', 'D'])
)
# Show the "stacked" data, note how there are multiple variables and values for the dates
dframe

Unnamed: 0,date,variable,value
0,2000-01-03,A,0.077195
1,2000-01-04,A,-0.545362
2,2000-01-05,A,0.011301
3,2000-01-03,B,0.81329
4,2000-01-04,B,-0.976151
5,2000-01-05,B,2.028537
6,2000-01-03,C,0.46039
7,2000-01-04,C,0.302831
8,2000-01-05,C,-0.903114
9,2000-01-03,D,1.047712


In [24]:
# Now let's pivot the data

# index= gives the row labels, columns= the column labels, and values= the column
# whose entries fill the table. Keywords are used because newer pandas no longer
# accepts these arguments positionally.
dframe_piv = dframe.pivot(index='date', columns='variable', values='value')

#Show
dframe_piv

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,0.077195,0.81329,0.46039,1.047712
2000-01-04,-0.545362,-0.976151,0.302831,0.2161
2000-01-05,0.011301,2.028537,-0.903114,-0.138047


# Duplicates in DataFrames!

In [25]:
# A small frame in which some rows repeat

dframe = DataFrame(
    {
        'key1': ['A', 'A', 'B', 'B', 'B'],
        'key2': [2] * 3 + [3] * 2,
    }
)

# Display it
dframe

Unnamed: 0,key1,key2
0,A,2
1,A,2
2,B,2
3,B,3
4,B,3


In [26]:
#Lets get a dataframe with duplicates

dframe = DataFrame({'key1': ['A'] * 2 + ['B'] * 3,
                  'key2': [2, 2, 2, 3, 3]})

#Show
dframe

Unnamed: 0,key1,key2
0,A,2
1,A,2
2,B,2
3,B,3
4,B,3


In [27]:
#We can use duplicated to find duplicates
dframe.duplicated()

0    False
1     True
2    False
3    False
4     True
dtype: bool

In [28]:
# We can also drop duplicates like this:
dframe.drop_duplicates()

Unnamed: 0,key1,key2
0,A,2
2,B,2
3,B,3


In [32]:
#You can filter which duplicates to drop by a single column
dframe.drop_duplicates(['key1'])

Unnamed: 0,key1,key2
0,A,2
2,B,2


# Mapping

In [33]:
# Let's create a dframe to work with (Highest elevation cities in USA)
dframe = DataFrame({'city':['Alma','Brian Head','Fox Park'],
                    'altitude':[3158,3000,2762]})

#Show
dframe

Unnamed: 0,city,altitude
0,Alma,3158
1,Brian Head,3000
2,Fox Park,2762


In [34]:
#Now let's say we wanted to add a column for the States, we can do that with a mapping.
state_map={'Alma':'Colorado','Brian Head':'Utah','Fox Park':'Wyoming'}

In [35]:
# Now we can map that data to our current dframe
dframe['state'] = dframe['city'].map(state_map)

In [36]:
#Show result
dframe

Unnamed: 0,city,altitude,state
0,Alma,3158,Colorado
1,Brian Head,3000,Utah
2,Fox Park,2762,Wyoming


# Replacing Values!

In [37]:
# Lets make  Series
ser1 = Series([1,2,3,4,1,2,3,4])
#Show
ser1

0    1
1    2
2    3
3    4
4    1
5    2
6    3
7    4
dtype: int64

In [38]:
# Using replace we can select --> .replace(value to be replaced, new_value)
ser1.replace(1,np.nan)

0    NaN
1    2.0
2    3.0
3    4.0
4    NaN
5    2.0
6    3.0
7    4.0
dtype: float64

In [39]:
#Can also input lists
ser1.replace([1,4],[100,400])

0    100
1      2
2      3
3    400
4    100
5      2
6      3
7    400
dtype: int64

In [40]:
#Can also input dictionary
ser1.replace({4:np.nan})

0    1.0
1    2.0
2    3.0
3    NaN
4    1.0
5    2.0
6    3.0
7    NaN
dtype: float64

# Renaming

In [41]:
# Making a DataFrame
dframe= DataFrame(np.arange(12).reshape((3, 4)),
                 index=['NY', 'LA', 'SF'],
                 columns=['A', 'B', 'C', 'D'])

#Show
dframe

Unnamed: 0,A,B,C,D
NY,0,1,2,3
LA,4,5,6,7
SF,8,9,10,11


In [42]:
# Just like a Series, axis indexes can also use map

#Let's use map to lowercase the city initials
dframe.index.map(str.lower)

Index(['ny', 'la', 'sf'], dtype='object')

In [43]:
# If you want to assign this to the actual index, you can use index
dframe.index = dframe.index.map(str.lower)
#Show
dframe

Unnamed: 0,A,B,C,D
ny,0,1,2,3
la,4,5,6,7
sf,8,9,10,11


In [44]:
# Use rename if you want to create a transformed version without modifying the original!

#str.title capitalizes the first letter of each index label; str.lower lowercases the column names
dframe.rename(index=str.title, columns=str.lower)

Unnamed: 0,a,b,c,d
Ny,0,1,2,3
La,4,5,6,7
Sf,8,9,10,11


In [46]:
# We can also use rename to insert dictionaries providing new values for indexes or columns!
dframe.rename(index={'ny': 'NEW YORK'},
            columns={'A': 'ALPHA'})

Unnamed: 0,ALPHA,B,C,D
NEW YORK,0,1,2,3
la,4,5,6,7
sf,8,9,10,11


In [47]:
# If you would like to actually edit the data set in place, set inplace=True
dframe.rename(index={'ny': 'NEW YORK'}, inplace=True)
dframe

Unnamed: 0,A,B,C,D
NEW YORK,0,1,2,3
la,4,5,6,7
sf,8,9,10,11


# Binning!

In [49]:
# Sample years to sort into bins
years = [1990, 1991, 1992, 2008, 2012, 2015, 1987, 1969, 2013, 2008, 1999]

# Bin edges, one per decade boundary
decade_bins = list(range(1960, 2021, 10))

# cut assigns every year to its bin and returns a Categorical
decade_cat = pd.cut(years, bins=decade_bins)

In [51]:
#Show
decade_cat # shows the range in which these years fall

[(1980, 1990], (1990, 2000], (1990, 2000], (2000, 2010], (2010, 2020], ..., (1980, 1990], (1960, 1970], (2010, 2020], (2000, 2010], (1990, 2000]]
Length: 11
Categories (6, interval[int64]): [(1960, 1970] < (1970, 1980] < (1980, 1990] < (1990, 2000] < (2000, 2010] < (2010, 2020]]

In [52]:
# We can check the categories using .categories
decade_cat.categories

IntervalIndex([(1960, 1970], (1970, 1980], (1980, 1990], (1990, 2000], (2000, 2010], (2010, 2020]]
              closed='right',
              dtype='interval[int64]')

In [53]:
# Then we can check the value counts in each category.
# The top-level pd.value_counts helper is deprecated in newer pandas; wrapping the
# Categorical in a Series and calling .value_counts() gives the same counts.
Series(decade_cat).value_counts()

(2010, 2020]    3
(1990, 2000]    3
(2000, 2010]    2
(1980, 1990]    2
(1960, 1970]    1
(1970, 1980]    0
dtype: int64

In [54]:
# We can also pass a number of bins to cut instead of explicit bin edges.

#For instance, if we just wanted to make two bins, evenly spaced based on max and min year, with the bin labels shown to 1 decimal place
pd.cut(years,2,precision=1)

[(1969.0, 1992.0], (1969.0, 1992.0], (1969.0, 1992.0], (1992.0, 2015.0], (1992.0, 2015.0], ..., (1969.0, 1992.0], (1969.0, 1992.0], (1992.0, 2015.0], (1992.0, 2015.0], (1992.0, 2015.0]]
Length: 11
Categories (2, interval[float64]): [(1969.0, 1992.0] < (1992.0, 2015.0]]

In [55]:
# Thats about it for binning basics
# One last thing to note, just like in standard math notation, when setting up bins:
# () means open, while [] means closed/inclusive

# Finding Outliers and Describing Data!

In [56]:
# Let's see how we would find outliers in a dataset

# First we'll seed the numpy generator
np.random.seed(12345)

#Next we'll create the dataframe
dframe = DataFrame(np.random.randn(1000,4))
#Show preview
dframe.head()

Unnamed: 0,0,1,2,3
0,-0.204708,0.478943,-0.519439,-0.55573
1,1.965781,1.393406,0.092908,0.281746
2,0.769023,1.246435,1.007189,-1.296221
3,0.274992,0.228913,1.352917,0.886429
4,-2.001637,-0.371843,1.669025,-0.43857


In [57]:
# Lets describe the data
dframe.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [58]:
# Lets select the first column
col = dframe[0]

In [59]:
# Now we can check which values in the column have an absolute value greater than 3, for instance.
col[np.abs(col)>3]

523   -3.428254
900    3.366626
Name: 0, dtype: float64

In [67]:
# So we now know in column[0], rows 523 and 900 have values with abs > 3

#How about all the columns?

# We can use the "any" method: keep each row where at least one column has |value| > 3.
# axis is passed by keyword because newer pandas no longer accepts it positionally.
dframe[(np.abs(dframe)>3).any(axis=1)]

Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.56523,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
400,0.146326,0.508391,-0.196713,-3.745356
499,-0.293333,-0.242459,-3.05699,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


In [68]:
# WE could also possibly cap the data at 3

dframe[np.abs(dframe)>3] = np.sign(dframe) *3
dframe.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067623,0.068473,0.025153,-0.002081
std,0.995485,0.990253,1.003977,0.989736
min,-3.0,-3.0,-3.0,-3.0
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.0,2.653656,3.0,3.0


# Permutation!

In [69]:
# A Series, or the rows of a DataFrame, can be put into a random order (permuted)

# A 4x4 frame holding 0..15 to experiment with
dframe = DataFrame(np.arange(16).reshape(4, 4))

# Random ordering of the positions 0, 1, 2, 3
blender = np.random.permutation(4)

blender

array([1, 3, 2, 0])

In [70]:
dframe

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [71]:
# Now permutate the dframe based on the blender
dframe.take(blender)

Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
2,8,9,10,11
0,0,1,2,3


In [73]:
# What about drawing WITH replacement?
# Picture a box holding 3 marbles labeled 1, 2 and 3
box = np.array([1, 2, 3])

# Draw 10 random positions into the box; randint may return the same position more than once
shaker = np.random.randint(low=0, high=len(box), size=10)

# Inspect the drawn positions
shaker

array([2, 0, 2, 0, 0, 2, 1, 1, 2, 1])

In [75]:
#Now let's grab from the box
hand_grabs = box.take(shaker)

#show
hand_grabs

array([3, 1, 3, 1, 1, 3, 2, 2, 3, 2])