In [1]:
import pandas as pd
import numpy as np

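Reproducibility: Once a seed is set with np.random.seed(101) (see the cell below), any NumPy function that draws from the global random state (such as numpy.random.rand(), numpy.random.randn() or numpy.random.randint()) generates the same sequence of numbers each time the code is run from a fresh kernel in the same order.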


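Fixed sequence: NumPy's generator is always pseudorandom: its output is computed deterministically from an internal state and is never "truly" random. Seeding with 101 simply fixes the starting state, and therefore the exact sequence of numbers that follows.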


Rules out run-to-run variation: With a fixed seed, every run produces identical numbers, which may not be suitable for some cases. For example, in machine learning we often want to try models with several different random weight initializations, so the seed should be varied (or left unset) across those trials.

By setting a seed value for the random number generator in NumPy, what we are essentially doing is:

Initializing the random number generator to a known state:

The seed acts as an input to the random number generator algorithm that produces a sequence of pseudorandom numbers.
Setting the seed initializes the internal state of the algorithm.
Making the random sequence predictable and reproducible:

With a fixed seed, the random number generator will always produce the same sequence of pseudorandom numbers each time we run the code.
This makes the sequence reproducible - useful for testing, experimentation, and ensuring consistency between runs.
Removing run-to-run variation:

Numbers from a pseudorandom generator are never truly random; with a fixed seed they are additionally identical on every run, following a predictable pattern determined by the algorithm and the seed.
This may not be suitable in cases where we explicitly rely on randomness across trials.

In [2]:
# randn draws samples from the standard normal distribution.
from numpy.random import randn
# Seed NumPy's global random state so the random draws below are repeatable
# when the notebook is run top to bottom from a fresh kernel.
np.random.seed(101)

In [20]:
# Build a 5x4 frame of standard-normal draws with labelled rows and columns.
# str.split() on a space-separated string is a compact way to write the labels.
row_labels = 'A B C D E'.split()
col_labels = 'W X Y Z'.split()
df = pd.DataFrame(data=randn(5, 4), index=row_labels, columns=col_labels)

In [21]:
df

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
B,-0.380104,-1.666059,-2.736995,1.522562
C,0.178009,-0.626805,-0.391089,1.743477
D,1.130018,0.897796,0.330866,-1.063049
E,-0.125381,-0.945588,2.029544,-1.046358


In [15]:
df1 = pd.DataFrame(randn(5,4),index=['A', 'B', 'C', 'D', 'E'],
                  columns=['W','X', 'Y', 'Z'])

# Builds a frame with the same row/column labels as the split()-based cell above;
# the values differ only because randn() is called again and draws new numbers.
# 'A B C D E'.split() is simply a quicker way to type the same list of labels.

In [16]:
df1

Unnamed: 0,W,X,Y,Z
A,0.327845,0.674485,-0.174057,0.78014
B,-0.383258,-0.409318,0.343539,0.196275
C,-0.982776,2.231555,-0.971393,-1.522333
D,1.133703,0.528187,0.393461,-0.630507
E,-1.39829,-0.219311,-0.045676,0.012421


## Selection and Indexing

In [22]:
df['W']

A    0.093628
B   -0.380104
C    0.178009
D    1.130018
E   -0.125381
Name: W, dtype: float64

In [23]:
df[['W','Z']]

Unnamed: 0,W,Z
A,0.093628,-1.908009
B,-0.380104,1.522562
C,0.178009,1.743477
D,1.130018,-1.063049
E,-0.125381,-1.046358


In [24]:
df.W

A    0.093628
B   -0.380104
C    0.178009
D    1.130018
E   -0.125381
Name: W, dtype: float64

### DataFrame Columns are just Series

In [25]:
type(df['W'])

pandas.core.series.Series

#### Creating a new column

In [39]:
# Assigning to a column label that does not exist yet creates the column;
# the two Series are added element-wise, aligned on the row index.
df['new'] = df['W'] + df['Y']

In [40]:
df

Unnamed: 0,W,X,Y,Z,new
A,0.093628,1.240813,-1.097693,-1.908009,-1.004065
B,-0.380104,-1.666059,-2.736995,1.522562,-3.117098
C,0.178009,-0.626805,-0.391089,1.743477,-0.21308
D,1.130018,0.897796,0.330866,-1.063049,1.460884
E,-0.125381,-0.945588,2.029544,-1.046358,1.904163


#### Numpy: axis=0 = rows, axis=1 = columns

#### Pandas: axis=0 = rows, axis=1 = columns

In [41]:
# axis='columns' (same as axis=1) makes drop() look up a column label.
# The result is a new frame; df itself is left untouched.
df.drop(labels='new', axis='columns')

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
B,-0.380104,-1.666059,-2.736995,1.522562
C,0.178009,-0.626805,-0.391089,1.743477
D,1.130018,0.897796,0.330866,-1.063049
E,-0.125381,-0.945588,2.029544,-1.046358


In [42]:
df
# drop() returned a new DataFrame, so df still has the 'new' column. The change
# only persists if the result is assigned back to df or inplace=True is passed.

Unnamed: 0,W,X,Y,Z,new
A,0.093628,1.240813,-1.097693,-1.908009,-1.004065
B,-0.380104,-1.666059,-2.736995,1.522562,-3.117098
C,0.178009,-0.626805,-0.391089,1.743477,-0.21308
D,1.130018,0.897796,0.330866,-1.063049,1.460884
E,-0.125381,-0.945588,2.029544,-1.046358,1.904163


In [43]:
# Persist the removal by rebinding df to the frame returned by drop().
# Reassignment is preferred over inplace=True: it keeps the call chainable
# and makes it explicit that df changes in this cell.
df = df.drop('new', axis=1)

In [44]:
df

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
B,-0.380104,-1.666059,-2.736995,1.522562
C,0.178009,-0.626805,-0.391089,1.743477
D,1.130018,0.897796,0.330866,-1.063049
E,-0.125381,-0.945588,2.029544,-1.046358


In [45]:
# axis='index' (same as axis=0) makes drop() look up a row label.
# As before, a new frame is returned and df is unchanged.
df.drop(labels='E', axis='index')

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
B,-0.380104,-1.666059,-2.736995,1.522562
C,0.178009,-0.626805,-0.391089,1.743477
D,1.130018,0.897796,0.330866,-1.063049


### Selecting Rows

In [50]:
# df[label] with a single label looks up a *column*. 'A' is a row label, so the
# lookup raises KeyError, which is why rows have to be selected with .loc.
# The error is caught and printed so the notebook still runs top to bottom.
try:
    df['A']
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'A'

In [47]:
df.loc['A']

W    0.093628
X    1.240813
Y   -1.097693
Z   -1.908009
Name: A, dtype: float64

In [51]:
df.loc['B']

W   -0.380104
X   -1.666059
Y   -2.736995
Z    1.522562
Name: B, dtype: float64

In [53]:
df.iloc[2]
# iloc selects by 0-based integer position, so this is the third row: same as df.loc['C']

W    0.178009
X   -0.626805
Y   -0.391089
Z    1.743477
Name: C, dtype: float64

### Selecting subset of rows and columns

In [54]:
df.loc['B','Y']

-2.7369945956467303

In [56]:
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,0.093628,-1.097693
B,-0.380104,-2.736995


## Conditional Selection

In [57]:
df

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
B,-0.380104,-1.666059,-2.736995,1.522562
C,0.178009,-0.626805,-0.391089,1.743477
D,1.130018,0.897796,0.330866,-1.063049
E,-0.125381,-0.945588,2.029544,-1.046358


In [58]:
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,False,False
B,False,False,False,True
C,True,False,False,True
D,True,True,True,False
E,False,False,True,False


In [59]:
df[df>0]

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,,
B,,,,1.522562
C,0.178009,,,1.743477
D,1.130018,0.897796,0.330866,
E,,,2.029544,


In [61]:
df[df['W']>0]
# Keeps only the rows where 'W' > 0; the values in the other columns don't matter

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
C,0.178009,-0.626805,-0.391089,1.743477
D,1.130018,0.897796,0.330866,-1.063049


In [63]:
# Give me 'Y' where 'W' > 0. A single .loc call with a row mask and a column
# label replaces chained indexing (df[mask]['Y']), which builds an intermediate
# frame and is unsafe if the same pattern is ever used for assignment.
df.loc[df['W'] > 0, 'Y']

A   -1.097693
C   -0.391089
D    0.330866
Name: Y, dtype: float64

In [65]:
# Rows where 'Y' > 0, restricted to columns 'Y' and 'X' (in that order),
# selected in one .loc call instead of chained indexing.
df.loc[df['Y'] > 0, ['Y', 'X']]

Unnamed: 0,Y,X
D,0.330866,0.897796
E,2.029544,-0.945588


In [68]:
df[(df['W']>1) & (df['Y']>0)]
# Wrap each condition in parentheses and combine them with '&' (element-wise AND).
# Python's 'and' does not work on Series, and without the parentheses '&' would
# bind more tightly than '>'.

Unnamed: 0,W,X,Y,Z
D,1.130018,0.897796,0.330866,-1.063049


## More Index Details