### Some jottings in my notebook

#### DataFrames

In [1]:
import numpy as np
import pandas as pd

In [2]:
from numpy.random import randn

In [3]:
np.random.seed(101)  # Fix the seed so that the random numbers generated
# below are the same on every run of the notebook.

In [4]:
# Build a sample DataFrame.
# data    -> randn(5, 4): a 5-row by 4-column array of random draws
#            from the standard normal distribution
# index   -> the labels for the five rows
# columns -> the labels for the four columns
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [5]:
# Show the whole frame: five labelled rows (A-E) by four columns (W-Z).
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [8]:
# Using bracket indexing to grab columns.
# A single column label returns that column as a Series;
# here, every value in column W.
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [11]:
# Two lookups in a row: df['W'] gives the column W Series, and ['A']
# then picks the entry labelled A, i.e. the value at row A, column W.
# The same value can be reached in one step with df.loc['A', 'W']
# (row label first, then column label).
df['W']['A']

2.706849839399938

In [14]:
# Passing a list of labels returns a DataFrame holding just those columns.
df[['W','X']]

Unnamed: 0,W,X
A,2.70685,0.628133
B,0.651118,-0.319318
C,-2.018168,0.740122
D,0.188695,-0.758872
E,0.190794,1.978757


In [20]:
# Assigning to a label that does not exist yet creates a new column.
df['new'] = df['W'] + df['X']  # element-wise sum of W and X; the new
# column can be built from any subset of the existing columns.

In [21]:
# df now has a fifth column, 'new', holding W + X for each row.
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


In [22]:
# Drop the 'new' column.
df.drop('new',axis=1, inplace=True)
# drop() takes the label to remove and the axis that label lives on:
# axis=0 means a row label, axis=1 means a column label.
# With inplace=True the frame itself is modified and nothing is returned.
# With inplace=False (the default) df is left untouched and a new frame
# without the label is returned instead.
# Note: once 'new' is gone, re-running this cell raises a KeyError.



In [23]:
# Confirm that 'new' is gone: the in-place drop changed df itself.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [25]:
# Accessing rows
# There are two ways to do this: by label (.loc) or by position (.iloc).
df.loc['C']
# .loc (short for "location") selects by label: pass the row label in
# square brackets and the row comes back as a Series.

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [26]:
df.iloc[0]
# .iloc (short for "index location") selects by integer position
# rather than by label.
# Position 0 is the first row, which is the row labelled A.

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [27]:
df.loc['B','Y']  # Row label first, then column label: the value at row B, column Y

-0.8480769834036315

In [29]:
df.loc[['A', 'B'], ['W', 'Y']]
# Lists of labels select a sub-frame: rows A and B, columns W and Y.

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


In [30]:
# Comparing the frame with a scalar works element by element: the result
# is a DataFrame of the same shape holding True/False values.
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [31]:
# Conditional selection: index the frame with a boolean frame.
booldf = df > 0
df[booldf]
# The result keeps a value wherever the condition is True and
# puts NaN wherever it is False.

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [32]:
# Shorter way of writing the previous cell: build the boolean mask inline
# instead of storing it in a variable first.
df[df>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [33]:
# Conditional selection on a single column: a boolean Series keeps only
# the rows where it is True (whole rows are dropped, no NaN filling).
# Here that is every row whose W value is greater than 0.
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [34]:
# Only the rows whose Z value is negative (just row C in this data).
df[df['Z'] < 0]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [36]:
# Column 'X' for the rows where column 'W' is greater than 0.
# A single .loc call takes the row mask and the column label together.
# This avoids chained indexing, i.e. filtering the frame first and then
# indexing the intermediate result a second time.
df.loc[df['W'] > 0, 'X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [37]:
# Columns 'Y' and 'X' (in that order) for the rows where 'W' is greater
# than 0. The row mask and the list of column labels go into one .loc
# call rather than two chained bracket lookups.
df.loc[df['W'] > 0, ['Y', 'X']]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


In [40]:
# Combining more than one condition
df[(df['X'] > 0) & (df['Y'] > 1)]
# & keeps a row only when both conditions are True.
# Each comparison needs its own parentheses because & binds more tightly
# than the comparison operators; Python's `and` cannot be used here
# because it does not work element by element on a Series.

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [42]:
df[(df['X'] > 0) | (df['Y'] > 1)]
# | keeps a row when at least one of the conditions is True.

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


In [43]:
# Reset the index: the labels A-E move into a regular column named
# 'index' and the rows are renumbered 0-4.
df.reset_index() 
# This returns a new frame and leaves df unchanged. To make the change
# permanent, pass inplace=True (as with .drop) or assign the result back.

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [44]:
# Prepare the values for a new column: one state code per row.
newindex = 'CA NY WY OR CO'.split()
# split() with no arguments breaks the string on whitespace, giving
# a list of five values, which is stored in newindex.
newindex

['CA', 'NY', 'WY', 'OR', 'CO']

In [45]:
df['States'] = newindex 
# Create a new column, States, from the elements of the newindex list.
# This runs without error because the list has exactly one element
# per row of the DataFrame; a list of a different length would fail.

In [46]:
# df now carries the extra States column next to W-Z.
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [47]:
# A column can be promoted to be the index.
df.set_index('States')  # Returns a new frame indexed by States; df itself
# stays the same unless inplace=True is passed or the result is assigned back.

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509
