In [1]:
import numpy as np
import pandas as pd

In [2]:
from numpy.random import randn

In [10]:
np.random.seed(101)  # seed NumPy's global RNG so that later randn draws are repeatable
# NOTE(review): the execution counts jump from In [10] to In [44], so the frame below may not
# have been built from the first draw after seeding -- re-run this cell right before
# creating df if the exact values need to be reproduced.

In [44]:
# randn can be called by its bare name (rather than np.random.randn) because of the
# `from numpy.random import randn` import above.
# NOTE(review): the values depend on the global RNG state at the moment this cell runs.
df = pd.DataFrame(
    data=randn(5, 4),        # 5 rows x 4 columns of random draws
    index=list('ABCDE'),     # row labels
    columns=list('WXYZ'),    # column labels
)

In [45]:
df  # each column is a pandas Series; a DataFrame is a set of Series that share one index

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [46]:
df['W'] # bracket indexing with a single column name returns that column as a Series

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [47]:
type(df['W'])  # a single selected column is a pandas Series

pandas.core.series.Series

In [48]:
type(df)  # the whole table is a pandas DataFrame

pandas.core.frame.DataFrame

In [49]:
df.W # attribute-style access (like SQL's table.column_name) returns the same Series as df['W'].
     # Prefer df['W']: a column whose name matches a DataFrame attribute or method cannot be
     # reached this way, and the dot form is easy to misread as a method/attribute lookup.

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [50]:
df  # show the full frame again for reference

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [51]:
df[['W','Z']] # to select multiple columns, pass a list of column names inside the indexing brackets.
              # Note that requesting multiple columns returns a DataFrame, whereas
              # requesting a single column returns a Series.

Unnamed: 0,W,Z
A,0.302665,-1.159119
B,-0.134841,0.184502
C,0.807706,0.329646
D,-0.497104,0.484752
E,-0.116773,1.996652


In [52]:
#creating a new column

In [53]:
df['New'] = df['W'] + df['Y'] # assigning to a new column name adds that column to df in place (element-wise sum of W and Y)

In [54]:
df  # 'New' now appears as the last column

Unnamed: 0,W,X,Y,Z,New
A,0.302665,1.693723,-1.706086,-1.159119,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,0.032064
C,0.807706,0.07296,0.638787,0.329646,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,0.121354


In [55]:
#to remove a column use df.drop()

In [56]:
 df.drop('New',axis = 1) # axis defaults to 0 (rows), so axis = 1 must be given to drop a column.
                         # .drop() returns a new DataFrame here; df itself is not modified.

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [57]:
df # 'New' is still there: many pandas methods only modify the object itself when inplace=True is passed

Unnamed: 0,W,X,Y,Z,New
A,0.302665,1.693723,-1.706086,-1.159119,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,0.032064
C,0.807706,0.07296,0.638787,0.329646,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,0.121354


In [58]:
# inplace = True removes 'New' from df itself; the call returns None, so no output is shown.
# Not idempotent: running this cell a second time raises KeyError because 'New' no longer exists.
df.drop('New',axis = 1,inplace = True)

In [59]:
df  # 'New' is gone now that the drop was applied in place

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [60]:
#to drop rows

In [61]:
df.drop('E') # axis defaults to 0, so this drops the row labelled 'E'; it returns a new DataFrame (not in place)

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752


In [62]:
#dataframes are like fancy index markers on top of a numpy array

In [63]:
df.shape # (rows, columns). This is why rows are referred to as axis 0
         # and columns as axis 1: each axis number is its position in this tuple.

(5, 4)

In [64]:
df  # row 'E' is still present because the row drop above was not done in place

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [65]:
#selecting rows

In [67]:
df.loc['C'] # .loc is a label-based indexer, so it takes square brackets rather than call parentheses.
            # Selecting a single row label returns that row as a Series.

W    0.807706
X    0.072960
Y    0.638787
Z    0.329646
Name: C, dtype: float64

In [68]:
df.iloc[2] # .iloc selects by integer position (0-based), so 2 is the third row, i.e. the same row as 'C'

W    0.807706
X    0.072960
Y    0.638787
Z    0.329646
Name: C, dtype: float64

In [69]:
#selecting subsets

In [70]:
df.loc['B','Y']  # .loc[row_label, column_label] returns the single value at that cell

0.16690463609281317

In [73]:
df.loc[['A','B'],['W','Y']] # pass a list of the row labels wanted and a separate list of column labels.
                            # Notice that exactly the listed labels are returned -- nothing is cut off at the end,
                            # unlike positional slicing of a list or array, which stops one before the end index.

Unnamed: 0,W,Y
A,0.302665,-1.706086
B,-0.134841,0.166905
