In [1]:
import numpy as np
import pandas as pd

In [2]:
from numpy.random import randn

In [6]:
randn(3,3) # Review: draw a 3x3 array of samples from the standard normal distribution

array([[-0.13496359,  1.05098755,  0.99078001],
       [ 2.14060822,  0.62909772,  0.33036848],
       [ 0.27548248, -0.25036567, -0.80958185]])

In [39]:
# Build a DataFrame from a 3x3 block of random values, giving explicit labels
# to the rows (index) and to the columns.
# Tip: press Shift+Tab inside pd.DataFrame(...) to see all of its parameters.
df = pd.DataFrame(
    data=randn(3, 3),
    index=["A", "B", "C"],
    columns=["Column1", "Column2", "Column3"],
)
df

Unnamed: 0,Column1,Column2,Column3
A,-0.59053,1.4672,-0.85336
B,0.991193,-0.119085,0.67948
C,-0.053856,-0.185107,0.098398


#### Note: Don't forget, "A", "B", "C" are the index (row) labels, while "Column1", "Column2", "Column3" are the column labels.

***
#### Let's select the values in "Column1":

In [40]:
df["Column1"] # Indexing with a single column label returns that column's values, labelled by the index

A   -0.590530
B    0.991193
C   -0.053856
Name: Column1, dtype: float64

In [41]:
type(df["Column1"]) # As you can see, its type is Series; a DataFrame is essentially a collection of Series that share the same index

pandas.core.series.Series

***
#### Let's select the values in row "A":

In [42]:
# Rows cannot be selected with plain square brackets the way columns were; use .loc with the row label instead:
df.loc["A"] # loc means "location"; it selects by label

Column1   -0.59053
Column2    1.46720
Column3   -0.85336
Name: A, dtype: float64

#### As you can see, the column names now act as the index of the returned Series.

In [43]:
type(df.loc["A"]) # A row selected with .loc is also a Series

pandas.core.series.Series

***
#### What if we want only the values in "Column1" and "Column3"?

In [44]:
df[["Column1","Column3"]] # The inner square brackets form a list of column labels; selecting with a list returns a DataFrame

Unnamed: 0,Column1,Column3
A,-0.59053,-0.85336
B,0.991193,0.67948
C,-0.053856,0.098398


In [45]:
# "Column4" does not exist yet, so looking it up raises a KeyError.
# The error is the point of this cell, so catch it and print it: that way the
# notebook still runs to the end with "Restart Kernel -> Run All".
try:
    df["Column4"]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'Column4'

***
## Adding Columns:

In [46]:
df # Our DataFrame currently has 3 columns

Unnamed: 0,Column1,Column2,Column3
A,-0.59053,1.4672,-0.85336
B,0.991193,-0.119085,0.67948
C,-0.053856,-0.185107,0.098398


In [47]:
# Add a fourth column by assigning a new Series to a column label that does not exist yet.
# The Series carries the row labels "A", "B", "C" as its index.
df["Column4"] = pd.Series(data=randn(3), index=["A", "B", "C"])
df

Unnamed: 0,Column1,Column2,Column3,Column4
A,-0.59053,1.4672,-0.85336,-0.517339
B,0.991193,-0.119085,0.67948,0.741623
C,-0.053856,-0.185107,0.098398,1.426818


***
#### Let's create Column5 with the sum of the values in Column1, 2 and 3:

In [64]:
# Adding Series together works element by element (row by row),
# so Column5 holds the per-row sum of the first three columns.
df["Column5"] = (
    df["Column1"]
    + df["Column2"]
    + df["Column3"]
)
df

Unnamed: 0,Column1,Column2,Column3,Column4,Column5
A,-0.59053,1.4672,-0.85336,-0.517339,0.02331
B,0.991193,-0.119085,0.67948,0.741623,1.551588
C,-0.053856,-0.185107,0.098398,1.426818,-0.140566


***
## Deleting Columns:

In [65]:
# Calling drop() with only a label raises a KeyError whose message ends with "not found in axis".
# Check with Shift+Tab and you will see "axis = 0" as the default. Think of the DataFrame as a grid:
#   axis = 0 -> the row labels (the index)
#   axis = 1 -> the column labels
# "Column5" is a column label, so searching along axis = 0 cannot find it.
# The error is the point of this cell, so catch it and print it: that way the
# notebook still runs to the end with "Restart Kernel -> Run All".
try:
    df.drop("Column5")
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: "['Column5'] not found in axis"

In [66]:
df.drop("Column5", axis = 1) # With axis = 1, drop() searches the column labels and finds "Column5". The default axis = 0 searches the row labels, which is why the call without axis could not find it

Unnamed: 0,Column1,Column2,Column3,Column4
A,-0.59053,1.4672,-0.85336,-0.517339
B,0.991193,-0.119085,0.67948,0.741623
C,-0.053856,-0.185107,0.098398,1.426818


#### But there is a big problem here: when we display the dataframe again, we'll see that Column5 hasn't been deleted:

In [67]:
df # Column5 is still present: the drop() call above returned a new DataFrame and left df itself unchanged

Unnamed: 0,Column1,Column2,Column3,Column4,Column5
A,-0.59053,1.4672,-0.85336,-0.517339,0.02331
B,0.991193,-0.119085,0.67948,0.741623,1.551588
C,-0.053856,-0.185107,0.098398,1.426818,-0.140566


#### That's why there is one more parameter in the .drop() function. If you press Shift+Tab again, you'll see "inplace = False". We should set it to True to apply the changes to the dataframe itself.

In [68]:
# NOTE: this cell is not idempotent - running it a second time raises a KeyError because Column5 no longer exists.
df.drop("Column5", axis = 1, inplace = True)  # inplace = True modifies df itself instead of returning a new DataFrame, so Column5 is really removed now

In [69]:
df # Column5 is gone this time

Unnamed: 0,Column1,Column2,Column3,Column4
A,-0.59053,1.4672,-0.85336,-0.517339
B,0.991193,-0.119085,0.67948,0.741623
C,-0.053856,-0.185107,0.098398,1.426818


***
#### Nice tactics:

In [71]:
df.loc["A"] # We already did this (selection by row label); now let's do it another way

Column1   -0.590530
Column2    1.467200
Column3   -0.853360
Column4   -0.517339
Name: A, dtype: float64

In [72]:
 df.iloc[0] # iloc means "integer location": it selects by position instead of by label, so 0 is the first row, which is row "A"

Column1   -0.590530
Column2    1.467200
Column3   -0.853360
Column4   -0.517339
Name: A, dtype: float64

In [74]:
df.loc["A","Column1"] # A single value: the one at the intersection of row "A" and column "Column1"

-0.5905301551026048

In [84]:
df.loc[["A","B"],["Column1","Column2"]] # Lists of row labels and column labels select the sub-DataFrame where they intersect

Unnamed: 0,Column1,Column2
A,-0.59053,1.4672
B,0.991193,-0.119085


In [86]:
df.loc[["A","B"],"Column1"] # Two rows, one column: 1st value is the intersection of "A" and "Column1", 2nd value is the intersection of "B" and "Column1"

A   -0.590530
B    0.991193
Name: Column1, dtype: float64