# ------------    dataframe basics    ------------
    # we mainly work with "dataframes" in pandas
    # Dataframes are built from Series objects

In [1]:
import numpy as np
import pandas as pd

In [2]:
# return random numbers from the "standard NORMAL distribution" (centered around 0)
from numpy.random import randn

In [3]:
# Seed NumPy's global random number generator so the "random" values are reproducible.
    # np.random.seed() fixes the starting state of the generator,
    # so with the seed set to 101 every run of this cell produces the same sequence of numbers.
np.random.seed(101)
# 5 rows x 4 columns = 20 draws from the standard normal distribution
rnd_20 = np.random.randn(5, 4)
rnd_20

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

___

## --------    Building a Dataframe    --------
    # similar to Series(), DataFrame() also takes "data" and "index" as arguments
    # there is also a "columns" argument for the column labels
    # note that: "index" holds the row labels


In [4]:
# rnd_20 is a 5x4 matrix (20 values), so the frame needs
    # exactly 5 row labels (the index) and 4 column labels
fd = pd.DataFrame(
    rnd_20,
    index="r1 r2 r3 r4 r5".split(),
    columns="c1 c2 c3 c4".split(),
)
fd

Unnamed: 0,c1,c2,c3,c4
r1,2.70685,0.628133,0.907969,0.503826
r2,0.651118,-0.319318,-0.848077,0.605965
r3,-2.018168,0.740122,0.528813,-0.589001
r4,0.188695,-0.758872,-0.933237,0.955057
r5,0.190794,1.978757,2.605967,0.683509


In [5]:
# Each column is a pandas Series, so "c1" is a Series, as are "c2", "c3" and "c4"
# and they all share a common index

# Basically, a DataFrame is a bunch of Series that share the same index
# we can select these Series objects

### --------    accessing Series from DataFrame    --------
#### method 1: specify the column name


In [6]:
fd['c3']

r1    0.907969
r2   -0.848077
r3    0.528813
r4   -0.933237
r5    2.605967
Name: c3, dtype: float64

In [7]:
# it is actually a Series, we can confirm it by type checking
type(fd['c3'])

pandas.core.series.Series

In [8]:
# but the type of 'fd' is DataFrame
type(fd)

pandas.core.frame.DataFrame

#### method 2: SQL-format, using "."
    # however it's better to use the []-format,
    # if we use '.', a column name can collide with a built-in attribute or method (e.g. last, iloc, loc)
    # in that case the built-in wins: fd.loc gives the indexer, not a column named "loc"
    # so the code gets confusing; [] always refers to the column

In [9]:
print(fd.c3)

r1    0.907969
r2   -0.848077
r3    0.528813
r4   -0.933237
r5    2.605967
Name: c3, dtype: float64


#### multiple columns: use list of column-names
    # we'll get a DataFrame instead of Series
    # single column: Series, multi-column: DataFrame

In [10]:
fd[['c2', 'c4']]

Unnamed: 0,c2,c4
r1,0.628133,0.503826
r2,-0.319318,0.605965
r3,0.740122,-0.589001
r4,-0.758872,0.955057
r5,1.978757,0.683509


In [11]:
type(fd[['c2', 'c4']])

pandas.core.frame.DataFrame

#### creating a new column

In [12]:
fd['new1'] = [113, 123, 133, 143, 153]

In [14]:
# or we can use operations
fd['new2'] = fd['c1'] + fd['c2']

In [15]:
fd

Unnamed: 0,c1,c2,c3,c4,new1,new2
r1,2.70685,0.628133,0.907969,0.503826,113,3.334983
r2,0.651118,-0.319318,-0.848077,0.605965,123,0.3318
r3,-2.018168,0.740122,0.528813,-0.589001,133,-1.278046
r4,0.188695,-0.758872,-0.933237,0.955057,143,-0.570177
r5,0.190794,1.978757,2.605967,0.683509,153,2.169552


#### DELETING a column


In [17]:
# axis: 0 is row, 1 is column
fd.drop('new1', axis=1)

Unnamed: 0,c1,c2,c3,c4,new2
r1,2.70685,0.628133,0.907969,0.503826,3.334983
r2,0.651118,-0.319318,-0.848077,0.605965,0.3318
r3,-2.018168,0.740122,0.528813,-0.589001,-1.278046
r4,0.188695,-0.758872,-0.933237,0.955057,-0.570177
r5,0.190794,1.978757,2.605967,0.683509,2.169552


In [19]:
# Note: the plain fd.drop('new1', axis=1) call above returned a NEW DataFrame and left 'fd' untouched,
    # i.e. the column was not deleted; this is done for safety, so information is not lost by accident
    # inplace=True applies the change to 'fd' itself (the column is really deleted)
    # careful: running this cell a second time raises a KeyError, because 'new1' is already gone
print(fd)
fd.drop('new1', axis=1, inplace=True)
print(fd)

          c1        c2        c3        c4  new1      new2
r1  2.706850  0.628133  0.907969  0.503826   113  3.334983
r2  0.651118 -0.319318 -0.848077  0.605965   123  0.331800
r3 -2.018168  0.740122  0.528813 -0.589001   133 -1.278046
r4  0.188695 -0.758872 -0.933237  0.955057   143 -0.570177
r5  0.190794  1.978757  2.605967  0.683509   153  2.169552
          c1        c2        c3        c4      new2
r1  2.706850  0.628133  0.907969  0.503826  3.334983
r2  0.651118 -0.319318 -0.848077  0.605965  0.331800
r3 -2.018168  0.740122  0.528813 -0.589001 -1.278046
r4  0.188695 -0.758872 -0.933237  0.955057 -0.570177
r5  0.190794  1.978757  2.605967  0.683509  2.169552


In [20]:
# drop a row
fd.drop('r2')   # by default axis=0
# fd.drop('r2', axis=0)

Unnamed: 0,c1,c2,c3,c4,new2
r1,2.70685,0.628133,0.907969,0.503826,3.334983
r3,-2.018168,0.740122,0.528813,-0.589001,-1.278046
r4,0.188695,-0.758872,-0.933237,0.955057,-0.570177
r5,0.190794,1.978757,2.605967,0.683509,2.169552


#### why axis=0 is row, and axis=1 is column?


In [22]:
# if we notice the shape of the DataFrame
fd.shape    # (row, column)
# (row, column) is a tuple, row at index 0, and column is index 1


(5, 5)

### --------  selecting rows: loc[] and iloc[]  --------
    # A single row of a DataFrame is also a "Series"
    # There are two ways to select rows in a DataFrame
        # loc[] and iloc[] are indexers (accessor attributes) of a DataFrame, which is why we use "[]" instead of "()"

#### loc['label_of_row']   :   label based index
    # notice we have to use "labels" of the row

In [5]:
fd.loc['r2']    # accessing row 2 

c1    0.651118
c2   -0.319318
c3   -0.848077
c4    0.605965
Name: r2, dtype: float64

#### iloc[index_of_row]    :   numeric based index
    # notice in this case we use numerical "index" even if our rows are indexed by strings

In [7]:
fd.iloc[1]    # accessing row 2 

c1    0.651118
c2   -0.319318
c3   -0.848077
c4    0.605965
Name: r2, dtype: float64

___

## ------------    subsetting: sub-sets of rows and columns    ------------
    # use loc[row, column]
    # "row", "column" can be labels or 'list of labels'

In [5]:
fd

Unnamed: 0,c1,c2,c3,c4
r1,2.70685,0.628133,0.907969,0.503826
r2,0.651118,-0.319318,-0.848077,0.605965
r3,-2.018168,0.740122,0.528813,-0.589001
r4,0.188695,-0.758872,-0.933237,0.955057
r5,0.190794,1.978757,2.605967,0.683509


In [6]:
# single value at row "r2" and at column "c3"
fd.loc['r2', 'c3']

-0.8480769834036315

In [7]:
# subset of values: use lists of "row", "column"
    #  getting values from r1, r3 rows and c2, c4 columns
fd.loc[['r1', 'r3'], ['c2', 'c4']]

Unnamed: 0,c2,c4
r1,0.628133,0.503826
r3,0.740122,-0.589001


___

## ------    conditional selections    ------
    # we can apply conditional selection using "[]" brackets notation in pandas
    # it is very similar to NumPy

In [5]:
# Boolean DataFrame, similar to a Boolean NumPy array
fd > 0

Unnamed: 0,c1,c2,c3,c4
r1,True,True,True,True
r2,True,False,False,True
r3,False,True,True,False
r4,True,False,False,True
r5,True,True,True,True


In [6]:
# use the Boolean DataFrame for selection
booLdf = fd > 0

In [7]:
# filter / selecting
    # values for True and NaN for False
fd[booLdf]

Unnamed: 0,c1,c2,c3,c4
r1,2.70685,0.628133,0.907969,0.503826
r2,0.651118,,,0.605965
r3,,0.740122,0.528813,
r4,0.188695,,,0.955057
r5,0.190794,1.978757,2.605967,0.683509


In [8]:
# we can do this in a single line
fd[fd > 0]

Unnamed: 0,c1,c2,c3,c4
r1,2.70685,0.628133,0.907969,0.503826
r2,0.651118,,,0.605965
r3,,0.740122,0.528813,
r4,0.188695,,,0.955057
r5,0.190794,1.978757,2.605967,0.683509


In [None]:
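# ------   avoiding "NaN"   ------
# instead of using a condition on the whole DataFrame like "fd > 0", we'll use a condition on a single column, e.g. fd['c1'] > 0
    # it'll return only the rows of the DataFrame where the condition is True
    # where 'fd' is the whole dataset and
    # fd['c1'] is a column (a Series with one value per row)

# "fd > 0" : condition for the entire DataFrame (gives a Boolean DataFrame)
# "fd['c1'] > 0" : condition for a single column (gives a Boolean Series indexed by row label)

# Using fd[fd['c1'] > 0] will return a subset of the DataFrame containing
    # only the rows where the condition is True, instead of filling with NaN.
# Note: fd['r1'] would raise a KeyError, because [] looks up column names and 'r1' is a row label;
    # a row is selected with fd.loc['r1'], and fd.loc[:, fd.loc['r1'] > 0] filters columns by a row condition.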

In [6]:
fd

Unnamed: 0,c1,c2,c3,c4
r1,2.70685,0.628133,0.907969,0.503826
r2,0.651118,-0.319318,-0.848077,0.605965
r3,-2.018168,0.740122,0.528813,-0.589001
r4,0.188695,-0.758872,-0.933237,0.955057
r5,0.190794,1.978757,2.605967,0.683509


#### using column

In [5]:
fd['c2'] > 0    # condition for column c2

r1     True
r2    False
r3     True
r4    False
r5     True
Name: c2, dtype: bool

In [7]:
# filtering the DataFrame fd with condition "fd['c2'] > 0"
fd[fd['c2'] > 0]

Unnamed: 0,c1,c2,c3,c4
r1,2.70685,0.628133,0.907969,0.503826
r3,-2.018168,0.740122,0.528813,-0.589001
r5,0.190794,1.978757,2.605967,0.683509


#### using row

In [14]:
# similarly we can filter "fd" for a row
fd.loc['r3'] > 0    # condition for row r3

c1    False
c2     True
c3     True
c4    False
Name: r3, dtype: bool

In [16]:
# filtering the DataFrame fd with condition "fd.loc['r3'] > 0"
fd.loc[:, fd.loc['r3'] > 0]

Unnamed: 0,c2,c3
r1,0.628133,0.907969
r2,-0.319318,-0.848077
r3,0.740122,0.528813
r4,-0.758872,-0.933237
r5,1.978757,2.605967


In [5]:
# get all the rows where 'c4' < 0
# it returns the row 'r3', because it has -ve value in column 'c4'
fd[fd['c4']<0]

Unnamed: 0,c1,c2,c3,c4
r3,-2.018168,0.740122,0.528813,-0.589001


#### work with resulting DataFrame, after filtering

In [9]:
reslt  = fd[fd['c2'] > 0]
reslt

Unnamed: 0,c1,c2,c3,c4
r1,2.70685,0.628133,0.907969,0.503826
r3,-2.018168,0.740122,0.528813,-0.589001
r5,0.190794,1.978757,2.605967,0.683509


In [10]:
# get result's 2nd row
reslt.loc['r3']     # because r2 and r4 are eliminated in "reslt"

c1   -2.018168
c2    0.740122
c3    0.528813
c4   -0.589001
Name: r3, dtype: float64

In [12]:
# result's 1st column
reslt['c1']

r1    2.706850
r3   -2.018168
r5    0.190794
Name: c1, dtype: float64

In [13]:
# we can do it in a single line
fd[fd['c2'] > 0]['c1']

r1    2.706850
r3   -2.018168
r5    0.190794
Name: c1, dtype: float64

In [14]:
# getting multiple columns
fd[fd['c2'] > 0][['c2', 'c4']]

Unnamed: 0,c2,c4
r1,0.628133,0.503826
r3,0.740122,-0.589001
r5,1.978757,0.683509


In [15]:
# getting 2nd row
fd[fd['c2'] > 0].iloc[1]

c1   -2.018168
c2    0.740122
c3    0.528813
c4   -0.589001
Name: r3, dtype: float64

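#### "fd[fd['c2'] > 0][['c2', 'c4']]" is equivalent to the following lines together
    # the one-liner is more compact because no intermediate variables are kept around; the step-by-step form is easier to read and debug
    # a single-step alternative is fd.loc[fd['c2'] > 0, ['c2', 'c4']], which is also the safe form when assigning values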

In [16]:
# step 1: Boolean Series, True for the rows where column c2 is positive
boolser = fd['c2'] > 0 
# step 2: keep only the rows flagged True
result = fd[boolser] 
# step 3: the column labels we want to keep
mycols = ['c2', 'c4'] 
# step 4: select those columns from the filtered rows
result[mycols]

Unnamed: 0,c2,c4
r1,0.628133,0.503826
r3,0.740122,-0.589001
r5,1.978757,0.683509


### --------  multiple conditions  --------
    # use bitwise "&", "|" instead of "and", "or"

In [None]:
# Python's "and" cannot combine two Boolean Series; the expression below raises a ValueError.
# The error is caught and printed on purpose, so the demonstration is visible
# and the notebook can still be run from top to bottom.
try:
    fd[(fd['c2'] > 0) and (fd['c4'] > 1)]   # ERR
except ValueError as err:
    print(f"{type(err).__name__}: {err}")
# Why the error? "The truth value of a Series is ambiguous"
    # because Python's normal "and" operator first needs ONE True/False value from its left operand,
    # and a whole "Series of Boolean" cannot be reduced to a single Boolean
    # "and" deals with a single Boolean value at a time. E.g. True and True, True and False

In [6]:
# Use bitwise-AND operator "&"
    # always wrap each condition in (): "&" binds tighter than comparisons such as ">"
fd[(fd['c1'] > 0) & (fd['c3'] > 1)]   # Notice "&" instead of "and"

Unnamed: 0,c1,c2,c3,c4
r5,0.190794,1.978757,2.605967,0.683509


In [7]:
# similarly we can use bitwise-OR operator "|"
fd[(fd['c1'] > 0) | (fd['c3'] > 1)]   # Notice "|" instead of "or"

Unnamed: 0,c1,c2,c3,c4
r1,2.70685,0.628133,0.907969,0.503826
r2,0.651118,-0.319318,-0.848077,0.605965
r4,0.188695,-0.758872,-0.933237,0.955057
r5,0.190794,1.978757,2.605967,0.683509
