In [1]:
import pandas as pd
import numpy as np

In [2]:
#A series is a 1d labelled array.
#Create series using pd.Series(data, index)

series = pd.Series(data = [1, 2, 3], index = ['A', 'B', 'C'])
print(series)
print(type(series))

series2 = pd.Series([1, 2, 3], ['A', 'B', 'C'])
print(series2)
print(type(series2))

A    1
B    2
C    3
dtype: int64
<class 'pandas.core.series.Series'>
A    1
B    2
C    3
dtype: int64
<class 'pandas.core.series.Series'>


In [3]:
#Now we can access array element using 2 different ways:
print(f"Acessing using series[0]: {series[0]}")
print(f"Accessing using series['A']: {series['A']}")

Acessing using series[0]: 1
Accessing using series['A']: 1


In [7]:
#Data in the series can be passed as either:
# A python list
# A numpy array
# A python dictionary

#Python list
series5 = pd.Series(data = [1, 2, 3], index = ['A', 'B', 'C'])
print(series5)

#Numpy array
print("From Numpy array")
series4 = pd.Series(data = np.array([1, 2, 3]), index = ['A', 'B', 'C'])
print(series4)

#Python Dictionary
print("From Python Dictionary")
dict_ = {'A' : 1, 'B' : 2, 'C' : 3}
print(type(dict_))

series3 = pd.Series(dict_)
print(series3)
print(type(series3))

A    1
B    2
C    3
dtype: int64
From Numpy array
A    1
B    2
C    3
dtype: int32
From Python Dictionary
<class 'dict'>
A    1
B    2
C    3
dtype: int64
<class 'pandas.core.series.Series'>


In [8]:
#If there are no label, series will default to [0, n) where n is the number of data values
series6 = pd.Series(data = [1, 2, 3])
print(series6)

0    1
1    2
2    3
dtype: int64


In [11]:
#We can perform operation on series (done base on index)

week_one = pd.Series(data=[100, 50, 300], index=['Bob', 'Sally', 'Jess'])
print(week_one)


week_two = pd.Series(data=[500, 30, 20], index=['Bob', 'Sally', 'Jess'])
print(week_two)

total = week_one + week_two
print(total)

Bob      100
Sally     50
Jess     300
dtype: int64
Bob      500
Sally     30
Jess      20
dtype: int64
Bob      600
Sally     80
Jess     320
dtype: int64


In [12]:
week_one = pd.Series(data=[100, 50, 300], index=['Bob', 'Peter', 'Jess'])
print(week_one)


week_two = pd.Series(data=[500, 30, 20], index=['Bob', 'Sally', 'Jess'])
print(week_two)

#Can perform operation if both don't have exactly the same column label but will result in NaN for the missing column 
total2 = week_one + week_two
print(total2)

Bob      100
Peter     50
Jess     300
dtype: int64
Bob      500
Sally     30
Jess      20
dtype: int64
Bob      600.0
Jess     320.0
Peter      NaN
Sally      NaN
dtype: float64


In [13]:
#Data Frame is a 2d labeled data structure with columns of potentially differnt type. Essentially a spreadsheet
# pd.DataFrame(data, index, columns) columns are an additional way to access individual series/column

df = pd.DataFrame(data = np.arange(0, 20).reshape(4, 5), index = ['A', 'B', 'C', 'D'], columns = ['col1', 'col2', 'col3', 'col4', 'col5'])
print(df)
print(type(df))

   col1  col2  col3  col4  col5
A     0     1     2     3     4
B     5     6     7     8     9
C    10    11    12    13    14
D    15    16    17    18    19
<class 'pandas.core.frame.DataFrame'>


In [14]:
print(df['col3'])
print(type(df['col3']))

A     2
B     7
C    12
D    17
Name: col3, dtype: int32
<class 'pandas.core.series.Series'>


Unnamed: 0,col1,col2,col3,col4,col5
A,0,1,2,3,4
B,5,6,7,8,9
C,10,11,12,13,14
D,15,16,17,18,19


In [16]:
#Same as before default for both series and column will be [0, n)
#P.S Data Frame is visually better without print statement in Jupyter Notebook
my_df = pd.DataFrame(data=np.arange(0,20).reshape(4,5))
my_df

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [18]:
my_df = pd.DataFrame(data=np.arange(0,20).reshape(4,5), index=['A', 'B', 'C', 'D'], columns=['col1', 'col2', 'col3', 'col4', 'col5'])
#retrieve list of columns
my_df[['col2', 'col3']]

Unnamed: 0,col2,col3
A,1,2
B,6,7
C,11,12
D,16,17


In [19]:
#To access row information: specify the index location using .iloc, or the index name/label using .loc
my_df.iloc[0]

col1    0
col2    1
col3    2
col4    3
col5    4
Name: A, dtype: int32

In [20]:
my_df.loc['A']

col1    0
col2    1
col3    2
col4    3
col5    4
Name: A, dtype: int32

In [22]:
#look up bunch of rows and columns
my_df.loc['B':'D', 'col1':'col3']

Unnamed: 0,col1,col2,col3
B,5,6,7
C,10,11,12
D,15,16,17


In [23]:
#Since we built the Data Frame from Numpy arrays, we can use the same technique of selecting elements based off some condtions
my_df % 2 == 0

Unnamed: 0,col1,col2,col3,col4,col5
A,True,False,True,False,True
B,False,True,False,True,False
C,True,False,True,False,True
D,False,True,False,True,False


In [24]:
#Select the values that meet the conditions. The one that does not will be replaced with NaN
my_df[my_df % 2 == 0]

Unnamed: 0,col1,col2,col3,col4,col5
A,0.0,,2.0,,4.0
B,,6.0,,8.0,
C,10.0,,12.0,,14.0
D,,16.0,,18.0,


In [25]:
#To fill in NaN values use fillna(value)
my_df[my_df & 2 == 0].fillna(value = 0)

Unnamed: 0,col1,col2,col3,col4,col5
A,0.0,1.0,0.0,0.0,4.0
B,5.0,0.0,0.0,8.0,9.0
C,0.0,0.0,12.0,13.0,0.0
D,0.0,16.0,17.0,0.0,0.0


In [26]:
#filling all NaN values with whatever the mean of my_df's original col2 is: ﴾1+6+11+16﴿/4 = 8.5
my_df[my_df % 2 == 0].fillna(value=my_df['col2'].mean())


Unnamed: 0,col1,col2,col3,col4,col5
A,0.0,8.5,2.0,8.5,4.0
B,8.5,6.0,8.5,8.0,8.5
C,10.0,8.5,12.0,8.5,14.0
D,8.5,16.0,8.5,18.0,8.5


In [27]:
#To add new columns to Data Frame
my_df['newCol'] = [10, 20, 30, 40] #must have the same amount of rows
my_df

Unnamed: 0,col1,col2,col3,col4,col5,newCol
A,0,1,2,3,4,10
B,5,6,7,8,9,20
C,10,11,12,13,14,30
D,15,16,17,18,19,40


In [28]:
my_df['col1 + col2'] = my_df['col1'] + my_df['col2']
my_df

Unnamed: 0,col1,col2,col3,col4,col5,newCol,col1 + col2
A,0,1,2,3,4,10,1
B,5,6,7,8,9,20,11
C,10,11,12,13,14,30,21
D,15,16,17,18,19,40,31


In [31]:
# To drop columns
my_df.drop(columns = ['newCol']) #This is done in place so changes are not stored unless specified to a new value
testing = my_df.drop(columns = ['newCol'])
testing

Unnamed: 0,col1,col2,col3,col4,col5,col1 + col2
A,0,1,2,3,4,1
B,5,6,7,8,9,11
C,10,11,12,13,14,21
D,15,16,17,18,19,31


In [32]:
# Or we can use additional parameter inplace = True to save the changes to the same Data Frame
my_df.drop(columns = ['newCol', 'col1 + col2'], inplace = True)
my_df

Unnamed: 0,col1,col2,col3,col4,col5
A,0,1,2,3,4
B,5,6,7,8,9
C,10,11,12,13,14
D,15,16,17,18,19
