# Learning Pandas

#### PANDAS - Powerful data analysis tool for Python

Pandas is an open-source library that provides easy-to-use data structures and data analysis tools for Python. It is used in a wide range of domains, including finance, economics, and other academic and commercial fields.

###### A few of the key features of Pandas are:
- Modeling Data
- Analyzing
- Cleaning
- Manipulating
- Organizing data in Tabular or Plot form

###### Pandas Data Structures
- DataFrame
- Series 


In [2]:
#Please install first before importing
#import pandas and numpy
import pandas as pd
import numpy as np

### Pandas Series

In [3]:
#How to create an Empty series of data type string
s1 = pd.Series(dtype = 'string')
s1

Series([], dtype: string)

In [4]:
# Build a Series from a NumPy ndarray.
# When no index is passed, the labels default to 0 .. len(array) - 1.
data = np.array(['ab', 'bd', 'ce', 'da', 'ef'])
s2 = pd.Series(data)

# Show the type of the source array, then the type of the resulting Series
print(type(data), type(s2), sep='\n')
s2

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


0    ab
1    bd
2    ce
3    da
4    ef
dtype: object

In [5]:
# Build a Series from an ndarray and supply the index labels explicitly
data = np.array(list('abcde'))
s3 = pd.Series(data, index=[f'idx{n}' for n in range(1, 6)])
s3

idx1    a
idx2    b
idx3    c
idx4    d
idx5    e
dtype: object

In [6]:
# Build a Series from a dictionary: the keys become the index labels and the
# values become the data.
# The variable is deliberately not named `dict`, which would shadow the
# built-in type for every later cell.
data_dict = {'key1': 0, 'key2': 1, 'key3': 2, 'key4': 3}
s4 = pd.Series(data_dict)
print(type(s4))
s4

<class 'pandas.core.series.Series'>


key1    0
key2    1
key3    2
key4    3
dtype: int64

In [7]:
# Build a Series from a scalar: the single value is repeated for every index label
scalar = pd.Series(6, index=list(range(1, 5)))
scalar

1    6
2    6
3    6
4    6
dtype: int64

In [8]:
# Accessing elements: rebuild the string-labelled Series used by the access examples
data = np.array(list('abcde'))
s3 = pd.Series(data, index=['idx' + str(n) for n in range(1, 6)])

In [9]:
# Access by position with .iloc: s3 is indexed by string labels, and passing an
# integer key to [] on such a Series is deprecated positional access.
s3.iloc[0]

'a'

In [10]:
# Slice by position: returns the first three elements (the end position is excluded)
s3[:3]

idx1    a
idx2    b
idx3    c
dtype: object

In [11]:
# Slice by index label: the end label ('idx4') is included in the result
s3['idx1':'idx4']

idx1    a
idx2    b
idx3    c
idx4    d
dtype: object

In [12]:
###Series Methods###

#axes
print(s3.axes)

[Index(['idx1', 'idx2', 'idx3', 'idx4', 'idx5'], dtype='object')]


In [13]:
#Size
s3.size

5

In [14]:
#head and tail
s3.head(2)

idx1    a
idx2    b
dtype: object

In [15]:
s3.tail(3)

idx3    c
idx4    d
idx5    e
dtype: object

In [16]:
# Comparison with lists: multiplying a list by 2 repeats the whole list
list1 = list('abcde')
list2 = list1 * 2
list2

['a', 'b', 'c', 'd', 'e', 'a', 'b', 'c', 'd', 'e']

In [17]:
# Multiplying a Series by 2 is applied element by element: each string is doubled
series1 = pd.Series(list('abcde'))
series2 = series1 * 2
series2

0    aa
1    bb
2    cc
3    dd
4    ee
dtype: object

### Pandas DataFrames

In [18]:
#Creating empty dataframe
df = pd.DataFrame()
df

In [19]:
# Creating a DataFrame from a flat list: each element becomes one row of a single column
list1 = list('abcde')
df = pd.DataFrame(list1)
df

Unnamed: 0,0
0,a
1,b
2,c
3,d
4,e


In [20]:
# Creating a DataFrame from a list of lists: each inner list becomes one row
list3 = [
    ['ABC', 10],
    ['XYZ', 20],
    ['PQR', 30],
]
df = pd.DataFrame(list3)
df

Unnamed: 0,0,1
0,ABC,10
1,XYZ,20
2,PQR,30


In [21]:
#Assigning Column Names
df = pd.DataFrame(list3, columns = ['Name', 'Age'])
df

Unnamed: 0,Name,Age
0,ABC,10
1,XYZ,20
2,PQR,30


In [22]:
# Changing the data type of a column.
# Only 'Age' is numeric, so cast just that column with astype(). Passing
# dtype=float to the DataFrame constructor applies to every column, including
# the string 'Name' column, which cannot be converted to float.
df = pd.DataFrame(list3, columns=['Name', 'Age']).astype({'Age': float})
df

Unnamed: 0,Name,Age
0,ABC,10.0
1,XYZ,20.0
2,PQR,30.0


In [23]:
# Creating a DataFrame from a dictionary of lists:
# each key becomes a column name and each list supplies that column's values
students = {
    'Name': ['Andrew', 'Michael', 'Ron', 'Stephen'],
    'Age': [23, 22, 21, 20],
    'Marks': [98, 67, 78, 89],
}
df = pd.DataFrame(students)
df

Unnamed: 0,Name,Age,Marks
0,Andrew,23,98
1,Michael,22,67
2,Ron,21,78
3,Stephen,20,89


In [24]:
#Changing the indexes
df = pd.DataFrame(students, index = ['grade1', 'grade2', 'grade3', 'grade4'])
df

Unnamed: 0,Name,Age,Marks
grade1,Andrew,23,98
grade2,Michael,22,67
grade3,Ron,21,78
grade4,Stephen,20,89


In [25]:
# Row selection by label with .loc: a label slice includes both end labels
df = pd.DataFrame(students, index=['grade1', 'grade2', 'grade3', 'grade4'])
df.loc['grade1':'grade3']

Unnamed: 0,Name,Age,Marks
grade1,Andrew,23,98
grade2,Michael,22,67
grade3,Ron,21,78


In [26]:
# Row selection by position with .iloc: the end position is excluded (rows 0, 1 and 2)
df.iloc[0:3]

Unnamed: 0,Name,Age,Marks
grade1,Andrew,23,98
grade2,Michael,22,67
grade3,Ron,21,78


In [27]:
#Column Selection - Using iLoc
df.iloc[:,0:2]

Unnamed: 0,Name,Age
grade1,Andrew,23
grade2,Michael,22
grade3,Ron,21
grade4,Stephen,20


In [28]:
#Deleting multiple rows
df.drop(['grade2', 'grade3'], inplace = True)
df

Unnamed: 0,Name,Age,Marks
grade1,Andrew,23,98
grade4,Stephen,20,89


In [29]:
# Adding new columns and adding columns together.
# The new column is given as a Series; pandas aligns it on the index, so only
# the values whose labels exist in df are used.
df['Bonus'] = pd.Series([10, 20, 30, 40], index=['grade1', 'grade2', 'grade3', 'grade4'])
df['Sum'] = df['Age'] + df['Marks'] + df['Bonus']
df

Unnamed: 0,Name,Age,Marks,Bonus,Sum
grade1,Andrew,23,98,10,131
grade4,Stephen,20,89,40,149


In [30]:
# Row-wise addition using iloc: add 'Age' and 'Marks' (column positions 1 and 2)
# across each row. Position 0 is the string column 'Name', which cannot be
# added to numbers, so the slice starts at 1.
df['Sum'] = df.iloc[:, 1:3].sum(axis=1)
df

Unnamed: 0,Name,Age,Marks,Bonus,Sum
grade1,Andrew,23,98,10,121
grade4,Stephen,20,89,40,109


In [31]:
#Copy DataFrame
df1 = df.copy()
df1

Unnamed: 0,Name,Age,Marks,Bonus,Sum
grade1,Andrew,23,98,10,121
grade4,Stephen,20,89,40,109


In [32]:
# Deleting a column in place with the del statement
del df1['Name']
df1

Unnamed: 0,Age,Marks,Bonus,Sum
grade1,23,98,10,121
grade4,20,89,40,109


In [33]:
# pop() removes the column from df1 in place; its return value is not used here
df1.pop('Age')
df1

Unnamed: 0,Marks,Bonus,Sum
grade1,98,10,121
grade4,89,40,109


In [34]:
df

Unnamed: 0,Name,Age,Marks,Bonus,Sum
grade1,Andrew,23,98,10,121
grade4,Stephen,20,89,40,109


In [35]:
#Reordering of Columns
df = df[['Marks', 'Bonus', 'Sum', 'Age', 'Name']]
df

Unnamed: 0,Marks,Bonus,Sum,Age,Name
grade1,98,10,121,23,Andrew
grade4,89,40,109,20,Stephen


In [36]:
# Rearranging columns by position: first two, then the last one, then the third and fourth
cols = df.columns.tolist()
df1 = df[cols[:2] + cols[-1:] + cols[2:4]]
df1

Unnamed: 0,Marks,Bonus,Name,Sum,Age
grade1,98,10,Andrew,121,23
grade4,89,40,Stephen,109,20


In [37]:
#Filtering Data
df.loc[df['Name'] == 'Ron']

Unnamed: 0,Marks,Bonus,Sum,Age,Name


In [38]:
df.loc[(df['Sum'] > 85) & (df['Sum'] < 115)]

Unnamed: 0,Marks,Bonus,Sum,Age,Name
grade4,89,40,109,20,Stephen


In [39]:
# Methods: sample data for the DataFrame method examples
dict1 = {
    'Name': pd.Series(['Martin', 'Jack', 'Mary', 'David', 'Stephen',
                       'Shannon', 'Angela', 'Stephanie', 'Philip', 'Michael']),
    'Age': pd.Series([34, 23, 56, 34, 27, 43, 33, 54, 44, 41]),
    'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8, 3.78, 2.98, 4.80]),
}

dfg = pd.DataFrame(dict1)
dfg

Unnamed: 0,Name,Age,Rating
0,Martin,34,4.23
1,Jack,23,3.24
2,Mary,56,3.98
3,David,34,2.56
4,Stephen,27,3.2
5,Shannon,43,4.6
6,Angela,33,3.8
7,Stephanie,54,3.78
8,Philip,44,2.98
9,Michael,41,4.8


In [41]:
#Transpose
dfg.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Name,Martin,Jack,Mary,David,Stephen,Shannon,Angela,Stephanie,Philip,Michael
Age,34,23,56,34,27,43,33,54,44,41
Rating,4.23,3.24,3.98,2.56,3.2,4.6,3.8,3.78,2.98,4.8


In [43]:
#axes
dfg.axes

[RangeIndex(start=0, stop=10, step=1),
 Index(['Name', 'Age', 'Rating'], dtype='object')]

In [45]:
#datatypes
dfg.dtypes

Name       object
Age         int64
Rating    float64
dtype: object

In [46]:
#shapes
dfg.shape

(10, 3)

In [47]:
#Size
dfg.size

30

In [48]:
#Values
dfg.values

array([['Martin', 34, 4.23],
       ['Jack', 23, 3.24],
       ['Mary', 56, 3.98],
       ['David', 34, 2.56],
       ['Stephen', 27, 3.2],
       ['Shannon', 43, 4.6],
       ['Angela', 33, 3.8],
       ['Stephanie', 54, 3.78],
       ['Philip', 44, 2.98],
       ['Michael', 41, 4.8]], dtype=object)

In [63]:
#sum, mean, std, count
dfg.sum()

Name      MartinJackMaryDavidStephenShannonAngelaStephan...
Age                                                     389
Rating                                                37.17
dtype: object

In [53]:
# axis=1 adds across the columns, giving one total per row
# (axis=0, the default, adds down the rows, giving one total per column).
# numeric_only=True leaves out the string 'Name' column, which cannot be added to numbers.
dfg.sum(axis=1, numeric_only=True)

0    38.23
1    26.24
2    59.98
3    36.56
4    30.20
5    47.60
6    36.80
7    57.78
8    46.98
9    45.80
dtype: float64

In [50]:
# Mean of each numeric column; numeric_only=True leaves out the string 'Name' column,
# for which a mean cannot be computed
dfg.mean(numeric_only=True)

Age       38.900
Rating     3.717
dtype: float64

In [51]:
# Standard deviation of each numeric column; numeric_only=True leaves out the
# string 'Name' column, for which it cannot be computed
dfg.std(numeric_only=True)

Age       10.774971
Rating     0.720525
dtype: float64

In [52]:
dfg.count(axis = 1)

0    3
1    3
2    3
3    3
4    3
5    3
6    3
7    3
8    3
9    3
dtype: int64

In [54]:
#Data Summary - shows summary of statistics
dfg.describe()

Unnamed: 0,Age,Rating
count,10.0,10.0
mean,38.9,3.717
std,10.774971,0.720525
min,23.0,2.56
25%,33.25,3.21
50%,37.5,3.79
75%,43.75,4.1675
max,56.0,4.8


In [55]:
#Sorting
#by Index
sorted_dfg = dfg.sort_index()
sorted_dfg

Unnamed: 0,Name,Age,Rating
0,Martin,34,4.23
1,Jack,23,3.24
2,Mary,56,3.98
3,David,34,2.56
4,Stephen,27,3.2
5,Shannon,43,4.6
6,Angela,33,3.8
7,Stephanie,54,3.78
8,Philip,44,2.98
9,Michael,41,4.8


In [56]:
sorted_dfg = dfg.sort_index(ascending = False)
sorted_dfg

Unnamed: 0,Name,Age,Rating
9,Michael,41,4.8
8,Philip,44,2.98
7,Stephanie,54,3.78
6,Angela,33,3.8
5,Shannon,43,4.6
4,Stephen,27,3.2
3,David,34,2.56
2,Mary,56,3.98
1,Jack,23,3.24
0,Martin,34,4.23


In [58]:
#Sorting by Values
sorted_age= dfg.sort_values('Age')
sorted_age

Unnamed: 0,Name,Age,Rating
1,Jack,23,3.24
4,Stephen,27,3.2
6,Angela,33,3.8
0,Martin,34,4.23
3,David,34,2.56
9,Michael,41,4.8
5,Shannon,43,4.6
8,Philip,44,2.98
7,Stephanie,54,3.78
2,Mary,56,3.98


In [86]:
# DataFrame with missing values: np.nan marks every missing entry
raw_data = {
    'first_name': ['Martin', 'Jack', np.nan, 'David', 'Stephen'],
    'last_name': ['Duston', np.nan, 'Miller', 'Ross', 'Still'],
    'age': [42, 36, np.nan, 24, 73],
    'sex': ['m', np.nan, np.nan, 'f', np.nan],
    'preTestScore': [4, np.nan, 2, np.nan, 3],
    'postTestScore': [25, np.nan, np.nan, 62, 70],
}

# Keep the columns in the same order as the dictionary keys
df = pd.DataFrame(raw_data, columns=list(raw_data))

In [87]:
#identifying the missing values
df.isnull()

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,False,False,False,False,False,False
1,False,True,False,True,True,True
2,True,False,True,True,False,True
3,False,False,False,False,True,False
4,False,False,False,True,False,False


In [88]:
df.isnull().sum()

first_name       1
last_name        1
age              1
sex              3
preTestScore     2
postTestScore    2
dtype: int64

In [89]:
#Drop rows with missing values
df_drop = df.dropna()
df_drop

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Martin,Duston,42.0,m,4.0,25.0


In [90]:
#Create a new column full of missing values
df['new'] = np.nan
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,new
0,Martin,Duston,42.0,m,4.0,25.0,
1,Jack,,36.0,,,,
2,,Miller,,,2.0,,
3,David,Ross,24.0,f,,62.0,
4,Stephen,Still,73.0,,3.0,70.0,


In [91]:
#Drop column if they only contain missing values
df.dropna(axis=1,how="all",inplace=True)
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Martin,Duston,42.0,m,4.0,25.0
1,Jack,,36.0,,,
2,,Miller,,,2.0,
3,David,Ross,24.0,f,,62.0
4,Stephen,Still,73.0,,3.0,70.0


In [92]:
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Martin,Duston,42.0,m,4.0,25.0
1,Jack,,36.0,,,
2,,Miller,,,2.0,
3,David,Ross,24.0,f,,62.0
4,Stephen,Still,73.0,,3.0,70.0


In [93]:
#Fill in missing data with Zeros
df.fillna(0)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Martin,Duston,42.0,m,4.0,25.0
1,Jack,0,36.0,0,0.0,0.0
2,0,Miller,0.0,0,2.0,0.0
3,David,Ross,24.0,f,0.0,62.0
4,Stephen,Still,73.0,0,3.0,70.0


In [95]:
# Fill the missing values in one column with the mean of that column.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df["preTestScore"] modifies an intermediate object and is not guaranteed to
# update df itself.
df["preTestScore"] = df["preTestScore"].fillna(df["preTestScore"].mean())
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Martin,Duston,42.0,m,4.0,25.0
1,Jack,,36.0,,3.0,
2,,Miller,,,2.0,
3,David,Ross,24.0,f,3.0,62.0
4,Stephen,Still,73.0,,3.0,70.0


In [96]:
# Select the rows of df where age is not NaN and last_name is not NaN
df[df['age'].notnull() & df['last_name'].notnull()]

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Martin,Duston,42.0,m,4.0,25.0
3,David,Ross,24.0,f,3.0,62.0
4,Stephen,Still,73.0,,3.0,70.0
