# Pandas for Python : Basics of DataFrames and Series

pandas is an open-source data manipulation and analysis library that is well suited to structured data. It offers two primary data structures: DataFrames and Series.

1. DataFrame - 2D, size-mutable, and able to store data of different types. It behaves like a table with rows and columns.

2. Series - a 1D labeled array holding a single column or row of data.

### Data Frames 

**class pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=None)** 

In [3]:
# Build a DataFrame from a dictionary: every key becomes a column label and
# every list supplies the values of that column.

import pandas as pd

d = {
    "Name": ["Rubina", "Monica", "Mac", "Ajay"],
    "Age": [28, 35, 22, 40],
}
df = pd.DataFrame(d)
df

Unnamed: 0,Name,Age
0,Rubina,28
1,Monica,35
2,Mac,22
3,Ajay,40


In [5]:
# dtypes lists the data type of every column: the string column Name is
# reported as object and the integer column Age as int64.
df.dtypes

Name    object
Age      int64
dtype: object

In [16]:
# Build a DataFrame from a dict that mixes a plain list with a Series.
# The Series is labelled 1..5 while the frame's index is 0..5, so after
# alignment row 0 has no matching value and shows NaN; the column is stored
# as float.
d1 = {
    "Name": ["Joey", "Phoebe", "Monica", "Chandler", "Ross", "Rachel"],
    "Marriage satus": pd.Series([1, 1, 1, 3, 1], index=range(1, 6)),
}
df1 = pd.DataFrame(d1, index=range(6))
df1

Unnamed: 0,Name,Marriage satus
0,Joey,
1,Phoebe,1.0
2,Monica,1.0
3,Chandler,1.0
4,Ross,3.0
5,Rachel,1.0


In [99]:
# A list of labels inside [] selects columns and returns a DataFrame
# (here a one-column frame containing only Name).
df1[['Name']]

Unnamed: 0,Name
0,Joey
1,Phoebe
2,Monica
3,Chandler
4,Ross
5,Rachel


In [20]:
# Build a DataFrame from a 2-D NumPy array, supplying explicit row labels
# (index) and column labels (columns).
import numpy as np

df2 = pd.DataFrame(
    np.array([[1, 2, 3], [4, 5, 6], [0, 4, 3]]),
    index=["r1", "r2", "r3"],
    columns=["c1", "c2", "c3"],
)
df2

Unnamed: 0,c1,c2,c3
r1,1,2,3
r2,4,5,6
r3,0,4,3


In [70]:
# A single label inside [] returns that column as a Series.
print(df1["Name"])

0        Joey
1      Phoebe
2      Monica
3    Chandler
4        Ross
5      Rachel
Name: Name, dtype: object


In [71]:
# loc with a single label returns the row whose index label is 3, as a Series.
print(df1.loc[3])

Name              Chandler
Marriage satus         1.0
Name: 3, dtype: object


In [72]:
# iloc selects by integer position: position 2 is the third row.
print(df1.iloc[2])

Name              Monica
Marriage satus       1.0
Name: 2, dtype: object


In [74]:
# Select several columns at once by passing a list of column labels.
print(df[["Name","Age"]])

     Name  Age
0  Rubina   28
1  Monica   35
2     Mac   22
3    Ajay   40


In [75]:
# A slice inside [] selects rows by position; the stop (3) is excluded,
# so only rows 1 and 2 are returned.
print(df[1:3])

     Name  Age
1  Monica   35
2     Mac   22


In [77]:
# Names and heights, one row per person.
data = {
    "Name": ["Jake", "Charles", "Santiago", "Rosa"],
    "Height": [106, 102, 104, 110],
}
details_df = pd.DataFrame(data)
details_df

Unnamed: 0,Name,Height
0,Jake,106
1,Charles,102
2,Santiago,104
3,Rosa,110


In [78]:
# Boolean-mask filtering: the comparison yields one True/False per row and
# only the rows marked True (Height strictly greater than 102) are kept.
height_above_102 = details_df[details_df["Height"]>102]
height_above_102

Unnamed: 0,Name,Height
0,Jake,106
2,Santiago,104
3,Rosa,110


In [79]:
# shape is a (number of rows, number of columns) tuple.
details_df.shape

(4, 2)

In [80]:
# info() prints a summary: the index range, each column's non-null count and
# dtype, and the memory usage.
details_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Height  4 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 196.0+ bytes


In [97]:
# min() is evaluated column by column; the string column Name is compared
# alphabetically, the numeric column Height numerically.
details_df.min()

Name      Charles
Height        102
dtype: object

In [98]:
# max() is evaluated column by column; the string column Name is compared
# alphabetically, the numeric column Height numerically.
details_df.max()

Name      Santiago
Height         110
dtype: object

In [101]:
# Name, sex and age for eight people, one row per person.
data1 = {
    "Name": ["Rubina", "Marria", "Emily", "Nisha", "Song", "Ajay", "Marc", "Vijay"],
    "Sex": ["F", "F", "F", "F", "M", "M", "M", "M"],
    "Age": [28, 23, 56, 21, 45, 30, 29, 34],
}
personal_details = pd.DataFrame(data1)
personal_details

Unnamed: 0,Name,Sex,Age
0,Rubina,F,28
1,Marria,F,23
2,Emily,F,56
3,Nisha,F,21
4,Song,M,45
5,Ajay,M,30
6,Marc,M,29
7,Vijay,M,34


In [103]:
# Boolean-mask filtering: keep only the rows whose Name equals "Marc".
personal_details[personal_details["Name"]=="Marc"]

Unnamed: 0,Name,Sex,Age
6,Marc,M,29


In [115]:
# Combine two conditions with & (element-wise AND). Each condition needs its
# own parentheses because & binds more tightly than == and >.
personal_details[(personal_details["Sex"]=="M")&(personal_details["Age"]>30)]

Unnamed: 0,Name,Sex,Age
4,Song,M,45
7,Vijay,M,34


In [118]:
# loc is label-based selection: pass the row label first and the column label
# second. Here: row label 3, column "Name".
personal_details.loc[3,"Name"]

'Nisha'

In [119]:
# Label-based lookup: row label 0, column "Age".
personal_details.loc[0,"Age"]

28

In [120]:
# iloc is integer-position-based selection: row position 0, column position 2
# (the third column, Age).
personal_details.iloc[0,2]

28

In [122]:
# Position-based lookup: first row, first column (Name).
personal_details.iloc[0,0]

'Rubina'

In [123]:
# Use the Name column as the row index. set_index returns a new frame, so
# personal_details itself keeps its default integer index.
df4 = personal_details.set_index("Name")
df4

Unnamed: 0_level_0,Sex,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Rubina,F,28
Marria,F,23
Emily,F,56
Nisha,F,21
Song,M,45
Ajay,M,30
Marc,M,29
Vijay,M,34


In [126]:
# With Name as the index, a row can be looked up directly by the person's name.
df4.loc["Marc","Age"]

29

In [128]:
# loc slices by label and includes BOTH the start and the stop:
# row labels 0 through 6 and columns Name through Sex.
personal_details.loc[0:6,"Name":"Sex"]

Unnamed: 0,Name,Sex
0,Rubina,F
1,Marria,F
2,Emily,F
3,Nisha,F
4,Song,M
5,Ajay,M
6,Marc,M


In [136]:
# iloc slices by position and excludes the stop: rows 0-5 and columns 0-1.
personal_details.iloc[0:6, 0:2]

Unnamed: 0,Name,Sex
0,Rubina,F
1,Marria,F
2,Emily,F
3,Nisha,F
4,Song,M
5,Ajay,M


In [138]:
# columns holds the column labels as an Index object.
personal_details.columns

Index(['Name', 'Sex', 'Age'], dtype='object')

In [142]:
# One dtype per column: the two string columns are object, Age is int64.
personal_details.dtypes

Name    object
Sex     object
Age      int64
dtype: object

In [143]:
# values returns the data as a 2-D NumPy array; because the columns mix
# strings and integers the array has dtype=object.
personal_details.values

array([['Rubina', 'F', 28],
       ['Marria', 'F', 23],
       ['Emily', 'F', 56],
       ['Nisha', 'F', 21],
       ['Song', 'M', 45],
       ['Ajay', 'M', 30],
       ['Marc', 'M', 29],
       ['Vijay', 'M', 34]], dtype=object)

In [144]:
# size is the total number of cells: 8 rows x 3 columns = 24.
personal_details.size

24

In [145]:
# shape is a (number of rows, number of columns) tuple.
personal_details.shape

(8, 3)

### Series 

**class pandas.Series(data=None, index=None, dtype=None, name=None, copy=None, fastpath=NoDefault.no_default)**

In [26]:
# Creating a Series from a dictionary: the keys become the index labels and
# the values become the data. The index is passed explicitly here.

d = {"Name1": "Rubina", "Name2": "Marrie", "Name3": "Rosa"}
ser = pd.Series(
    d,
    index=["Name1", "Name2", "Name3"],
)
ser

Name1    Rubina
Name2    Marrie
Name3      Rosa
dtype: object

In [27]:
# The index argument is optional: when it is omitted, the dictionary keys are
# used as the index labels.
d1 = {"a": 2, "b": 4, "c": 5}
ser1 = pd.Series(d1)
ser1

a    2
b    4
c    5
dtype: int64

In [29]:
# Passing an index whose labels are NOT keys of the dictionary.
# The values are matched to the requested labels by key; 'x', 'y' and 'z' do
# not appear in the dictionary, so every entry of the result is NaN.
d2 = {"a": 123, "b": 928, "c": 235}
ser3 = pd.Series(d2, index=["x", "y", "z"])
ser3


x   NaN
y   NaN
z   NaN
dtype: float64

In [34]:
# The copy argument only affects Series or 1-D ndarray input. Here the input
# is a plain Python list, so even with copy=False the write to series1 below
# does not change r (the list printed last is still [3, 4]).
r = [3,4]
series1 = pd.Series(r,copy=False)
series1.iloc[0] = 9
print(series1)
print(r)

0    9
1    4
dtype: int64
[3, 4]


In [43]:
# Input is a 1-D ndarray.
# With copy=False the Series shares the array's data, so the writes made
# through the Series also show up in r1 (compare the two printouts).
# With copy=True the Series gets its own copy and r1 stays unchanged.
# NOTE(review): the sharing is what the recorded output shows; confirm it
# still holds for the pandas version / Copy-on-Write mode you run.
r1 = np.array([4,5,9,23,56,12,89])
series2 = pd.Series(r1, copy=False)
series2.iloc[1] = 99
series2.iloc[3] = 0
print("******** Series2 ********\n")
print(series2)
print("\n")
print("******** r1 ************\n")
print(r1)

******** Series2 ********

0     4
1    99
2     9
3     0
4    56
5    12
6    89
dtype: int64


******** r1 ************

[ 4 99  9  0 56 12 89]


In [44]:
# A single element can be read with []: on this default integer index,
# series2[3] looks up the element whose index label is 3.
print(series2[3])

0


In [45]:
# iloc reads by integer position: position 5 is the sixth element.
print(series2.iloc[5])

12


In [46]:
# Access multiple elements with a slice; the stop (6) is excluded, so
# elements 1 through 5 are returned.
print(series2[1:6])

1    99
2     9
3     0
4    56
5    12
dtype: int64


In [51]:
# Working with the index: build a Series of colours labelled by item name,
# then inspect its index.
items = ["sky", "grass", "road", "flower"]
color = ["blue", "green", "black", "red"]
details = pd.Series(data=color, index=items)
details.index


Index(['sky', 'grass', 'road', 'flower'], dtype='object')

In [52]:
# Assigning to .index relabels the existing Series; the values stay the same
# and only the labels change.
details.index = ["Blu","Grn","Blk","Rd"]
details.index

Index(['Blu', 'Grn', 'Blk', 'Rd'], dtype='object')

In [53]:
# Display the Series with its new index labels.
details

Blu     blue
Grn    green
Blk    black
Rd       red
dtype: object

In [55]:
# .array exposes the underlying data as a pandas extension array
# (a NumpyExtensionArray in the output shown).
pd.Series([1,2,3,4]).array

<NumpyExtensionArray>
[1, 2, 3, 4]
Length: 4, dtype: int64

In [56]:
# A Series built from a Categorical has dtype "category"; the categories are
# listed in sorted order, not in order of appearance.
pd.Series(pd.Categorical(["Mind","Soul","Peace"]))

0     Mind
1     Soul
2    Peace
dtype: category
Categories (3, object): ['Mind', 'Peace', 'Soul']

In [60]:
# .values returns the data as a NumPy array (dtype=object for strings here).
pd.Series(["Rubina","Manny","Bob"]).values

array(['Rubina', 'Manny', 'Bob'], dtype=object)

In [61]:
# .values on an integer Series returns an integer NumPy array.
# NOTE(review): series2 is already a Series, so the pd.Series(...) wrapper
# looks redundant; series2.values should give the same result.
pd.Series(series2).values

array([ 4, 99,  9,  0, 56, 12, 89])

In [62]:
# Arithmetic mean of the seven values (269 / 7).
series2.mean()

38.42857142857143

In [63]:
# Largest value in the Series.
series2.max()

99

In [64]:
# Smallest value in the Series.
series2.min()

0

In [65]:
# unique() returns the distinct values as an array, in order of first
# appearance; all seven values are distinct here.
series2.unique()

array([ 4, 99,  9,  0, 56, 12, 89])

In [67]:
# sort_values() returns the values in ascending order; each value keeps its
# original index label.
series2.sort_values()

3     0
0     4
2     9
5    12
4    56
6    89
1    99
dtype: int64

In [68]:
# sort_index() orders by index label; the index is already 0..6, so the
# order is unchanged.
series2.sort_index()

0     4
1    99
2     9
3     0
4    56
5    12
6    89
dtype: int64

## Author
### Rubina