![image.png](attachment:image.png)

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from numpy.random import randn

# Series

In [2]:
# The same three values held four ways: label list, plain list, NumPy array, dict.
labels = ['Anish', 'Bikraj', 'Rishad']
my_list = [1, 4, 5]
arr = np.array(my_list)
d = dict(zip(labels, my_list))

###### Using List

In [3]:
pd.Series(data=my_list)

0    1
1    4
2    5
dtype: int64

In [4]:
print("Familarity With Python")
# index= attaches the names in `labels` to the values, replacing the default 0..n-1 positions.
pd.Series(data=my_list,index=labels)

Familarity With Python


Anish     1
Bikraj    4
Rishad    5
dtype: int64

###### Using Numpy Array

In [5]:
pd.Series(data = arr)

0    1
1    4
2    5
dtype: int32

In [6]:
pd.Series(arr,labels)

Anish     1
Bikraj    4
Rishad    5
dtype: int32

###### Using Dictionary

In [7]:
ser = pd.Series(d)
ser

Anish     1
Bikraj    4
Rishad    5
dtype: int64

In [8]:
# Series whose *values* are the dictionary's keys (default 0..n-1 index), which
# is what the recorded output of this cell shows. `ser` is intentionally not
# rebound here: the following cell still looks up ser['Rishad'] on the
# dict-backed Series created in the previous cell.
pd.Series(list(d))

0     Anish
1    Bikraj
2    Rishad
dtype: object

In [9]:
ser['Rishad']

5

![image.png](attachment:image.png)

In [10]:
# Label -> value pairs given as a dict; insertion order becomes the index order.
ser1 = pd.Series({'USA': 1, 'Germany': 2, 'USSR': 3, 'Japan': 4})
ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [11]:
# Same construction as ser1 but with 'Italy' in place of 'USSR'.
ser2 = pd.Series({'USA': 1, 'Germany': 2, 'Italy': 5, 'Japan': 4})
ser2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

* The main data type of pandas is the **Series**, which is similar to a NumPy array but is indexed by labels
* It can be created from a list, a NumPy array, or a dictionary
* Elements are accessed with **[ ]**.
* Slicing is also supported on a Series
* It works like a hash table or dictionary, as it allows fast lookups of information by label


> Binary addition aligns the two Series on their index labels; labels present in only one Series produce `NaN`

In [12]:
# Addition pairs values by index label; 'Italy' and 'USSR' each exist in only one Series, so their result is NaN.
ser1+ser2

Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64

In [13]:
# Method form of the same addition; the output matches ser1 + ser2.
ser1.add(ser2)

Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64

https://pythontic.com/pandas/series-binaryoperatorfunctions/add#:~:text=Series%20instances%20can%20be%20added,both%20the%20series%20added%20up.

# DataFrame
![image.png](attachment:image.png)

In [14]:
# Seed NumPy's global generator so this 5x4 table of standard-normal draws is
# identical on every Restart & Run All. The numbers recorded in the outputs
# below came from an unseeded run and will be replaced on the next execution.
np.random.seed(101)
df = pd.DataFrame(
    randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [15]:
df

Unnamed: 0,W,X,Y,Z
A,-0.412905,-0.227466,-0.360223,-0.428678
B,-1.408072,-0.244111,-2.316725,-2.525666
C,0.99592,1.242858,-0.07392,-0.740383
D,-0.813091,0.201,0.060336,0.506079
E,-0.812846,-0.675002,1.954479,0.598865


In [16]:
type(df)

pandas.core.frame.DataFrame

In [17]:
type(df['W'])

pandas.core.series.Series

In [18]:
# Row-wise sum of two columns; assigning to a new key appends the column to df itself.
df['new'] = df['W'] + df['Y']
df

Unnamed: 0,W,X,Y,Z,new
A,-0.412905,-0.227466,-0.360223,-0.428678,-0.773128
B,-1.408072,-0.244111,-2.316725,-2.525666,-3.724797
C,0.99592,1.242858,-0.07392,-0.740383,0.922001
D,-0.813091,0.201,0.060336,0.506079,-0.752755
E,-0.812846,-0.675002,1.954479,0.598865,1.141633


In [19]:
# axis=1 targets a column label. The result is a new frame: the next cell shows df still has 'new'.
df.drop('new',axis=1)

Unnamed: 0,W,X,Y,Z
A,-0.412905,-0.227466,-0.360223,-0.428678
B,-1.408072,-0.244111,-2.316725,-2.525666
C,0.99592,1.242858,-0.07392,-0.740383
D,-0.813091,0.201,0.060336,0.506079
E,-0.812846,-0.675002,1.954479,0.598865


In [20]:
df

Unnamed: 0,W,X,Y,Z,new
A,-0.412905,-0.227466,-0.360223,-0.428678,-0.773128
B,-1.408072,-0.244111,-2.316725,-2.525666,-3.724797
C,0.99592,1.242858,-0.07392,-0.740383,0.922001
D,-0.813091,0.201,0.060336,0.506079,-0.752755
E,-0.812846,-0.675002,1.954479,0.598865,1.141633


Use of `inplace = True`

In [21]:
# inplace=True removes the column from df itself; the call produces no displayed result.
df.drop('new',axis=1,inplace=True)

In [22]:
df

Unnamed: 0,W,X,Y,Z
A,-0.412905,-0.227466,-0.360223,-0.428678
B,-1.408072,-0.244111,-2.316725,-2.525666
C,0.99592,1.242858,-0.07392,-0.740383
D,-0.813091,0.201,0.060336,0.506079
E,-0.812846,-0.675002,1.954479,0.598865


In [23]:
# axis=0 targets a row label; without inplace=True df keeps row 'C' (see the next cell).
df.drop('C',axis=0)

Unnamed: 0,W,X,Y,Z
A,-0.412905,-0.227466,-0.360223,-0.428678
B,-1.408072,-0.244111,-2.316725,-2.525666
D,-0.813091,0.201,0.060336,0.506079
E,-0.812846,-0.675002,1.954479,0.598865


* `axis=0` computes the function along the rows(index); act on all the Rows in each column
* `axis=1` computes the function along the columns; act on all the columns in each row
> More on https://stackoverflow.com/questions/22149584/what-does-axis-in-pandas-mean
![image.png](attachment:image.png)

In [24]:
df

Unnamed: 0,W,X,Y,Z
A,-0.412905,-0.227466,-0.360223,-0.428678
B,-1.408072,-0.244111,-2.316725,-2.525666
C,0.99592,1.242858,-0.07392,-0.740383
D,-0.813091,0.201,0.060336,0.506079
E,-0.812846,-0.675002,1.954479,0.598865


* A table of data in pandas is stored in a DataFrame 
* DataFrames are the workhorse of pandas, inspired by the R programming language
* Series is one-dimensional and DataFrame is 2-dimensional
* More on 
https://towardsdatascience.com/intro-to-pandas-for-excel-super-users-dac1b38f12b0
https://morioh.com/p/144857f81533
https://www.datacamp.com/tutorial/pandas-tutorial-dataframe-python
https://www.youtube.com/watch?v=Er2mm7Vmvy4

# Selecting Data

In [25]:
# df.loc?

In [26]:
df

Unnamed: 0,W,X,Y,Z
A,-0.412905,-0.227466,-0.360223,-0.428678
B,-1.408072,-0.244111,-2.316725,-2.525666
C,0.99592,1.242858,-0.07392,-0.740383
D,-0.813091,0.201,0.060336,0.506079
E,-0.812846,-0.675002,1.954479,0.598865


In [27]:
df.index

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [28]:
df.loc['A']

W   -0.412905
X   -0.227466
Y   -0.360223
Z   -0.428678
Name: A, dtype: float64

In [29]:
df.loc['B','Y']

-2.316724960380021

In [30]:
df.loc[['A','B'],['W']]

Unnamed: 0,W
A,-0.412905
B,-1.408072


In [31]:
# Label slices with .loc include both endpoints: rows A through D, columns W through Y.
df.loc['A':'D','W':'Y']

Unnamed: 0,W,X,Y
A,-0.412905,-0.227466,-0.360223
B,-1.408072,-0.244111,-2.316725
C,0.99592,1.242858,-0.07392
D,-0.813091,0.201,0.060336


![image.png](attachment:image.png)

## `df.loc[]` with conditions

In [32]:
# Boolean mask selects the rows where W is positive; ':' keeps every column.
df.loc[df.W>0,:]

Unnamed: 0,W,X,Y,Z
C,0.99592,1.242858,-0.07392,-0.740383


In [33]:
df.loc[:,'Y']

A   -0.360223
B   -2.316725
C   -0.073920
D    0.060336
E    1.954479
Name: Y, dtype: float64

# `df.iloc[]`

In [34]:
df

Unnamed: 0,W,X,Y,Z
A,-0.412905,-0.227466,-0.360223,-0.428678
B,-1.408072,-0.244111,-2.316725,-2.525666
C,0.99592,1.242858,-0.07392,-0.740383
D,-0.813091,0.201,0.060336,0.506079
E,-0.812846,-0.675002,1.954479,0.598865


In [35]:
df.iloc[2]

W    0.995920
X    1.242858
Y   -0.073920
Z   -0.740383
Name: C, dtype: float64

In [36]:
df.iloc[[2,4],1]

C    1.242858
E   -0.675002
Name: X, dtype: float64

In [37]:
list(df.W >0)

[False, False, True, False, False]

In [38]:
# The boolean Series is converted to a plain list so .iloc receives a purely positional mask.
# NOTE(review): .iloc is expected to reject a labelled boolean Series directly — confirm against the pandas docs.
df.iloc[list(df.W >0),:]

Unnamed: 0,W,X,Y,Z
C,0.99592,1.242858,-0.07392,-0.740383


![image.png](attachment:image.png)

* `loc` is label-based, which means that you have to specify rows and columns based on their row and column labels.
* It also supports slicing and selecting via conditions
* `loc[row_label, column_label]`
* `iloc` is integer position-based, so you have to specify rows and columns by their integer position values (0-based integer position).
* `iloc[row_position, column_position]`
More on:
https://towardsdatascience.com/how-to-use-loc-and-iloc-for-selecting-data-in-pandas-bd09cb4c3d79

# Operations on Dataframe

![image.png](attachment:image.png)

In [39]:
df>0

Unnamed: 0,W,X,Y,Z
A,False,False,False,False
B,False,False,False,False
C,True,True,False,False
D,False,True,True,True
E,False,False,True,True


In [40]:
df[df>0]

Unnamed: 0,W,X,Y,Z
A,,,,
B,,,,
C,0.99592,1.242858,,
D,,0.201,0.060336,0.506079
E,,,1.954479,0.598865


In [41]:
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
C,0.99592,1.242858,-0.07392,-0.740383


In [42]:
df

Unnamed: 0,W,X,Y,Z
A,-0.412905,-0.227466,-0.360223,-0.428678
B,-1.408072,-0.244111,-2.316725,-2.525666
C,0.99592,1.242858,-0.07392,-0.740383
D,-0.813091,0.201,0.060336,0.506079
E,-0.812846,-0.675002,1.954479,0.598865


In [43]:
# Row mask and column label in one .loc call instead of two chained [] lookups:
# same result, no intermediate frame, and safe to reuse for assignment.
df.loc[df['W'] > 0, 'Y']

C   -0.07392
Name: Y, dtype: float64

In [44]:
# Row mask and column list in one .loc call instead of two chained [] lookups;
# columns come back in the order requested (Y, then X).
df.loc[df['W'] > 0, ['Y', 'X']]

Unnamed: 0,Y,X
C,-0.07392,1.242858


In [45]:
df[(df['W']>0) & (df['Y'] <0)]

Unnamed: 0,W,X,Y,Z
C,0.99592,1.242858,-0.07392,-0.740383


In [46]:
# Moves the A..E labels into a regular column named 'index' and gives df a fresh 0..n-1 index.
df.reset_index(inplace = True)

In [47]:
df

Unnamed: 0,index,W,X,Y,Z
0,A,-0.412905,-0.227466,-0.360223,-0.428678
1,B,-1.408072,-0.244111,-2.316725,-2.525666
2,C,0.99592,1.242858,-0.07392,-0.740383
3,D,-0.813091,0.201,0.060336,0.506079
4,E,-0.812846,-0.675002,1.954479,0.598865


In [48]:
# df.set_index?

In [49]:
# Promotes the 'index' column back to the row index, undoing the earlier reset_index.
df.set_index('index',inplace=True)

In [50]:
df

Unnamed: 0_level_0,W,X,Y,Z
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,-0.412905,-0.227466,-0.360223,-0.428678
B,-1.408072,-0.244111,-2.316725,-2.525666
C,0.99592,1.242858,-0.07392,-0.740383
D,-0.813091,0.201,0.060336,0.506079
E,-0.812846,-0.675002,1.954479,0.598865


In [51]:
# Small frame with a single missing value: 'Pharmacy' is NaN for the 'Electrical' row.
df = pd.DataFrame(
    data=dict(
        CE=[1, 2, 0],
        CS=[5, 0, 2],
        EE=[11, 20, 0],
        Pharmacy=[23, 15, np.nan],
    ),
    index=['C++', 'Math', 'Electrical'],
)

In [52]:
df

Unnamed: 0,CE,CS,EE,Pharmacy
C++,1,5,11,23.0
Math,2,0,20,15.0
Electrical,0,2,0,


In [53]:
df.dropna()


Unnamed: 0,CE,CS,EE,Pharmacy
C++,1,5,11,23.0
Math,2,0,20,15.0


In [54]:
# axis=1 drops columns that contain NaN instead of rows, so 'Pharmacy' disappears and all rows stay.
df.dropna(axis=1)

Unnamed: 0,CE,CS,EE
C++,1,5,11
Math,2,0,20
Electrical,0,2,0


In [55]:
# thresh=2 keeps rows holding at least two non-NaN values; every row qualifies here, so nothing is dropped.
df.dropna(thresh=2)

Unnamed: 0,CE,CS,EE,Pharmacy
C++,1,5,11,23.0
Math,2,0,20,15.0
Electrical,0,2,0,


* NumPy has advanced operations, and their behaviour should be studied along different axes
* `.dropna()` is a useful method for dropping NaN values

### Filling Values 

![image-2.png](attachment:image-2.png)

In [56]:
# Displays a copy with NaN replaced by the given value; df itself still holds the NaN afterwards.
df.fillna(value='FILL VALUE')

Unnamed: 0,CE,CS,EE,Pharmacy
C++,1,5,11,23.0
Math,2,0,20,15.0
Electrical,0,2,0,FILL VALUE


In [57]:
# Fill the missing 'Pharmacy' entry with the column mean and assign the result
# back to df. Calling fillna(inplace=True) on the df['Pharmacy'] selection acts
# on an intermediate object: recent pandas emits a chained-assignment warning
# for it, and under copy-on-write df would be left unchanged.
df['Pharmacy'] = df['Pharmacy'].fillna(value=df['Pharmacy'].mean())

In [58]:
df

Unnamed: 0,CE,CS,EE,Pharmacy
C++,1,5,11,23.0
Math,2,0,20,15.0
Electrical,0,2,0,19.0


In [59]:
# Column-oriented input: each key names a column and each list holds its six values.
data = dict(
    Subject=['DBMS', 'DBMS', 'MCSC', 'MCSC', 'DIFF', 'DIFF'],
    Instructor=['Santosh', 'Nabin', 'Gokul', 'Ram', 'Saraswoti', 'Khim'],
    Assignments=[10, 2, 4, 5, 6, 4],
)

In [60]:
df = pd.DataFrame(data)
df

Unnamed: 0,Subject,Instructor,Assignments
0,DBMS,Santosh,10
1,DBMS,Nabin,2
2,MCSC,Gokul,4
3,MCSC,Ram,5
4,DIFF,Saraswoti,6
5,DIFF,Khim,4


In [61]:
by_comp = df.groupby('Subject')
by_comp

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002B8A1858490>

In [62]:
# Average only the numeric columns per Subject. 'Instructor' holds strings, and
# pandas 2.x raises a TypeError when asked to take its mean; numeric_only=True
# reproduces the single 'Assignments' column shown in the output.
by_comp.mean(numeric_only=True)

Unnamed: 0_level_0,Assignments
Subject,Unnamed: 1_level_1
DBMS,6.0
DIFF,5.0
MCSC,4.5


![image.png](attachment:image.png)
Italian surgeon Sergio Canavero said a team had 'realised the first human head transplant'.

In [63]:
df.head()

Unnamed: 0,Subject,Instructor,Assignments
0,DBMS,Santosh,10
1,DBMS,Nabin,2
2,MCSC,Gokul,4
3,MCSC,Ram,5
4,DIFF,Saraswoti,6


In [64]:
df.describe()

Unnamed: 0,Assignments
count,6.0
mean,5.166667
std,2.71416
min,2.0
25%,4.0
50%,4.5
75%,5.75
max,10.0


In [65]:
df['Subject'].unique()

array(['DBMS', 'MCSC', 'DIFF'], dtype=object)

In [66]:
df['Instructor'].nunique()

6

In [67]:
df['Subject'].value_counts()

DBMS    2
MCSC    2
DIFF    2
Name: Subject, dtype: int64

In [68]:
# Four-row frame: two integer columns and one string column.
df = pd.DataFrame(
    data=dict(
        col1=[1, 2, 3, 4],
        col2=[444, 555, 666, 444],
        col3=['abc', 'def', 'ghi', 'xyz'],
    )
)
df.head()

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [69]:
# Keep rows where col1 > 2 AND col2 == 444. Each comparison is parenthesised
# because & binds more tightly than > and ==.
newdf = df[(df['col1']>2) & (df['col2']==444)]
newdf

Unnamed: 0,col1,col2,col3
3,4,444,xyz


In [70]:
def times2(x):
    """Return ``x`` multiplied by 2 using the operand's own ``*`` behaviour.

    Numbers are doubled; sequences such as strings and lists are repeated twice.
    """
    doubled = x * 2
    return doubled

In [71]:
# apply() passes each column to times2, so the integer columns double and the string column repeats.
df.apply(times2)

Unnamed: 0,col1,col2,col3
0,2,888,abcabc
1,4,1110,defdef
2,6,1332,ghighi
3,8,888,xyzxyz


In [72]:
df.isnull()

Unnamed: 0,col1,col2,col3
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False


In [73]:
# Drop rows with NaN Values
df.dropna()

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


![image.png](attachment:image.png)