# Pandas
- Pandas is an open-source Python library
- Pandas is a data analysis library
- Used for working with data sets

# Importing

In [1]:
#import packages
import pandas as pd
import numpy as np

## Pandas Series
- A one-dimensional labeled array; it can hold any data type, and all of its elements share a single dtype (mixed values are stored as `object`)

### Indexing
- By passing the `index` argument we can give our own labels to the elements

### 1. Creating series using list

In [2]:
# Build a Series from a Python list, labelling each value through `index`.
l1 = list(range(1, 7))
index_list = list('abcdef')
s1 = pd.Series(l1, index=index_list)
# Show the Series, then the type of the Series and of the source list.
print(s1, type(s1), type(l1), sep='\n')

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64
<class 'pandas.core.series.Series'>
<class 'list'>


### 2. Creating series using tuple

In [3]:
# Same construction as with a list, but both values and labels come from tuples.
t1 = tuple(range(1, 7))
index_list = tuple('abcdef')
s2 = pd.Series(t1, index=index_list)
for shown in (s2, type(s2), type(t1)):
    print(shown)

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64
<class 'pandas.core.series.Series'>
<class 'tuple'>


### 3. Creating series using dictionary

In [4]:
# A dict needs no separate `index`: its keys become the labels.
d1 = dict(zip('abcde', range(1, 6)))
s3 = pd.Series(d1)
print(s3, type(d1), type(s3), sep='\n')

a    1
b    2
c    3
d    4
e    5
dtype: int64
<class 'dict'>
<class 'pandas.core.series.Series'>


### 4. Creating series using numpy arrays

In [5]:
# Without an explicit `index`, the Series gets integer labels starting at 0.
s4 = pd.Series(np.arange(0, 5))
print(s4, type(s4), sep='\n')

0    0
1    1
2    2
3    3
4    4
dtype: int32
<class 'pandas.core.series.Series'>


# DataFrames
- A DataFrame is a two-dimensional, labeled data structure, i.e., data is aligned in a tabular fashion in rows and columns.
- A Series is like a single column; a DataFrame is the whole table.

### Creating DataFrame

In [6]:
# Concatenate the two Series end to end; the result shown below is still a
# one-dimensional Series, not yet a DataFrame.
pd.concat([s3,s4])

a    1
b    2
c    3
d    4
e    5
0    0
1    1
2    2
3    3
4    4
dtype: int64

#### If axis=1 the Series are placed side by side as columns (labels are aligned on the index)

In [7]:
# axis=1 places each Series in its own column; a label that exists in only one
# Series shows NaN in the other column (see the output below).
df=pd.concat([s3,s4],axis=1) 
print(df)
print(type(df))                    # the result is a DataFrame

     0    1
a  1.0  NaN
b  2.0  NaN
c  3.0  NaN
d  4.0  NaN
e  5.0  NaN
0  NaN  0.0
1  NaN  1.0
2  NaN  2.0
3  NaN  3.0
4  NaN  4.0
<class 'pandas.core.frame.DataFrame'>


#### If axis=0 the Series are stacked one below the other along the rows

In [8]:
# axis=0 stacks the values of s4 below those of s3.
# NOTE: despite the name `df`, the result is a Series (see the printed type).
df=pd.concat([s3,s4],axis=0) 
print(df)
print(type(df))                    # the result is a Series

a    1
b    2
c    3
d    4
e    5
0    0
1    1
2    2
3    3
4    4
dtype: int64
<class 'pandas.core.series.Series'>


### Locate row
- Pandas uses the `loc` indexer to return one or more rows selected by label.
- `loc` (label-based) and `iloc` (integer-position-based) are indexers, used with square brackets, for selecting and slicing data from a pandas Series or DataFrame.

In [9]:
# `df` is currently the stacked Series, so a single label returns a single value.
print(df.loc['a'])

1


In [10]:
# Here 4 is an index label (it came from s4), not a position.
print(df.loc[4])

4


In [11]:
# Look up the value stored under the label 'd'.
print(df.loc['d'])

4


In [12]:
# Passing a list of labels returns a Series holding just those entries.
print(df.loc[['c', 1]])

c    3
1    1
dtype: int64


In [13]:
# A one-element list of labels still returns a Series (not a scalar).
print(df.loc[['c']])

c    3
dtype: int64


In [14]:
# Build a small DataFrame from a dict of columns, with custom row labels.
data = {
    "calories": [420, 380, 390],
    "duration": [50, 40, 45],
}
df = pd.DataFrame(data, index=["day1", "day2", "day3"])
print(df)
# iloc slices by integer position (0, 1, 2 here), never by cell value.
# The previous `iloc[420:]` started far past the last row and silently
# selected an empty frame; slice from a real position instead.
print(type(df.iloc[1:]))

      calories  duration
day1       420        50
day2       380        40
day3       390        45
<class 'pandas.core.frame.DataFrame'>


In [15]:
# iloc takes integer positions, not cell values: `iloc[420:390]` selected
# nothing. Slice the first two rows by position instead.
print(type(df.iloc[0:2]))

<class 'pandas.core.frame.DataFrame'>


In [16]:
# Rebuild the two-column DataFrame (axis=1 puts each Series in its own column).
df=pd.concat([s3,s4],axis=1) 
print(df)
print(type(df))            # DataFrame
print(type(df.iloc[:,0]))  # a single column selected by position is a Series
# The columns are renamed only after printing, so the output above still shows 0 and 1.
df.columns=['col_a','col_b']       #setting column names

     0    1
a  1.0  NaN
b  2.0  NaN
c  3.0  NaN
d  4.0  NaN
e  5.0  NaN
0  NaN  0.0
1  NaN  1.0
2  NaN  2.0
3  NaN  3.0
4  NaN  4.0
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


### To change row names, assign to the `index` attribute

In [17]:
# Create a DataFrame from a dict that maps column names to value lists.
df = pd.DataFrame(
    data={
        "Name": ['Tom', 'Nick', 'John', 'Peter'],
        "Age": [15, 26, 17, 28],
    }
)
df

Unnamed: 0,Name,Age
0,Tom,15
1,Nick,26
2,John,17
3,Peter,28


In [18]:
# Relabel both axes by assigning new label lists to the axis attributes.
# Row labels first ...
df.index = ['Row_1', 'Row_2', 'Row_3', 'Row_4']
# ... then the column labels (the two assignments are independent).
df.columns = ['Col_1', 'Col_2']
# Display the relabelled frame.
df

Unnamed: 0,Col_1,Col_2
Row_1,Tom,15
Row_2,Nick,26
Row_3,John,17
Row_4,Peter,28


## DataFrame
### * Using list

In [19]:
# Each inner list becomes one row; row and column labels default to 0, 1, 2.
l1 = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 90],
]
df1 = pd.DataFrame(l1)
df1

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,90


In [20]:
# Same nested list, now with explicit row and column labels.
df1 = pd.DataFrame(
    l1,
    index=['row1', 'row2', 'row3'],
    columns=['col1', 'col2', 'col3'],
)
df1

Unnamed: 0,col1,col2,col3
row1,1,2,3
row2,4,5,6
row3,7,8,90


In [21]:
# Column labels first, then row labels.
print(df1.columns, df1.index, sep='\n')

Index(['col1', 'col2', 'col3'], dtype='object')
Index(['row1', 'row2', 'row3'], dtype='object')


### * Using Dictionary

In [22]:
# Dict keys become column names; `index` supplies the row labels.
d2 = {
    'col1': [1, 2, 3],
    'col2': [4, 5, 6],
    'col3': [7, 8, 9],
}
df2 = pd.DataFrame(d2, index=['row1', 'row2', 'row3'])
df2

Unnamed: 0,col1,col2,col3
row1,1,4,7
row2,2,5,8
row3,3,6,9


### * Using numpy array

In [23]:
# Create a DataFrame from a 3x2 NumPy array of uniform random floats in [0, 1).
# A seeded generator makes the values identical on every run, so the displayed
# table is reproducible after Restart & Run All (the values differ from those
# of an unseeded run).
rng = np.random.default_rng(seed=0)
df3 = pd.DataFrame(data=rng.random((3, 2)))
df3

Unnamed: 0,0,1
0,0.57106,0.881459
1,0.504815,0.236672
2,0.609317,0.061808


In [24]:
# Same idea with explicit row/column labels. The generator is seeded so the
# displayed values are reproducible from run to run.
rng = np.random.default_rng(seed=1)
df3 = pd.DataFrame(
    data=rng.random((3, 2)),
    index=['row1', 'row2', 'row3'],
    columns=['col1', 'col2'],
)
df3

Unnamed: 0,col1,col2
row1,0.01903,0.949367
row2,0.95016,0.20559
row3,0.775641,0.225655


In [25]:
# Student records: one dict key per column, one labelled row per student.
d3 = {
    'Name': ['Rahul', 'Sohan', 'Rachana'],
    'Age': [20, 21, 23],
    'Degree': ['bca', 'bba', 'be'],
    'Percentage': [60, 70, 80],
}
data = pd.DataFrame(d3, index=['row1', 'row2', 'row3'])
data

Unnamed: 0,Name,Age,Degree,Percentage
row1,Rahul,20,bca,60
row2,Sohan,21,bba,70
row3,Rachana,23,be,80


In [26]:
# Attribute-style access to a single column.
data.Name

row1      Rahul
row2      Sohan
row3    Rachana
Name: Name, dtype: object

In [27]:
# A single column of a DataFrame is a Series.
type(data.Name)

pandas.core.series.Series

In [28]:
# Bracket access with the column name returns the same Series as data.Name.
print(data['Name'])
type(data['Name'])

row1      Rahul
row2      Sohan
row3    Rachana
Name: Name, dtype: object


pandas.core.series.Series

In [29]:
# Print two columns one after the other; each one is a separate Series.
print(data['Name'])
print(data['Age'])

row1      Rahul
row2      Sohan
row3    Rachana
Name: Name, dtype: object
row1    20
row2    21
row3    23
Name: Age, dtype: int64


In [30]:
# A list of column names (note the double brackets) selects a sub-DataFrame.
print(data[["Name", "Age"]])

         Name  Age
row1    Rahul   20
row2    Sohan   21
row3  Rachana   23


In [31]:
# Select two columns, then slice the first two rows by position (end excluded).
print(data[["Name", "Age"]][0:2])

       Name  Age
row1  Rahul   20
row2  Sohan   21


In [32]:
# An open-ended slice starting at 0 keeps every row.
print(data[["Name", "Age"]][0:])

         Name  Age
row1    Rahul   20
row2    Sohan   21
row3  Rachana   23


In [33]:
# The same row slice applied to a two-column selection and to the whole frame.
print(data[["Name", "Age"]][0:2])
print(data[0:2])

       Name  Age
row1  Rahul   20
row2  Sohan   21
       Name  Age Degree  Percentage
row1  Rahul   20    bca          60
row2  Sohan   21    bba          70


## * Using loc and iloc
- `loc`: label-based selection
- `iloc`: integer-position-based selection

In [34]:
data.loc[:]                   # a bare ':' selects every row, i.e. the whole dataset

Unnamed: 0,Name,Age,Degree,Percentage
row1,Rahul,20,bca,60
row2,Sohan,21,bba,70
row3,Rachana,23,be,80


In [35]:
# All rows, one column; the list form keeps the result a DataFrame.
data.loc[:,["Name"]]

Unnamed: 0,Name
row1,Rahul
row2,Sohan
row3,Rachana


In [36]:
# All rows, two columns chosen by label.
data.loc[:,['Name','Degree']]

Unnamed: 0,Name,Degree
row1,Rahul,bca
row2,Sohan,bba
row3,Rachana,be


In [37]:
data.loc[['row1','row2'],['Name','Degree']]             # pick arbitrary rows and columns by label

Unnamed: 0,Name,Degree
row1,Rahul,bca
row2,Sohan,bba


## iloc:

In [38]:
# First two rows and first two columns by position (slice end is excluded).
data.iloc[0:2,0:2]

Unnamed: 0,Name,Age
row1,Rahul,20
row2,Sohan,21


In [39]:
# Rows at positions 0 and 1, columns at positions 0 and 2.
data.iloc[[0,1],[0,2]]

Unnamed: 0,Name,Degree
row1,Rahul,bca
row2,Sohan,bba


In [40]:
# Same positional selection as the previous cell.
data.iloc[[0,1],[0,2]]

Unnamed: 0,Name,Degree
row1,Rahul,bca
row2,Sohan,bba


In [41]:
# Rename a column. Without inplace=True this returns a renamed copy and
# leaves `data` itself untouched.
data.rename(columns={'Name': 'First Name'})

Unnamed: 0,First Name,Age,Degree,Percentage
row1,Rahul,20,bca,60
row2,Sohan,21,bba,70
row3,Rachana,23,be,80


In [42]:
# Let's check whether our data has changed or not: the column is still called 'Name'.
data                   

Unnamed: 0,Name,Age,Degree,Percentage
row1,Rahul,20,bca,60
row2,Sohan,21,bba,70
row3,Rachana,23,be,80


In [43]:
data.rename({'Name':'First Name'},axis=1,inplace=True)  # inplace=True makes the change permanent on `data`
data

Unnamed: 0,First Name,Age,Degree,Percentage
row1,Rahul,20,bca,60
row2,Sohan,21,bba,70
row3,Rachana,23,be,80


In [44]:
data                      # the rename persisted because inplace=True was used

Unnamed: 0,First Name,Age,Degree,Percentage
row1,Rahul,20,bca,60
row2,Sohan,21,bba,70
row3,Rachana,23,be,80


##  Inserting column

In [45]:
# Insert a new column at position 2 (third column).
# DataFrame.insert raises ValueError when the column already exists, so guard
# the call to keep this cell safe to re-run.
if "phone number" not in data.columns:
    data.insert(2, "phone number", ['93983', '19511', '5843'])
data

Unnamed: 0,First Name,Age,phone number,Degree,Percentage
row1,Rahul,20,93983,bca,60
row2,Sohan,21,19511,bba,70
row3,Rachana,23,5843,be,80


In [46]:
# Add several columns; each assignment appends one column on the right,
# in the order written here.
data['Address'] = ['Hyd', 'Bang', 'Delhi']
data['Fees'] = [2000, 3000, 2500]
data['Corse_opted'] = ['AI', 'Data science', 'Python']
data['Roll_number'] = [123, 256, 489]
data

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Corse_opted,Roll_number
row1,Rahul,20,93983,bca,60,Hyd,2000,AI,123
row2,Sohan,21,19511,bba,70,Bang,3000,Data science,256
row3,Rachana,23,5843,be,80,Delhi,2500,Python,489


## Changing specific value in dataframe:

In [47]:
# Replace every occurrence of the value 'bba' in the frame with 'bca'.
data = data.replace(to_replace='bba', value='bca')
data                        # 'bba' no longer appears in the Degree column

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Corse_opted,Roll_number
row1,Rahul,20,93983,bca,60,Hyd,2000,AI,123
row2,Sohan,21,19511,bca,70,Bang,3000,Data science,256
row3,Rachana,23,5843,be,80,Delhi,2500,Python,489


In [48]:
# Access the first value of the Degree column.
# The index labels are strings ('row1', ...), so a bare [0] relied on pandas
# falling back to positional lookup, which is deprecated; .iloc[0] states
# the positional intent explicitly and returns the same value.
data['Degree'].iloc[0]

'bca'

In [49]:
# Overwrite one cell, addressed by row label and column label.
data.loc['row1', 'Degree'] = 'be'
data

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Corse_opted,Roll_number
row1,Rahul,20,93983,be,60,Hyd,2000,AI,123
row2,Sohan,21,19511,bca,70,Bang,3000,Data science,256
row3,Rachana,23,5843,be,80,Delhi,2500,Python,489


In [50]:
# Dropping a column: axis=1 targets columns (axis=0 would target rows).
# inplace=True modifies `data` directly, so the cell shows no output.
data.drop(['Corse_opted'],axis=1,inplace=True)     

In [51]:
# Confirm that the 'Corse_opted' column is gone.
data

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row1,Rahul,20,93983,be,60,Hyd,2000,123
row2,Sohan,21,19511,bca,70,Bang,3000,256
row3,Rachana,23,5843,be,80,Delhi,2500,489


In [52]:
# Two 2x2 frames that share the same row and column labels.
shared_labels = dict(columns=list('AB'), index=['x', 'y'])
df = pd.DataFrame([[1, 2], [3, 4]], **shared_labels)
df2 = pd.DataFrame([[5, 6], [7, 8]], **shared_labels)
# Print both, separated by a line containing a single space.
print(df, ' ', df2, sep='\n')

   A  B
x  1  2
y  3  4
 
   A  B
x  5  6
y  7  8


In [53]:
# Overwrite Percentage with a missing value in row2 to set up the imputation
# examples. `np.nan` is the canonical spelling; the `np.NaN` alias was removed
# in NumPy 2.0.
data['Percentage'] = [60, np.nan, 30]
data

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row1,Rahul,20,93983,be,60.0,Hyd,2000,123
row2,Sohan,21,19511,bca,,Bang,3000,256
row3,Rachana,23,5843,be,30.0,Delhi,2500,489


In [54]:
#Data preprocessing

# Drop a row by its index label using the drop function.
# The result is only displayed, not assigned back, so `data` keeps row2.
data.drop('row2')

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row1,Rahul,20,93983,be,60.0,Hyd,2000,123
row3,Rachana,23,5843,be,30.0,Delhi,2500,489


In [55]:
# Drop every row that contains a missing value (here: row2); `data` is not modified.
data.dropna()

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row1,Rahul,20,93983,be,60.0,Hyd,2000,123
row3,Rachana,23,5843,be,30.0,Delhi,2500,489


In [56]:
# Dropping row3 by label; the result is displayed only, `data` is not modified.

data.drop('row3')

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row1,Rahul,20,93983,be,60.0,Hyd,2000,123
row2,Sohan,21,19511,bca,,Bang,3000,256


In [57]:
# Add a new record to the bottom of the frame.
# DataFrame.append was removed in pandas 2.0; wrapping the dict in a one-row
# DataFrame and using pd.concat gives the same result. The extra key
# 'hours_study' becomes a new column that is NaN for the existing rows, and
# ignore_index=True renumbers the rows 0..n-1.
# The result is displayed only; `data` itself is not modified.
row3 = {
    'First Name': 'Hemanth',
    'Age': 24,
    'phone number': 123456,
    'Degree': 'Ca',
    'Percentage': 94,
    'Address': 'delhi',
    'Fees': 9852,
    'Roll_number': 52364,
    'hours_study': 3,
}
pd.concat([data, pd.DataFrame([row3])], ignore_index=True)

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number,hours_study
0,Rahul,20,93983,be,60.0,Hyd,2000,123,
1,Sohan,21,19511,bca,,Bang,3000,256,
2,Rachana,23,5843,be,30.0,Delhi,2500,489,
3,Hemanth,24,123456,Ca,94.0,delhi,9852,52364,3.0


### When do we impute nan with mean ??
-- Note that imputing missing data with mean values can only be done with numerical data.

In [58]:
# Impute missing Percentage values with the column mean.
# Passing a {column: value} dict restricts the fill to Percentage; a bare
# scalar would also write the Percentage mean into NaNs of any other column.
data.fillna({'Percentage': data['Percentage'].mean()})

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row1,Rahul,20,93983,be,60.0,Hyd,2000,123
row2,Sohan,21,19511,bca,45.0,Bang,3000,256
row3,Rachana,23,5843,be,30.0,Delhi,2500,489


## When do we impute nan with median ??
-- Note that imputing missing data with median value can only be done with numerical data.

In [59]:
# Impute missing Percentage values with the column median.
# The {column: value} dict keeps the fill confined to the Percentage column.
data.fillna({'Percentage': data['Percentage'].median()})

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row1,Rahul,20,93983,be,60.0,Hyd,2000,123
row2,Sohan,21,19511,bca,45.0,Bang,3000,256
row3,Rachana,23,5843,be,30.0,Delhi,2500,489


## When do we impute nan with mode ??
-- Note that the mode (the most frequent value) is the usual choice for imputing categorical data; it can also be computed for numerical data.

In [61]:
# mode() returns a Series (labelled 0, 1, ...), not a single value. When
# DataFrame.fillna is given a Series it matches the Series labels against the
# column names; none match here, so the NaN stays. Take one value with
# .mode()[0], as done for the Degree column in a later cell.
data.fillna(value=data.Percentage.mode())

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row1,Rahul,20,93983,be,60.0,Hyd,2000,123
row2,Sohan,21,19511,bca,,Bang,3000,256
row3,Rachana,23,5843,be,30.0,Delhi,2500,489


In [62]:
# Backward fill: each NaN takes the next valid value below it.
# bfill() replaces backfill(), which is deprecated since pandas 2.0.
data.bfill()

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row1,Rahul,20,93983,be,60.0,Hyd,2000,123
row2,Sohan,21,19511,bca,30.0,Bang,3000,256
row3,Rachana,23,5843,be,30.0,Delhi,2500,489


In [64]:
data.ffill()          # Forward fill: each NaN takes the last valid value above it

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row1,Rahul,20,93983,be,60.0,Hyd,2000,123
row2,Sohan,21,19511,bca,60.0,Bang,3000,256
row3,Rachana,23,5843,be,30.0,Delhi,2500,489


In [65]:
# None of the fill calls above were assigned back, so `data` still has the NaN.
data

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row1,Rahul,20,93983,be,60.0,Hyd,2000,123
row2,Sohan,21,19511,bca,,Bang,3000,256
row3,Rachana,23,5843,be,30.0,Delhi,2500,489


In [66]:
# Add a record that has no 'Degree' key: that cell becomes NaN in the result.
# DataFrame.append was removed in pandas 2.0, so build a one-row DataFrame and
# concatenate it. The result is displayed only; `data` is not modified.
row4 = {
    'First Name': 'Alaric',
    'Age': 21,
    'phone number': 128856,
    'Percentage': 84,
    'Address': 'Gurgon',
    'Fees': 9052,
    'Roll_number': 51164,
    'hours_study': 4,
}
pd.concat([data, pd.DataFrame([row4])], ignore_index=True)

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number,hours_study
0,Rahul,20,93983,be,60.0,Hyd,2000,123,
1,Sohan,21,19511,bca,,Bang,3000,256,
2,Rachana,23,5843,be,30.0,Delhi,2500,489,
3,Alaric,21,128856,,84.0,Gurgon,9052,51164,4.0


In [67]:
# Fill missing Degree values with the most frequent degree. mode() returns a
# Series, so [0] picks a single value to fill with.
# The appended row above was never assigned back to `data`, so Degree has no
# NaN here and the frame is displayed unchanged.
data['Degree']=data['Degree'].fillna(data['Degree'].mode()[0])
data

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row1,Rahul,20,93983,be,60.0,Hyd,2000,123
row2,Sohan,21,19511,bca,,Bang,3000,256
row3,Rachana,23,5843,be,30.0,Delhi,2500,489


## head()
- The `head()` function is used to get the first n rows.
- It is useful for quickly testing if your object has the right type of data in it.

In [68]:
# Show the first two rows.
data.head(2)

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row1,Rahul,20,93983,be,60.0,Hyd,2000,123
row2,Sohan,21,19511,bca,,Bang,3000,256


## tail()
- The `tail()` function is used to return the last n rows.

In [69]:
# Show the last row.
data.tail(1)

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row3,Rachana,23,5843,be,30.0,Delhi,2500,489


## info()
- The `DataFrame.info()` function is used to get a concise summary of the DataFrame.

In [70]:
# Prints the index range, each column's non-null count and dtype, and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, row1 to row3
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   First Name    3 non-null      object 
 1   Age           3 non-null      int64  
 2   phone number  3 non-null      object 
 3   Degree        3 non-null      object 
 4   Percentage    2 non-null      float64
 5   Address       3 non-null      object 
 6   Fees          3 non-null      int64  
 7   Roll_number   3 non-null      int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 324.0+ bytes


## describe()
- Pandas describe() is used to view some basic statistical details like percentile, mean, std etc. of a data frame or a series of numeric values.

In [71]:
# Summary statistics; by default only the numeric columns are included.
data.describe()

Unnamed: 0,Age,Percentage,Fees,Roll_number
count,3.0,2.0,3.0,3.0
mean,21.333333,45.0,2500.0,289.333333
std,1.527525,21.213203,500.0,185.262876
min,20.0,30.0,2000.0,123.0
25%,20.5,37.5,2250.0,189.5
50%,21.0,45.0,2500.0,256.0
75%,22.0,52.5,2750.0,372.5
max,23.0,60.0,3000.0,489.0


In [72]:
# include='all' also summarises the non-numeric columns (unique, top, freq).
data.describe(include='all')

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
count,3,3.0,3.0,3,2.0,3,3.0,3.0
unique,3,,3.0,2,,3,,
top,Rahul,,5843.0,be,,Delhi,,
freq,1,,1.0,2,,1,,
mean,,21.333333,,,45.0,,2500.0,289.333333
std,,1.527525,,,21.213203,,500.0,185.262876
min,,20.0,,,30.0,,2000.0,123.0
25%,,20.5,,,37.5,,2250.0,189.5
50%,,21.0,,,45.0,,2500.0,256.0
75%,,22.0,,,52.5,,2750.0,372.5


In [73]:
data.isnull()
# Returns a frame of booleans: True wherever there is a null value.

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row1,False,False,False,False,False,False,False,False
row2,False,False,False,False,True,False,False,False
row3,False,False,False,False,False,False,False,False


In [74]:
# Without the call parentheses this only shows the bound method, not the
# counts; the next cell calls sum() to get the per-column totals.
data.isnull().sum

<bound method NDFrame._add_numeric_operations.<locals>.sum of       First Name    Age  phone number  Degree  Percentage  Address   Fees  \
row1       False  False         False   False       False    False  False   
row2       False  False         False   False        True    False  False   
row3       False  False         False   False       False    False  False   

      Roll_number  
row1        False  
row2        False  
row3        False  >

In [75]:
# Number of null values in each column.
data.isnull().sum()

First Name      0
Age             0
phone number    0
Degree          0
Percentage      1
Address         0
Fees            0
Roll_number     0
dtype: int64

In [76]:
# isna() produces the same boolean mask as isnull() above.
data.isna()

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row1,False,False,False,False,False,False,False,False
row2,False,False,False,False,True,False,False,False
row3,False,False,False,False,False,False,False,False


In [77]:
# Per-column count of missing values, this time via isna().
data.isna().sum()

First Name      0
Age             0
phone number    0
Degree          0
Percentage      1
Address         0
Fees            0
Roll_number     0
dtype: int64

In [78]:
# Current state of the dataset before filtering.
data

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row1,Rahul,20,93983,be,60.0,Hyd,2000,123
row2,Sohan,21,19511,bca,,Bang,3000,256
row3,Rachana,23,5843,be,30.0,Delhi,2500,489


In [80]:
# Element-wise comparison gives a boolean Series; the missing value in row2
# compares as False.
data['Percentage']>=60

row1     True
row2    False
row3    False
Name: Percentage, dtype: bool

In [81]:
# Positional row slice: the first row only.
data[0:1]

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row1,Rahul,20,93983,be,60.0,Hyd,2000,123


In [83]:
# Positional row slice: the second row only.
data[1:2]

Unnamed: 0,First Name,Age,phone number,Degree,Percentage,Address,Fees,Roll_number
row2,Sohan,21,19511,bca,,Bang,3000,256


In [84]:
# Filtering rows with a boolean mask:
# people who scored at least 60 AND whose address is "Hyd".
# Each condition needs its own parentheses when combined with & or |.
newdata = data[(data.Percentage >= 60) & (data.Address == "Hyd")]              ##Using AND(&)
print(newdata)

     First Name  Age phone number Degree  Percentage Address  Fees  \
row1      Rahul   20        93983     be        60.0     Hyd  2000   

      Roll_number  
row1          123  


In [85]:
# Rows where Percentage is at most 60 OR the address is "Hyd"; row2 matches
# neither condition (its Percentage is missing) and is left out.
newdata = data[(data.Percentage <= 60) | (data.Address == "Hyd")]               ##Using OR(|)
print(newdata)

     First Name  Age phone number Degree  Percentage Address  Fees  \
row1      Rahul   20        93983     be        60.0     Hyd  2000   
row3    Rachana   23         5843     be        30.0   Delhi  2500   

      Roll_number  
row1          123  
row3          489  


In [86]:
# Count the missing values per column, then keep only the columns
# that actually have at least one.

count_nan = data.isnull().sum()
print(count_nan.loc[count_nan.gt(0)])

Percentage    1
dtype: int64
