### Loading a .csv file into a pandas DataFrame

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the Iris data set from the CSV file in the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="Iris.csv")

### Explore the data

In [3]:
df.head() # First 5 rows; head() returns 5 rows when called without an argument

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
df.tail() # Last 5 rows; tail() returns 5 rows when called without an argument

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica


In [5]:
df.head(3) # First 3 rows; pass any row count in place of 3

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa


In [0]:
df.tail(3) # Last 3 rows; pass any row count in place of 3

In [6]:
# Concise summary of df: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [7]:
df.describe() # Summary statistics (count, mean, std, min, quartiles, max); only the numeric columns are included here

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


### How many rows and how many columns

In [8]:
df.shape # (number of rows, number of columns)

(150, 6)

### How many Rows

In [9]:
# Number of rows: first entry of the (rows, columns) shape tuple
df.shape[0]

150

In [10]:
# Number of rows, via the length of the row index
len(df.index)

150

### How many columns

In [11]:
# Number of columns, via the length of the column index
len(df.columns)

6

In [12]:
# Number of columns: second entry of the (rows, columns) shape tuple
df.shape[1]

6

In [13]:
# Column names as a plain Python list
list(df.columns)

['Id',
 'SepalLengthCm',
 'SepalWidthCm',
 'PetalLengthCm',
 'PetalWidthCm',
 'Species']

### Accessing the columns 

#### With Names 

In [14]:
df['Id'].head() # Select one column by name; head() is used here only to limit the printed output

0    1
1    2
2    3
3    4
4    5
Name: Id, dtype: int64

#### With Names and dot notation

In [0]:
df.Id.head() # Same column via dot (attribute) notation; head() limits the printed output

#### With integer position and the iloc indexer

In [0]:
# Select a column by integer position with iloc: ':' keeps every row,
# and position 0 is the first column.
df.iloc[:, 0].head()

### Accessing the multiple columns

In [0]:
# Select several columns at once. Note the double brackets: the inner pair is a
# list of column names, so any list of columns can be supplied instead.
df[['SepalLengthCm', 'SepalWidthCm']].head()

### Accessing the rows 

In [0]:
# Access the row at integer position 23 (positions start at 0, so this is the 24th row);
# a single row selected this way is returned as a Series
df.iloc[23,:]

In [0]:
df.loc[[23]] # loc selects by index label; passing a list ([23]) returns a one-row DataFrame

In [0]:
df.loc[[23,30]] # Rows with index labels 23 and 30

In [0]:
df.iloc[23:28,] # Rows at positions 23 through 27; the end of an iloc slice (28) is excluded

### Conditional Filtering of the data

In [0]:
# Get all the rows where Species == 'Iris-setosa' (the comparison is case-sensitive)
df[df.Species == 'Iris-setosa'].head() # Boolean mask built with dot notation inside df[...]; head() added to limit the output

In [0]:
df[df['Species'] == 'Iris-setosa'].head() # Same filter, with the column selected by bracket notation

In [0]:
# Rows where SepalWidthCm >= 3 and Species == 'Iris-setosa'.
# Conditions are combined with &; each one needs its own parentheses because
# & binds more tightly than == and >=. Further conditions can be chained the same way.
df[(df['Species'] == 'Iris-setosa') & (df['SepalWidthCm'] >= 3.0)].head()

#### Adding a new column to the data frame 

In [0]:
# Add two columns, SepalRatio and PetalRatio, each computed as length / width

In [0]:
# SepalRatio = sepal length / sepal width, computed element-wise for every row
df['SepalRatio'] = df['SepalLengthCm']/df['SepalWidthCm']

In [0]:
# PetalRatio = petal length / petal width; the source columns are read with dot notation here
df['PetalRatio'] = df.PetalLengthCm/df.PetalWidthCm

In [0]:
df.head() # Check that SepalRatio and PetalRatio were added

### Removing a column

In [0]:
# Remove the SepalRatio column with the drop() method
# axis = 1 tells drop() that the label refers to a column
# The result is a new DataFrame; df itself is not modified by this call
df.drop(['SepalRatio'], axis = 1).head()

In [0]:
# drop() returns a new DataFrame, so df still contains SepalRatio here.
# The change only takes effect if the result is assigned back to df or inplace = True is passed.
df.head()

In [0]:
# inplace=True makes drop() modify df itself; the call returns None, so nothing is reassigned.
# errors='ignore' skips a label that is already gone, so re-running this cell does not raise KeyError.
df.drop(columns=['SepalRatio'], inplace=True, errors='ignore')

In [0]:
df.head() # SepalRatio is no longer among the columns

In [0]:
# Add two dummy columns; assigning a scalar fills every row with that value
df['dummy1'] = 0
df['dummy2'] = 1

In [0]:
df.head() # Check that dummy1 and dummy2 were added

In [0]:
# Remove both dummy columns in one go by passing a list of labels.
# inplace=True modifies df itself; errors='ignore' skips labels that are already gone,
# so re-running this cell does not raise KeyError.
df.drop(columns=['dummy1', 'dummy2'], inplace=True, errors='ignore')

In [0]:
df.head() # dummy1 and dummy2 are no longer among the columns