# Importing the library

In [88]:
import pandas as pd

### `pd` is the conventional alias for the pandas library; it makes the library shorter to refer to.

***

## How to make a dataframe in pandas?

In [89]:
pd.DataFrame({'column1':[1, 2, 3, 4, 5], 'column2':[1, 3, 5, 7, 9], 'column3':[2, 4, 6, 8, 10]})

Unnamed: 0,column1,column2,column3
0,1,1,2
1,2,3,4
2,3,5,6
3,4,7,8
4,5,9,10


### To reuse the dataframe later, we save it in a variable

In [90]:
# Map each column name to its list of values (all lists have the same
# length), then bind the resulting frame to `data` so later cells can reuse it.
column_values = {
    'column1': [1, 2, 3, 4, 5],
    'column2': [1, 3, 5, 7, 9],
    'column3': [2, 4, 6, 8, 10],
}
data = pd.DataFrame(column_values)

### The dataframe is stored in a variable named 'data'

In [91]:
data

Unnamed: 0,column1,column2,column3
0,1,1,2
1,2,3,4
2,3,5,6
3,4,7,8
4,5,9,10


### This dataframe has only 5 rows in total. However, in real-life situations the data is rarely this small. So how do we manage huge dataframes?

In [92]:
# A longer, 23-row frame used to demonstrate inspecting bigger data.
# All three columns hold the same sequence of values, so the sequence is
# written once and reused under each column name.
repeated_values = [1, 2, 3, 4, 5, 5, 6, 2, 5, 9, 2, 4, 9, 4, 2, 5, 4, 4, 1, 6, 1, 2, 3]
data1 = pd.DataFrame(
    {name: repeated_values for name in ('column1', 'column2', 'column3')}
)

In [93]:
data1

Unnamed: 0,column1,column2,column3
0,1,1,1
1,2,2,2
2,3,3,3
3,4,4,4
4,5,5,5
5,5,5,5
6,6,6,6
7,2,2,2
8,5,5,5
9,9,9,9


***

### How would you view such a large dataframe?

### To view the top 5 rows 

In [94]:
data.head()

Unnamed: 0,column1,column2,column3
0,1,1,2
1,2,3,4
2,3,5,6
3,4,7,8
4,5,9,10


### To view the bottom 5 rows 

In [95]:
data.tail()

Unnamed: 0,column1,column2,column3
0,1,1,2
1,2,3,4
2,3,5,6
3,4,7,8
4,5,9,10


### To view the top 2 rows 

In [96]:
data.head(2)

Unnamed: 0,column1,column2,column3
0,1,1,2
1,2,3,4


### To view the bottom 6 rows 

In [97]:
data.tail(6)

Unnamed: 0,column1,column2,column3
0,1,1,2
1,2,3,4
2,3,5,6
3,4,7,8
4,5,9,10


### To find out about the number of rows and columns of data

In [98]:
data.shape

(5, 3)

### The describe function gives us the numerical statistics of the data

In [99]:
data.describe()

Unnamed: 0,column1,column2,column3
count,5.0,5.0,5.0
mean,3.0,5.0,6.0
std,1.581139,3.162278,3.162278
min,1.0,1.0,2.0
25%,2.0,3.0,4.0
50%,3.0,5.0,6.0
75%,4.0,7.0,8.0
max,5.0,9.0,10.0


### The info function summarises the columns of the data: their names, non-null counts and dtypes

In [100]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   column1  5 non-null      int64
 1   column2  5 non-null      int64
 2   column3  5 non-null      int64
dtypes: int64(3)
memory usage: 252.0 bytes


***

## Series in Pandas

In [101]:
# A Series is a single labelled column; here it holds the integers 1 through 5
# with the default 0-based index.
series = pd.Series(range(1, 6))

In [102]:
series

0    1
1    2
2    3
3    4
4    5
dtype: int64

***

In [103]:
data

Unnamed: 0,column1,column2,column3
0,1,1,2
1,2,3,4
2,3,5,6
3,4,7,8
4,5,9,10


### To view a specific column in our data

In [104]:
data['column1']

0    1
1    2
2    3
3    4
4    5
Name: column1, dtype: int64

In [105]:
data['column2']

0    1
1    3
2    5
3    7
4    9
Name: column2, dtype: int64

### To view several selected columns from our data, we have to pass a list of column names

In [106]:
data[['column1', 'column2']]

Unnamed: 0,column1,column2
0,1,1
1,2,3
2,3,5
3,4,7
4,5,9


In [107]:
data

Unnamed: 0,column1,column2,column3
0,1,1,2
1,2,3,4
2,3,5,6
3,4,7,8
4,5,9,10


***

### To view a specific row of our data

In [108]:
data.iloc[0]

column1    1
column2    1
column3    2
Name: 0, dtype: int64

In [109]:
data.loc[1]

column1    2
column2    3
column3    4
Name: 1, dtype: int64

***

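## What is the difference between 'iloc' and 'loc'?

`iloc` selects rows by integer position, while `loc` selects them by index label. With the default 0-based index used here the two coincide, which is why `data.iloc[0]` and `data.loc[1]` above return the first and second rows.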

***

### To find out about the null values of our data

In [110]:
data.isnull()

Unnamed: 0,column1,column2,column3
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False


### To count the null values in each column of the data

In [111]:
data.isnull().sum()

column1    0
column2    0
column3    0
dtype: int64

### To count the total number of null values in the whole dataframe

In [112]:
data.isnull().sum().sum()

0

***

## Practice

In [113]:
# Practice data with some entries deliberately left as None, so the
# null-counting calls in the following cells have something to find.
d = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    Age=[25, None, 30, 35, None],
    City=['New York', 'Los Angeles', None, 'Chicago', 'Houston'],
    Score=[85.0, 90.5, None, 88.0, 92.0],
)

# Each key becomes a column; each list supplies that column's values.
data2 = pd.DataFrame(data=d)

In [114]:
data2

Unnamed: 0,Name,Age,City,Score
0,Alice,25.0,New York,85.0
1,Bob,,Los Angeles,90.5
2,Charlie,30.0,,
3,David,35.0,Chicago,88.0
4,Eve,,Houston,92.0


In [115]:
data2.isnull()

Unnamed: 0,Name,Age,City,Score
0,False,False,False,False
1,False,True,False,False
2,False,False,True,True
3,False,False,False,False
4,False,True,False,False


In [116]:
data2.isnull().sum()

Name     0
Age      2
City     1
Score    1
dtype: int64

In [117]:
data2.isnull().sum().sum()

4

***

## Practice questions

## Create a DataFrame from a dictionary of lists.

In [118]:
# Practice frame: a dict of three equal-length lists (10 values each),
# where every key becomes a column name.
practice_columns = {
    'mycol1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'mycol2': [12, 23, 45, 23, 23, 90, 78, 67, 56, 45],
    'mycol3': [7, 6, 5, 0, 8, 2, 12, 34, 56, 6],
}
pandas_dataframe = pd.DataFrame(practice_columns)

In [119]:
pandas_dataframe

Unnamed: 0,mycol1,mycol2,mycol3
0,1,12,7
1,2,23,6
2,3,45,5
3,4,23,0
4,5,23,8
5,6,90,2
6,7,78,12
7,8,67,34
8,9,56,56
9,10,45,6


## Select the first 3 rows of a DataFrame.

In [120]:
pandas_dataframe.head(3)

Unnamed: 0,mycol1,mycol2,mycol3
0,1,12,7
1,2,23,6
2,3,45,5


## Select the 'mycol2' column from a DataFrame.

In [121]:
pandas_dataframe['mycol2']

0    12
1    23
2    45
3    23
4    23
5    90
6    78
7    67
8    56
9    45
Name: mycol2, dtype: int64

## Filter rows based on a column condition.

In [122]:
# Build a boolean mask (True where mycol1 is greater than 5), then keep only
# the rows where the mask is True; the original index labels are preserved.
exceeds_five = pandas_dataframe['mycol1'] > 5
pandas_dataframe.loc[exceeds_five]

Unnamed: 0,mycol1,mycol2,mycol3
5,6,90,2
6,7,78,12
7,8,67,34
8,9,56,56
9,10,45,6


## Add a new column to an existing DataFrame.

In [123]:
pandas_dataframe

Unnamed: 0,mycol1,mycol2,mycol3
0,1,12,7
1,2,23,6
2,3,45,5
3,4,23,0
4,5,23,8
5,6,90,2
6,7,78,12
7,8,67,34
8,9,56,56
9,10,45,6


In [124]:
# Values for the new column, one per existing row (10 in total).
mycol4_values = [34, 45, 67, 45, 34, 12, 12, 56, 12, 11]

# Assigning to a column name that does not exist yet appends it as a new column.
pandas_dataframe['mycol4'] = mycol4_values

In [125]:
pandas_dataframe

Unnamed: 0,mycol1,mycol2,mycol3,mycol4
0,1,12,7,34
1,2,23,6,45
2,3,45,5,67
3,4,23,0,45
4,5,23,8,34
5,6,90,2,12
6,7,78,12,12
7,8,67,34,56
8,9,56,56,12
9,10,45,6,11


## Remove a column from a DataFrame.

In [126]:
# Remove the column by name and rebind the result rather than mutating in place.
# `columns=` states the axis explicitly, and errors='ignore' lets this cell be
# re-run after 'mycol4' is already gone without raising KeyError.
pandas_dataframe = pandas_dataframe.drop(columns='mycol4', errors='ignore')

In [127]:
pandas_dataframe

Unnamed: 0,mycol1,mycol2,mycol3
0,1,12,7
1,2,23,6
2,3,45,5
3,4,23,0
4,5,23,8
5,6,90,2
6,7,78,12
7,8,67,34
8,9,56,56
9,10,45,6


## Sort a DataFrame by a column.