<a href="https://colab.research.google.com/github/RaunakRaj2081/python_Language/blob/main/9_pandas_tutorial_in_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas Library

🧠 What is Pandas?

Pandas is a Python library used for data analysis and data manipulation. It helps you to load, clean, analyze, and visualize data easily using tables (called DataFrames).

✅ Why Use Pandas?

1. Handles data in rows and columns (like Excel or CSV).

2. Easy to filter, sort, group, and analyze.

3. Can handle large datasets efficiently.

4. Useful for cleaning dirty data (missing values, duplicates, etc.).

5. Works well with NumPy and other libraries.

📄 Main Pandas Data Types

Series → 1D labeled array (like a column)

DataFrame → 2D labeled data table (like an Excel sheet)

In [1]:
# importing the pandas library
import pandas as pd
import numpy as np


# Create a Series

In [2]:
# A Series is a 1-D labelled array; with no index given, pandas labels the values 0..n-1.
data = pd.Series(data=[10, 20, 30, 40])
print(data)

0    10
1    20
2    30
3    40
dtype: int64


# Creating a pandas DataFrame

In [3]:
# Column-oriented input: each key becomes a column label and each list holds
# that column's values (all lists must have the same length).
info = dict(
    Name=["Alice", "Bob", "Charlie"],
    Age=[25, 30, 35],
    City=["Delhi", "Mumbai", "Chennai"],
)

df = pd.DataFrame(data=info)
print(df)

      Name  Age     City
0    Alice   25    Delhi
1      Bob   30   Mumbai
2  Charlie   35  Chennai


In [4]:
from sklearn.datasets import fetch_california_housing

In [5]:
# NOTE: despite the `boston_` prefix, this loads the California housing data
# (fetch_california_housing). The name is kept because later cells refer to it.
boston_dataset = fetch_california_housing()
# Show what kind of object the loader returned.
type(boston_dataset)

In [None]:
# Prints the whole returned object (data array, target, feature_names, DESCR text, ...),
# so the output is long.
print(boston_dataset)

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]]), 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]), 'frame': None, 'target_names': ['MedHouseVal'], 'feature_names': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'], 'DESCR': '.. _california_housing_dataset:\n\nCalifornia Housing dataset\n-

In [9]:
# Wrap the feature matrix in a DataFrame, using the dataset's feature names as column labels.
boston_df = pd.DataFrame(
    data=boston_dataset.data,
    columns=boston_dataset.feature_names,
)

In [None]:
# Preview the first rows of the DataFrame (head() shows 5 rows by default).
boston_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [None]:
# A cell only echoes its last expression, so the shape must be printed
# explicitly; otherwise it is computed and silently discarded.
print(boston_df.shape)  # (rows, columns)
type(boston_df)

# Importing the data from a CSV file to a pandas DataFrame

In [None]:
# Read the CSV file into a DataFrame. The absolute /content/ path is presumably
# the Colab runtime's file area — adjust it when running elsewhere.
diabetes_df = pd.read_csv("/content/diabetes.csv")

In [None]:
# Confirm that read_csv produced a pandas DataFrame.
type(diabetes_df)

In [None]:
# Preview the first rows of the diabetes data.
diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# (number of rows, number of columns)
diabetes_df.shape

(768, 9)

Loading the data from an Excel file to a pandas DataFrame:

pd.read_excel('file path')

# Exporting a DataFrame to a CSV file

In [10]:
# Write the DataFrame to 'boston_data.csv' (relative path, so it lands in the
# current working directory). With the default index=True the row index is
# saved as an extra first column; pass index=False to omit it.
boston_df.to_csv('boston_data.csv')

Exporting the pandas DataFrame to an Excel file:

df.to_excel('filename')

# Creating a DataFrame with random values

In [None]:
# 20 rows x 10 columns of uniform random floats in [0, 1).
# A seeded Generator makes the values reproducible on every re-run;
# the unseeded np.random.rand gave different values each time.
random_df = pd.DataFrame(np.random.default_rng(seed=42).random((20, 10)))

In [None]:
# Preview the first rows; the column labels default to the integers 0..9.
random_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.485932,0.963021,0.124189,0.929873,0.082726,0.932521,0.901457,0.499048,0.600016,0.218564
1,0.345257,0.799451,0.814221,0.742259,0.849183,0.045195,0.2738,0.669034,0.480755,0.179714
2,0.510572,0.160479,0.669917,0.20943,0.768443,0.109334,0.721884,0.258719,0.300922,0.58887
3,0.970947,0.788355,0.284823,0.823837,0.334782,0.937458,0.283345,0.380418,0.721629,0.952777
4,0.724343,0.446835,0.693337,0.135379,0.738803,0.044681,0.903897,0.02076,0.252644,0.986456


In [None]:
# (number of rows, number of columns)
random_df.shape

(20, 10)

# Inspecting a DataFrame

In [11]:
# Number of rows and columns, returned as a (rows, columns) tuple
boston_df.shape

(20640, 8)

In [12]:
# Column labels of the DataFrame
boston_df.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude'],
      dtype='object')

In [None]:
# First 5 rows of the DataFrame (pass a number to head() to show more or fewer)
boston_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [None]:
# Last 5 rows of the DataFrame
boston_df.tail()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.17192,741.0,2.123209,39.43,-121.32
20639,2.3886,16.0,5.254717,1.162264,1387.0,2.616981,39.37,-121.24


In [None]:
# Summary of the DataFrame: index range, column names, non-null counts, dtypes and memory usage
boston_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


In [None]:
# Count the missing entries in each column (isna() is the same method as isnull()).
boston_df.isna().sum()

Unnamed: 0,0
MedInc,0
HouseAge,0
AveRooms,0
AveBedrms,0
Population,0
AveOccup,0
Latitude,0
Longitude,0


In [None]:
# Switch to the diabetes DataFrame and preview its first rows
diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Count how many rows carry each distinct value of the 'Outcome' label column
diabetes_df.value_counts('Outcome')

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


In [None]:
# Group the rows by 'Outcome' and compute the mean of every other column within each group
diabetes_df.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


# Statistical Measures

In [None]:
# count or number of values
# (one result per column; only non-null values are counted)
boston_df.count()

Unnamed: 0,0
MedInc,20640
HouseAge,20640
AveRooms,20640
AveBedrms,20640
Population,20640
AveOccup,20640
Latitude,20640
Longitude,20640


In [None]:
# Mean value of each column (column-wise is the default)
boston_df.mean()

Unnamed: 0,0
MedInc,3.870671
HouseAge,28.639486
AveRooms,5.429
AveBedrms,1.096675
Population,1425.476744
AveOccup,3.070655
Latitude,35.631861
Longitude,-119.569704


In [None]:
# Standard deviation of each column (pandas uses the sample standard deviation, ddof=1, by default)
boston_df.std()

Unnamed: 0,0
MedInc,1.899822
HouseAge,12.585558
AveRooms,2.474173
AveBedrms,0.473911
Population,1132.462122
AveOccup,10.38605
Latitude,2.135952
Longitude,2.003532


In [None]:
# Minimum value of each column
boston_df.min()

Unnamed: 0,0
MedInc,0.4999
HouseAge,1.0
AveRooms,0.846154
AveBedrms,0.333333
Population,3.0
AveOccup,0.692308
Latitude,32.54
Longitude,-124.35


In [None]:
# Maximum value of each column
boston_df.max()

Unnamed: 0,0
MedInc,15.0001
HouseAge,52.0
AveRooms,141.909091
AveBedrms,34.066667
Population,35682.0
AveOccup,1243.333333
Latitude,41.95
Longitude,-114.31


In [None]:
# All of the above in one table: count, mean, std, min, quartiles (25%, 50%, 75%) and max per column
boston_df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


# Manipulating a DataFrame

In [None]:
# Adding a column to a DataFrame: store the dataset's target array as 'Price'
# (the dataset's target_names lists it as 'MedHouseVal').
# This assignment modifies boston_df in place.
boston_df['Price'] = boston_dataset.target

In [None]:
# Check that the new 'Price' column now appears as the last column.
boston_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
# Removing a row: drop() returns a new DataFrame without the row labelled 0;
# boston_df itself is not changed.
# The `index=` keyword already targets rows (and `columns=` targets columns),
# so no `axis` argument is needed. `axis` only matters with the positional
# `labels` form, e.g. drop(0, axis=0) for a row or drop('col', axis=1) for a column.
boston_df.drop(index=0)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25,2.697
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [None]:
# Drop a column: `columns=` already targets the column axis, so `axis` is not needed.
boston_df.drop(columns='AveBedrms')
# drop() does not modify boston_df; it returns a new DataFrame without the
# row/column. To keep the reduced version, assign the result to a variable,
# e.g. trimmed_df = boston_df.drop(columns='AveBedrms').

Unnamed: 0,MedInc,HouseAge,AveRooms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,741.0,2.123209,39.43,-121.32,0.847


In [None]:
# Locating a row by integer position: iloc[2] is the third row
# (use .loc to select by index label instead).
boston_df.iloc[2]

Unnamed: 0,2
MedInc,7.2574
HouseAge,52.0
AveRooms,8.288136
AveBedrms,1.073446
Population,496.0
AveOccup,2.80226
Latitude,37.85
Longitude,-122.24
Price,3.521


In [None]:
# Locate columns by integer position: first, second, third and last (-1) column.
for column_position in (0, 1, 2, -1):
    print(boston_df.iloc[:, column_position])

0        8.3252
1        8.3014
2        7.2574
3        5.6431
4        3.8462
          ...  
20635    1.5603
20636    2.5568
20637    1.7000
20638    1.8672
20639    2.3886
Name: MedInc, Length: 20640, dtype: float64
0        41.0
1        21.0
2        52.0
3        52.0
4        52.0
         ... 
20635    25.0
20636    18.0
20637    17.0
20638    18.0
20639    16.0
Name: HouseAge, Length: 20640, dtype: float64
0        6.984127
1        6.238137
2        8.288136
3        5.817352
4        6.281853
           ...   
20635    5.045455
20636    6.114035
20637    5.205543
20638    5.329513
20639    5.254717
Name: AveRooms, Length: 20640, dtype: float64
0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: Price, Length: 20640, dtype: float64


# Sorting Data


In [16]:
# Sort by HouseAge, smallest first (ascending=True is the default), then preview the top rows.
ps = boston_df.sort_values(by="HouseAge", ascending=True)
ps.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
12286,1.625,1.0,3.0,1.0,8.0,4.0,33.86,-116.95
19536,4.25,1.0,20.125,2.928571,402.0,3.589286,37.65,-120.93
3130,4.875,1.0,5.533333,1.0,32.0,2.133333,35.08,-117.95
18972,5.2636,1.0,7.69403,1.279851,872.0,3.253731,38.23,-122.0
12285,3.0417,2.0,7.040816,1.263265,1950.0,1.989796,33.86,-116.89


In [17]:
# Descending order: largest HouseAge first.
nd = boston_df.sort_values(by="HouseAge", ascending=False)
nd.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25
31,1.9615,52.0,4.882086,1.090703,1168.0,2.648526,37.84,-122.28


# Correlation

1. Positive Correlation -> as one value increases, the other also increases
2. Negative Correlation -> as one value increases, the other decreases

In [None]:
# Pairwise correlation between all columns (Pearson by default); values range from -1 to 1.
boston_df.corr()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.688075
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,0.105623
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.151948
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.046701
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,-0.02465
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,-0.023737
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.14416
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.045967
Price,0.688075,0.105623,0.151948,-0.046701,-0.02465,-0.023737,-0.14416,-0.045967,1.0


# Merging / Joining DataFrames

In [19]:
# Two small tables that share the key column "ID".
df1 = pd.DataFrame({"ID": [1, 2, 3], "Name": ["A", "B", "C"]})
df2 = pd.DataFrame({"ID": [1, 2, 3], "Marks": [80, 90, 85]})

# Join the tables on "ID"; the default how="inner" keeps only IDs present in both.
merged = df1.merge(df2, on="ID")
print(merged)

   ID Name  Marks
0   1    A     80
1   2    B     90
2   3    C     85
