Pandas is a powerful Python library used for data manipulation and analysis, offering easy-to-use data structures like Series (one-dimensional) and DataFrame (two-dimensional). It allows users to load data from various formats such as CSV and Excel, clean and filter data, perform statistical operations, and analyze datasets efficiently. Widely used in data science and machine learning, Pandas simplifies handling structured data and supports tasks like grouping, merging, and time series analysis.

In [31]:
import pandas as pd

In [None]:
# Loading the California housing dataset through scikit-learn's fetcher
from sklearn.datasets import fetch_california_housing
california_housing_dataset = fetch_california_housing()
# Show the type of the object the fetcher returned
type(california_housing_dataset)


In [None]:
# Print the raw dataset object returned by the fetcher
print(california_housing_dataset)

In [None]:
# Build a DataFrame from the feature matrix, labelling each column
# with the matching feature name from the dataset
california_housing_df = pd.DataFrame(
    data=california_housing_dataset.data,
    columns=california_housing_dataset.feature_names,
)
# Preview the first rows
california_housing_df.head()

In [None]:
# Dimensions of the DataFrame as a (rows, columns) tuple
california_housing_df.shape

In [None]:
# Show the Python type of the converted object
type(california_housing_df)

In [36]:
# Importing the data from a CSV file to a pandas DataFrame
# NOTE(review): '/content/...' is an absolute path (presumably a Colab
# workspace - confirm); adjust it when running the notebook elsewhere.
diabetes_df = pd.read_csv('/content/diabetes.csv')

In [None]:
# Show the Python type of the object loaded from the CSV file
type(diabetes_df)

In [None]:
# Preview the first rows of the diabetes data
diabetes_df.head()

In [40]:
# Exporting the DataFrame to a CSV file (relative path, so it is written to
# the current working directory). By default the row index is written as an
# extra leading column; pass index=False to leave it out.
california_housing_df.to_csv('california.csv')

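Exporting a DataFrame to an Excel file (here `df` stands for any DataFrame; the file name needs an Excel extension such as `.xlsx`):
df.to_excel('filename.xlsx')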

In [None]:
# Build a 20 x 10 DataFrame filled with random values
import numpy as np

random_values = np.random.rand(20, 10)
random_df = pd.DataFrame(random_values)
# Preview the first rows
random_df.head()

In [None]:
# A notebook cell only echoes its final expression, so the shape is printed
# explicitly; as a bare expression on a non-final line it was never displayed.
print(random_df.shape)
# Show the Python type of the object (echoed as the cell's last expression)
type(random_df)

In [None]:
# Inspecting a DataFrame
# Show the last five rows of the DataFrame
california_housing_df.tail()

In [None]:
# Print a summary of the DataFrame: columns, non-null counts and dtypes
california_housing_df.info()

In [None]:
# Finding the number of missing values in each column
california_housing_df.isnull().sum()

In [None]:
# Preview the first rows of the diabetes DataFrame
diabetes_df.head()

In [None]:
# Counting how many rows carry each value of the 'Outcome' label
diabetes_df.value_counts('Outcome')

In [None]:
# Group the rows by 'Outcome' and take the mean of the remaining columns
# within each group
diabetes_df.groupby('Outcome').mean()

# Statistical Measures

In [58]:
# Number of non-missing values in each column

california_housing_df.count()

Unnamed: 0,0
MedInc,20640
HouseAge,20640
AveRooms,20640
AveBedrms,20640
Population,20640
AveOccup,20640
Latitude,20640
Longitude,20640


In [59]:
# Mean value of each column (computed column-wise)
california_housing_df.mean()


Unnamed: 0,0
MedInc,3.870671
HouseAge,28.639486
AveRooms,5.429
AveBedrms,1.096675
Population,1425.476744
AveOccup,3.070655
Latitude,35.631861
Longitude,-119.569704


In [60]:
# Standard deviation of each column (computed column-wise)

california_housing_df.std()

Unnamed: 0,0
MedInc,1.899822
HouseAge,12.585558
AveRooms,2.474173
AveBedrms,0.473911
Population,1132.462122
AveOccup,10.38605
Latitude,2.135952
Longitude,2.003532


In [63]:
# Smallest value found in each column

california_housing_df.min()

Unnamed: 0,0
MedInc,0.4999
HouseAge,1.0
AveRooms,0.846154
AveBedrms,0.333333
Population,3.0
AveOccup,0.692308
Latitude,32.54
Longitude,-124.35


In [64]:
# Largest value found in each column
california_housing_df.max()

Unnamed: 0,0
MedInc,15.0001
HouseAge,52.0
AveRooms,141.909091
AveBedrms,34.066667
Population,35682.0
AveOccup,1243.333333
Latitude,41.95
Longitude,-114.31


In [65]:
# All the summary statistics at once: count, mean, std, min, quartiles
# (25%, 50%, 75%) and max for every column
california_housing_df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


# Manipulating a dataframe

In [68]:
# Adding a column to the DataFrame: the dataset's target values are stored
# under the new name 'Price' (this modifies california_housing_df in place)
california_housing_df['Price'] = california_housing_dataset.target

In [None]:
# Preview the first rows of the DataFrame
california_housing_df.head()

In [None]:
# Removing a row from the DataFrame by its index label.
# `index=` already targets rows, so the redundant `axis=0` is left out.
# drop() returns a new DataFrame; california_housing_df itself is unchanged
# because the result is not assigned back.
california_housing_df.drop(index=0)

In [None]:
# Removing a column from the DataFrame by name.
# `columns=` already targets columns, so the redundant `axis=1` is left out.
# drop() returns a new DataFrame; california_housing_df itself is unchanged
# because the result is not assigned back.
california_housing_df.drop(columns='Price')

In [None]:
# Locating a row by its integer position (iloc is position-based, not
# label-based); position 2 is the third row
california_housing_df.iloc[2]



In [None]:
# Locating particular columns by position: 0 is the first column, -1 the last
for column_position in (0, -1):
    print(california_housing_df.iloc[:, column_position])

# Correlation
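1. Positive correlation: the two variables tend to increase or decrease together (coefficient between 0 and 1).
2. Negative correlation: one variable tends to decrease as the other increases (coefficient between -1 and 0).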

In [78]:
# Pairwise correlation coefficients between every pair of columns
california_housing_df.corr()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.688075
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,0.105623
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.151948
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.046701
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,-0.02465
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,-0.023737
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.14416
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.045967
Price,0.688075,0.105623,0.151948,-0.046701,-0.02465,-0.023737,-0.14416,-0.045967,1.0
