<a href="https://colab.research.google.com/github/RixzFahad/Met-Data-Science-And-Analyst/blob/main/Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***PANDAS***

***PANDAS: INTRODUCTION***


In [None]:

# Pandas is a Python library used for:
# - Data analysis
# - Data manipulation
# - Data cleaning
# - Handling structured data (tables)

# WHY NOT PURE PYTHON / NUMPY?
# - Python lists are slow for large data
# - NumPy is fast but lacks labels & mixed data types
# - Pandas provides:
#   ✔ Labels (index & columns)
#   ✔ Missing value handling
#   ✔ Powerful filtering & grouping
#   ✔ Easy file reading (CSV, Excel, SQL)

# Pandas is built on top of NumPy

import pandas as pd
import numpy as np

# Check pandas version
print(pd.__version__)


In [1]:
pip install pandas



In [2]:
import pandas as pd

# ***How Many Data Structure Do We Have In Pandas***



*   Series: One - D Labeled Array
*   DataFrame: 2D Tabular Data
*   Panel: Multidimensional Array



In [4]:
#Create A One D Arrays
import numpy as np
A = np.array([1,2,3,4,5])
print(A)

[1 2 3 4 5]


In [12]:
# ================================
# PANDAS DATA STRUCTURES
# ================================

# Pandas has TWO MAIN data structures:
# 1. Series   → 1D labeled array
# 2. DataFrame → 2D table (rows & columns)

# --------------------------------
# SERIES (1D)
# --------------------------------

# Create a NumPy array
arr = np.array([10, 20, 30, 40])

# Convert NumPy array to Pandas Series
s = pd.Series(arr)

print("Pandas Series:")
print(s)

# Series has:
# - Values
# - Index (labels)
print("Values:", s.values)
print("Index:", s.index)

# Custom index
s_custom = pd.Series(arr, index=['a', 'b', 'c', 'd'])
print("\nSeries with Custom Index:")
print(s_custom)

# --------------------------------
# DATAFRAME (2D)
# --------------------------------

# Create DataFrame from dictionary
data = {
    "Name": ["Rixz", "Alex", "John"],
    "Age": [22, 25, 30],
    "City": ["Delhi", "London", "New York"]
}

df = pd.DataFrame(data)

print("\nPandas DataFrame:")
print(df)

# DataFrame properties
print("\nColumns:", df.columns)
print("Index:", df.index)
print("Shape:", df.shape)


Pandas Series:
0    10
1    20
2    30
3    40
dtype: int64
Values: [10 20 30 40]
Index: RangeIndex(start=0, stop=4, step=1)

Series with Custom Index:
a    10
b    20
c    30
d    40
dtype: int64

Pandas DataFrame:
   Name  Age      City
0  Rixz   22     Delhi
1  Alex   25    London
2  John   30  New York

Columns: Index(['Name', 'Age', 'City'], dtype='object')
Index: RangeIndex(start=0, stop=3, step=1)
Shape: (3, 3)


In [8]:
Y = pd.Series(A)
print(Y)


0    1
1    2
2    3
3    4
4    5
dtype: int64


In [9]:
# Create An Series By Set's
st = {1,2,3,4,5}
sr = pd.Series(st)
print(sr)
# We Cant Do It Coz Set Are Unordered

TypeError: 'set' type is unordered

In [11]:
# Create a dataframe from below list
li = [1,2,3,4,5]
df = pd.DataFrame(li)
df

Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5


In [13]:
#Create A DataFrame Of Set Of Fruit's
F = {'Apple', 'Kiwi', 'Litchi'}
df = pd.DataFrame(F)
print(df)

        0
0   Apple
1    Kiwi
2  Litchi


In [16]:
#Import A File

df  = pd.read_csv("/content/sample_data/california_housing_train.csv")
df.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [17]:
#export And Convert The DataFRame Into Wxcel Format

df.to_excel("data.xlsx")

In [20]:
df = pd.read_excel("/content/data.xlsx")
print(df.head())

   Unnamed: 0  longitude  latitude  housing_median_age  total_rooms  \
0           0    -114.31     34.19                  15         5612   
1           1    -114.47     34.40                  19         7650   
2           2    -114.56     33.69                  17          720   
3           3    -114.57     33.64                  14         1501   
4           4    -114.57     33.57                  20         1454   

   total_bedrooms  population  households  median_income  median_house_value  
0            1283        1015         472         1.4936               66900  
1            1901        1129         463         1.8200               80100  
2             174         333         117         1.6509               85700  
3             337         515         226         3.1917               73400  
4             326         624         262         1.9250               65500  


In [21]:
#Convert file Into Json Format
df.to_json("data.json")

In [24]:
#Reading the json
df3 = pd.read_json("/content/data.json")
df3.head(7)

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,0,-114.31,34.19,15,5612,1283,1015,472,1.4936,66900
1,1,-114.47,34.4,19,7650,1901,1129,463,1.82,80100
2,2,-114.56,33.69,17,720,174,333,117,1.6509,85700
3,3,-114.57,33.64,14,1501,337,515,226,3.1917,73400
4,4,-114.57,33.57,20,1454,326,624,262,1.925,65500
5,5,-114.58,33.63,29,1387,236,671,239,3.3438,74000
6,6,-114.58,33.61,25,2907,680,1841,633,2.6768,82400


In [25]:
from google.colab import sheets
sheet = sheets.InteractiveSheet(df=df3)

https://docs.google.com/spreadsheets/d/1Xi6OgzFcF9yYFctrAqn5BRZoLfVsvk0xi_TrItO_QeY/edit#gid=0


***Data Frame Function's:-***

In [26]:
df = pd.read_csv("/content/sample_data/california_housing_train.csv")
#What Is The Size Of Df
df.size

153000

In [27]:
#Check the Row's And Col's In DataFrame
df.shape

(17000, 9)

In [28]:
#Check the All col from DataFRame
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

In [29]:
# Check the overall Information about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           17000 non-null  float64
 1   latitude            17000 non-null  float64
 2   housing_median_age  17000 non-null  float64
 3   total_rooms         17000 non-null  float64
 4   total_bedrooms      17000 non-null  float64
 5   population          17000 non-null  float64
 6   households          17000 non-null  float64
 7   median_income       17000 non-null  float64
 8   median_house_value  17000 non-null  float64
dtypes: float64(9)
memory usage: 1.2 MB


In [30]:
#Check the DataType
df.dtypes
#In Numpy We Use "Dtype" But In Pandas We Need To Use "Dtypes"

Unnamed: 0,0
longitude,float64
latitude,float64
housing_median_age,float64
total_rooms,float64
total_bedrooms,float64
population,float64
households,float64
median_income,float64
median_house_value,float64


In [31]:
#Describe Overall Data
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.562108,35.625225,28.589353,2643.664412,539.410824,1429.573941,501.221941,3.883578,207300.912353
std,2.005166,2.13734,12.586937,2179.947071,421.499452,1147.852959,384.520841,1.908157,115983.764387
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.79,33.93,18.0,1462.0,297.0,790.0,282.0,2.566375,119400.0
50%,-118.49,34.25,29.0,2127.0,434.0,1167.0,409.0,3.5446,180400.0
75%,-118.0,37.72,37.0,3151.25,648.25,1721.0,605.25,4.767,265000.0
max,-114.31,41.95,52.0,37937.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [32]:
#Check last five row's from the data frame's
df.tail(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
16995,-124.26,40.58,52.0,2217.0,394.0,907.0,369.0,2.3571,111400.0
16996,-124.27,40.69,36.0,2349.0,528.0,1194.0,465.0,2.5179,79000.0
16997,-124.3,41.84,17.0,2677.0,531.0,1244.0,456.0,3.0313,103600.0
16998,-124.3,41.8,19.0,2672.0,552.0,1298.0,478.0,1.9797,85800.0
16999,-124.35,40.54,52.0,1820.0,300.0,806.0,270.0,3.0147,94600.0


In [34]:
#Check the null values in a dataframe's
df.isnull().sum()

Unnamed: 0,0
longitude,0
latitude,0
housing_median_age,0
total_rooms,0
total_bedrooms,0
population,0
households,0
median_income,0
median_house_value,0
