<a href="https://colab.research.google.com/github/Uzma-Jawed/python-class_work-and-practice/blob/main/pandas_DataFrames.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


📘 Uzma Jawed
___

This notebook covers foundational pandas operations using small in-memory examples and a sample .csv file.

Each section includes comments or code annotations for clarity.
___

Topics Covered:
* DataFrame creation

* Renaming columns

* Accessing columns and rows using .loc and .iloc

* Creating Series

* Custom indexing and columns

* Creating date ranges (dummy data)

* Loading and working with .csv files

* Applying conditions and filters

* Dropping columns and rows
___

In [None]:
import pandas as pd

In [None]:
# 1. Basic DataFrame creation
# Column-oriented input: each key becomes a column, each list holds that column's values.

data_1 = dict(
    name=["aqsa"],
    age=[23],
    education=["bachelors"],
)

In [None]:
# Build a one-row DataFrame from the data_1 dictionary.
df_1 = pd.DataFrame(data=data_1)

In [None]:
# Display the frame; at this point its columns are still 'name', 'age', 'education'.
df_1

Unnamed: 0,name,age,education
0,aqsa,23,bachelors


In [None]:
# 2. Renaming columns
# rename() returns a new frame; rebinding df_1 (rather than inplace=True) avoids
# silently mutating an object that an earlier cell already displayed, and the cell
# stays safe to re-run. The 'education' column is not in the mapping and is kept as is.

df_1 = df_1.rename(columns={'name': 'First Name', 'age': 'Age'})

In [None]:
# Display the frame after the rename: columns are 'First Name', 'Age', 'education'.
df_1

Unnamed: 0,First Name,Age,education
0,aqsa,23,bachelors


In [None]:
# 3. Create another DataFrame
# Source data: three integer columns with five values each.

data_2 = {
    'col1': [1, 2, 3, 4, 7],
    'col2': [4, 5, 6, 9, 5],
    'col3': [7, 8, 12, 1, 11],
}

In [None]:
# Build a 5x3 DataFrame from data_2.
df_2 = pd.DataFrame(data=data_2)

In [None]:
# Display df_2 (default integer row labels 0-4).
df_2

Unnamed: 0,col1,col2,col3
0,1,4,7
1,2,5,8
2,3,6,12
3,4,9,1
4,7,5,11


In [None]:
# 4. Access multiple columns by indexing
# Sample records: one list per column, four people in total.

data_3 = dict(
    Name=['Uzma', 'Zaid', 'Hassan', 'Sheikh'],
    Age=[25, 30, 35, 28],
    City=['New York', 'London', 'Paris', 'Tokyo'],
)

In [None]:
# Build a 4-row DataFrame (Name, Age, City) from data_3.
df_3 = pd.DataFrame(data=data_3)

In [None]:
# Display df_3 (default integer row labels 0-3).
df_3

Unnamed: 0,Name,Age,City
0,Uzma,25,New York
1,Zaid,30,London
2,Hassan,35,Paris
3,Sheikh,28,Tokyo


In [None]:
# Pick the 'Name' and 'Age' columns for every row.
# Passing a list of labels yields a DataFrame rather than a Series.

df_3.loc[:, ['Name', 'Age']]

Unnamed: 0,Name,Age
0,Uzma,25
1,Zaid,30
2,Hassan,35
3,Sheikh,28


In [None]:
# View column names
# .columns is an Index object holding the column labels in order.

df_3.columns

Index(['Name', 'Age', 'City'], dtype='object')

In [None]:
# 5. Creating a simple Series
# With no index argument, pandas assigns the default integer labels 0..3.

pd.Series([1, 2, 3, 4])

Unnamed: 0,0
0,1
1,2
2,3
3,4


In [None]:
# 6. Create DataFrame with custom index and columns
# Two records, stored column-wise.

data_4 = {
    'name': ['Aqsa', 'Nimra'],
    'age': [23, 20],
    'education': ['Bachelors', 'Bs'],
}

In [None]:
# Build df_4 with explicit row labels and an explicit column order.
df_4 = pd.DataFrame(
    data_4,
    index=['a', 'b'],
    columns=['name', 'age', 'education'],
)

In [None]:
# Display df_4; rows are labelled 'a' and 'b' instead of 0 and 1.
df_4

Unnamed: 0,name,age,education
a,Aqsa,23,Bachelors
b,Nimra,20,Bs


In [None]:
# Accessing column "name" — only valid if column exists
# (a label that is not a column raises KeyError)
# Using df_4 from earlier step; a single label returns a Series indexed by 'a' and 'b'

df_4["name"]

Unnamed: 0,name
a,Aqsa
b,Nimra


In [None]:
# 7. Creating date range (monthly)

pd.date_range(start='2023-01-01', end='2023-12-31', freq='M')

  pd.date_range(start='2023-01-01', end='2023-12-31', freq='M')


DatetimeIndex(['2023-01-31', '2023-02-28', '2023-03-31', '2023-04-30',
               '2023-05-31', '2023-06-30', '2023-07-31', '2023-08-31',
               '2023-09-30', '2023-10-31', '2023-11-30', '2023-12-31'],
              dtype='datetime64[ns]', freq='ME')

In [None]:
# Source data for a DataFrame that will be indexed by dates
data_5 = dict(
    name=["Urwa", "Zaid"],
    age=[23, 24],
    education=["Masters", "PhD"],
)

In [None]:
# Two consecutive days starting on 2015-03-02 (date given as an ISO 'YYYY-MM-DD' string).
date_index = pd.date_range(start='2015-03-02', periods=2)

In [None]:
# Attach the two dates in date_index as row labels for the two records in data_5.
df_5 = pd.DataFrame(data=data_5, index=date_index)

In [None]:
# Display df_5; the row labels are the dates from date_index.
df_5

Unnamed: 0,name,age,education
2015-03-02,Urwa,23,Masters
2015-03-03,Zaid,24,PhD


In [None]:
# Create a date range starting from a specific date
# With only `periods` given, the range advances one day at a time (freq='D').

pd.date_range('2025-07-12', periods=10)

DatetimeIndex(['2025-07-12', '2025-07-13', '2025-07-14', '2025-07-15',
               '2025-07-16', '2025-07-17', '2025-07-18', '2025-07-19',
               '2025-07-20', '2025-07-21'],
              dtype='datetime64[ns]', freq='D')

In [None]:
# Generate 12 evenly spaced timestamps from 2023-01-01 to 2023-12-31, endpoints included.
# Because the spacing is computed from start/end/periods, the result has no fixed frequency.

date = pd.date_range(
    start='2023-01-01',
    end='2023-12-31',
    periods=12,
)

In [None]:
# Display the 12 evenly spaced timestamps (interior values fall at fractional times of day).
date

DatetimeIndex([          '2023-01-01 00:00:00',
               '2023-02-03 02:10:54.545454545',
               '2023-03-08 04:21:49.090909091',
               '2023-04-10 06:32:43.636363636',
               '2023-05-13 08:43:38.181818182',
               '2023-06-15 10:54:32.727272728',
               '2023-07-18 13:05:27.272727272',
               '2023-08-20 15:16:21.818181820',
               '2023-09-22 17:27:16.363636364',
               '2023-10-25 19:38:10.909090908',
               '2023-11-27 21:49:05.454545456',
                         '2023-12-31 00:00:00'],
              dtype='datetime64[ns]', freq=None)

In [None]:
# 8. Accessing rows using slicing
# A plain integer slice on the frame selects rows by position: here the first two.

df_3[0:2]

Unnamed: 0,Name,Age,City
0,Uzma,25,New York
1,Zaid,30,London


In [None]:
# Column data for df_6 (same values as data_2 above), kept under its own name.

d = dict(
    col1=[1, 2, 3, 4, 7],
    col2=[4, 5, 6, 9, 5],
    col3=[7, 8, 12, 1, 11],
)

In [None]:
# Build df_6 from the dictionary d.
df_6 = pd.DataFrame(d)

In [None]:
# Display df_6 (5 rows, columns col1-col3).
df_6

Unnamed: 0,col1,col2,col3
0,1,4,7
1,2,5,8
2,3,6,12
3,4,9,1
4,7,5,11


In [None]:
# 9. loc and iloc examples

In [None]:
# loc accesses by label/index
# Label 2 selects a single row, returned as a Series keyed by column name.

df_3.loc[2]

Unnamed: 0,2
Name,Hassan
Age,35
City,Paris


In [None]:
# iloc accesses by integer position
# df_3 has only 4 rows, so the stop value 5 is clipped: this returns positions 2 and 3.

df_3.iloc[2:5]

Unnamed: 0,Name,Age,City
2,Hassan,35,Paris
3,Sheikh,28,Tokyo


In [None]:
# iloc for specific row & column ranges
# Rows from position 2 onward (stop 5 is past the last row and is clipped);
# columns at positions 1 and 2, i.e. 'Age' and 'City'.

df_3.iloc[2:5, 1:3]

Unnamed: 0,Age,City
2,35,Paris
3,28,Tokyo


In [None]:
# 10. Load CSV file (California Housing Test data)
# The path below points at the sample data folder of a Google Colab runtime;
# adjust it when running the notebook elsewhere.

housing_csv_path = "/content/sample_data/california_housing_test.csv"
housing_df = pd.read_csv(housing_csv_path)
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


In [None]:
# Access latitude column using loc
# Label slices are inclusive at both ends, so 2:5 returns rows 2, 3, 4 and 5;
# passing ['latitude'] as a list keeps the result a DataFrame.

housing_df.loc[2:5, ['latitude']]

Unnamed: 0,latitude
2,33.78
3,33.82
4,36.33
5,36.51


In [None]:
# Filter rows with a boolean mask: keep only those whose median_income is below 2

low_income_mask = housing_df["median_income"] < 2
housing_df[low_income_mask]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
5,-119.56,36.51,37.0,1018.0,213.0,663.0,204.0,1.6635,67000.0
6,-121.43,38.63,43.0,1009.0,225.0,604.0,218.0,1.6641,67000.0
16,-120.81,37.53,15.0,570.0,123.0,189.0,107.0,1.8750,181300.0
28,-118.45,34.07,19.0,4845.0,1609.0,3751.0,1539.0,1.5830,350000.0
43,-117.27,34.09,36.0,848.0,186.0,737.0,169.0,0.9838,79300.0
...,...,...,...,...,...,...,...,...,...
2943,-121.23,37.96,37.0,2351.0,564.0,1591.0,549.0,1.6563,57200.0
2964,-118.25,33.94,43.0,1113.0,378.0,1305.0,334.0,1.1434,91300.0
2985,-120.47,34.94,17.0,1368.0,308.0,642.0,303.0,1.8633,109400.0
2986,-118.25,33.93,42.0,819.0,233.0,899.0,228.0,1.1346,85400.0


In [None]:
# Remove the 'latitude' column using the columns= keyword.
# drop() returns a new frame, so housing_df itself keeps the column.

housing_df.drop(columns=["latitude"]).head()

Unnamed: 0,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


In [None]:
# Remove the 'latitude' column by naming the label and the axis it lives on.
# The result is a new frame; housing_df is left untouched.

housing_df.drop("latitude", axis="columns").head()

Unnamed: 0,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


In [None]:
# Remove the row labelled 1 using the index= keyword (returns a new frame)

housing_df.drop(index=1).head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0
5,-119.56,36.51,37.0,1018.0,213.0,663.0,204.0,1.6635,67000.0


In [None]:
# drop() is a DataFrame method (a plain dict has no such method).
# Here the row label is passed as a list together with the axis it belongs to;
# the row labelled 1 is removed from the returned copy.

housing_df.drop([1], axis="index").head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0
5,-119.56,36.51,37.0,1018.0,213.0,663.0,204.0,1.6635,67000.0



___
### Reminder:

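* loc is used to access rows/columns by **label**; label slices include the end label (e.g. `loc[2:5]` returns rows 2, 3, 4 and 5)
* iloc is used to access rows/columns by **integer position**; position slices exclude the end (e.g. `iloc[2:5]` covers positions 2, 3 and 4)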
___

In [None]:
# Reading CSV and Excel Files

In [None]:
# 🗂️ Reading a CSV file (template — uncomment the lines below to run)

# csv_df = pd.read_csv("/content/sample_data/california_housing_test.csv")
# csv_df.head()  # Display first 5 rows

In [None]:
# 📊 Reading an Excel file (template — uncomment the last two lines to run)
# Replace 'your_file.xlsx' with the path to your Excel file
# Note: You may need to install openpyxl; inside a notebook use: %pip install openpyxl
# excel_df = pd.read_excel("your_file.xlsx")
# excel_df.head()