In [None]:
%pip install pandas

In [4]:
# Pandas Basics

### This notebook covers essential Pandas concepts used in data cleaning, analysis, and preprocessing for machine learning.


In [5]:
import pandas as pd


In [None]:
## Pandas Series
A Series is a one-dimensional labeled array capable of holding any data type.


In [6]:
# Build a Series from a plain Python list; with no index given,
# pandas labels the entries 0, 1, 2, ...
s = pd.Series(data=[10, 20, 30, 40])
s  # the trailing expression is what the notebook renders

0    10
1    20
2    30
3    40
dtype: int64

In [None]:
## Pandas DataFrame

A DataFrame is a two-dimensional labeled data structure similar to a table in a database or spreadsheet.

In [7]:
# Assemble a small employee table one row at a time;
# `columns` supplies the header names in order.
df = pd.DataFrame(
    [
        ("Alice", 22, "AI"),
        ("Bob", 25, "ML"),
        ("Charlie", 30, "Data"),
    ],
    columns=["Name", "Age", "Department"],
)
df


Unnamed: 0,Name,Age,Department
0,Alice,22,AI
1,Bob,25,ML
2,Charlie,30,Data


In [None]:
## Reading Data

Pandas can read data from various file formats such as CSV and Excel.


In [None]:
# Example (commented out because no data file is attached)
# df = pd.read_csv("data.csv")

In [None]:
## Inspecting Data

Understanding the structure and content of data is essential before analysis.

In [9]:
# Preview the first rows of the table; 5 is head()'s default row count.
df.head(n=5)


Unnamed: 0,Name,Age,Department
0,Alice,22,AI
1,Bob,25,ML
2,Charlie,30,Data


In [10]:
# Print a structural summary: index range, each column's non-null count and dtype, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        3 non-null      object
 1   Age         3 non-null      int64 
 2   Department  3 non-null      object
dtypes: int64(1), object(2)
memory usage: 204.0+ bytes


In [11]:
# Summary statistics for the numeric column(s), with the default quartiles written out.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Age
count,3.0
mean,25.666667
std,4.041452
min,22.0
25%,23.5
50%,25.0
75%,27.5
max,30.0


In [None]:
## Selecting Columns

Columns can be selected using column names.


In [13]:
# Single brackets with one column name return that column as a Series.
df["Age"]


0    22
1    25
2    30
Name: Age, dtype: int64

In [14]:
# Passing a list of column names (note the double brackets) returns a DataFrame with just those columns.
df[["Name", "Age"]]

Unnamed: 0,Name,Age
0,Alice,22
1,Bob,25
2,Charlie,30


In [None]:
## Selecting Rows

Rows can be selected using label-based and index-based selection.


In [15]:
# .loc selects by index label; here the label 0 belongs to the first row.
df.loc[0]

Name          Alice
Age              22
Department       AI
Name: 0, dtype: object

In [16]:
# .iloc selects by integer position; position 1 is the second row.
df.iloc[1]

Name          Bob
Age            25
Department     ML
Name: 1, dtype: object

In [None]:
## Filtering Data

Filtering allows selecting rows based on conditions.

In [17]:
# Build a boolean mask first, then index with it to keep only the rows where it is True.
is_over_23 = df["Age"] > 23
df[is_over_23]

Unnamed: 0,Name,Age,Department
1,Bob,25,ML
2,Charlie,30,Data


In [None]:
## Handling Missing Values

Missing values must be handled before training machine learning models.


In [18]:
# Add a Salary column with one missing entry; pandas stores the None as NaN.
df["Salary"] = [5000, None, 7000]

# Only a cell's last expression is rendered automatically, so show the
# missing-value mask explicitly instead of silently discarding it.
display(df.isnull())

# Preview the frame with the missing salary replaced by 0. The result is not
# assigned, so `df` itself still contains the NaN afterwards.
df.fillna({"Salary": 0})

Unnamed: 0,Name,Age,Department,Salary
0,Alice,22,AI,5000.0
1,Bob,25,ML,0.0
2,Charlie,30,Data,7000.0


In [None]:
## Sorting Data

Sorting helps organize and analyze data efficiently.


In [19]:
# Show the rows ordered by Age from youngest to oldest; the sorted frame is a new object.
df.sort_values("Age", ascending=True)

Unnamed: 0,Name,Age,Department,Salary
0,Alice,22,AI,5000.0
1,Bob,25,ML,
2,Charlie,30,Data,7000.0


In [None]:
## Adding Columns

New columns can be created for feature engineering.

In [20]:
# Derive a categorical feature from Age: 25 and over is "Adult", anything else is "Young".
# A vectorised comparison followed by a lookup replaces the per-row Python lambda.
df["Age_Group"] = df["Age"].ge(25).map({True: "Adult", False: "Young"})
df

Unnamed: 0,Name,Age,Department,Salary,Age_Group
0,Alice,22,AI,5000.0,Young
1,Bob,25,ML,,Adult
2,Charlie,30,Data,7000.0,Adult


In [None]:
## GroupBy Operations

GroupBy is used to aggregate data based on categories.

In [21]:
# Average age per department: split the rows by Department, then aggregate the Age column.
df.groupby(by="Department")["Age"].agg("mean")

Department
AI      22.0
Data    30.0
ML      25.0
Name: Age, dtype: float64

In [None]:
## Value Counts

Counts the frequency of unique values in a column.

In [22]:
# Count how many rows fall under each distinct department.
df["Department"].value_counts()

Department
AI      1
ML      1
Data    1
Name: count, dtype: int64

In [None]:
## Merging DataFrames

Merging combines multiple DataFrames using a common column.

In [23]:
# Second table, keyed by the same "Name" column as `df`.
df2 = pd.DataFrame(
    [("Alice", 5000), ("Bob", 6000), ("Charlie", 7000)],
    columns=["Name", "Salary"],
)

# Inner-join on Name. Both frames carry a Salary column, so pandas keeps the two
# apart as Salary_x (from df) and Salary_y (from df2).
df.merge(df2, on="Name", how="inner")


Unnamed: 0,Name,Age,Department,Salary_x,Age_Group,Salary_y
0,Alice,22,AI,5000.0,Young,5000
1,Bob,25,ML,,Adult,6000
2,Charlie,30,Data,7000.0,Adult,7000


In [None]:
## Exporting Data

Cleaned data can be saved for machine learning models.

In [24]:
# `df` still has a NaN salary (the earlier fillna preview was never assigned), so fill it
# here to make the exported file genuinely clean. index=False leaves the row index out of the CSV.
df.fillna({"Salary": 0}).to_csv("cleaned_data.csv", index=False)

In [None]:
## Summary

This notebook covered essential Pandas operations including data loading, inspection, cleaning, filtering, grouping, and exporting.
