# __Data Frame Functions__

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = {
    'EmpID': [101, 102, 102, 104, 105, 106, 107, 108, 109, 110],
    'Name': ['Ali', 'Sara', None, 'Zara', 'Usman', 'Hina', 'Bilal', 'Nida', 'Omer', 'Ayesha'],
    'Dept': ['IT', 'hr', 'IT ', 'Finance', 'Sales', 'it', 'HR', 'Marketing', 'Finance', 'IT'],
    'Salary': ['45,000', '52000', '48k', '60,000', None, '47,000', '55,000', '62,000', '58000', '50k'],
    'Experience': [1, 5, np.nan, 7, 3, 1, 4, np.nan, 8, 2],
    'JoinDate': ['2021-01-10', '2018/05/20', None, '2016-03-15', '2019-07-01', 
                 '2022-02-11', '2017/08/30', '2015-06-05', '2014-11-25', '2020-09-01']
}




In [37]:
df = pd.DataFrame(data)

df['Dept'] = df['Dept'].str.upper().str.strip()
# Validation using .isin()
valid_Depts = ['IT', 'HR', 'FINANCE', 'MARKETING']
df.loc[~df['Dept'].isin(valid_Depts),'Dept']=None

# Using Replace Function
df=df.replace({'Salary':'k'},'000',regex=True)
df=df.replace({'Salary':','},'',regex=True)

# Using str.replace
df['Salary'] = df['Salary'].str.replace('k','000').str.replace(',','')
df['Salary']=pd.to_numeric(df['Salary'])
df['Salary']=df['Salary'].fillna(df['Salary'].median())
df['Salary']=df['Salary'].astype(int)

df['Salary']

globalExpMean=df['Experience'].mean()

s=df.groupby('Dept')['Experience'].transform('mean')
df['Experience']=df['Experience'].fillna(s)
df['Experience']=df['Experience'].fillna(globalExpMean)
df['Experience']=df['Experience'].round(2)
df

df['JoinDate']=df['JoinDate'].str.replace('/','-')
df['JoinDate']=pd.to_datetime(df['JoinDate'])

df.dropna(subset=['Name'])
df.reset_index(drop=True)

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
0,101,Ali,IT,45000,1.0,2021-01-10
1,102,Sara,HR,52000,5.0,2018-05-20
2,102,,IT,48000,1.33,NaT
3,104,Zara,FINANCE,60000,7.0,2016-03-15
4,105,Usman,,52000,3.0,2019-07-01
5,106,Hina,IT,47000,1.0,2022-02-11
6,107,Bilal,HR,55000,4.0,2017-08-30
7,108,Nida,MARKETING,62000,3.88,2015-06-05
8,109,Omer,FINANCE,58000,8.0,2014-11-25
9,110,Ayesha,IT,50000,2.0,2020-09-01


---
 ## __1. Creation & Basic Info (Understand Your Data)__


###  pd.DataFrame() Create DataFrame From dict, list, ndarray 

In [4]:

df = pd.DataFrame(data)

### df.head(n) First n rows Quick preview

In [5]:
df.head(4)

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
0,101,Ali,IT,45000,1.0,2021-01-10
1,102,Sara,hr,52000,5.0,2018/05/20
2,102,,IT,48k,,
3,104,Zara,Finance,60000,7.0,2016-03-15


### df.tail(n) Last n rows Check end

In [6]:
df.tail(5)

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
5,106,Hina,it,47000,1.0,2022-02-11
6,107,Bilal,HR,55000,4.0,2017/08/30
7,108,Nida,Marketing,62000,,2015-06-05
8,109,Omer,Finance,58000,8.0,2014-11-25
9,110,Ayesha,IT,50k,2.0,2020-09-01


### df.sample(n) Random rows Spot-check data 

In [7]:
df.sample(5)

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
6,107,Bilal,HR,55000,4.0,2017/08/30
4,105,Usman,Sales,,3.0,2019-07-01
9,110,Ayesha,IT,50k,2.0,2020-09-01
0,101,Ali,IT,45000,1.0,2021-01-10
2,102,,IT,48k,,


### df.shape : (rows, cols) Dataset size

In [8]:
df.shape

(10, 6)

### df.size :Total cells Memory idea 


In [9]:
df.size

60

### df.ndim Dimensions Usually 2

In [10]:
df.ndim

2

### df.info() Columns, dtypes, nulls MOST IMPORTANT 

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   EmpID       10 non-null     int64  
 1   Name        9 non-null      object 
 2   Dept        10 non-null     object 
 3   Salary      9 non-null      object 
 4   Experience  8 non-null      float64
 5   JoinDate    9 non-null      object 
dtypes: float64(1), int64(1), object(4)
memory usage: 612.0+ bytes


### df.describe() Stats summary Numeric overview 

In [12]:
df.describe()

Unnamed: 0,EmpID,Experience
count,10.0,8.0
mean,105.4,3.875
std,3.134042,2.642374
min,101.0,1.0
25%,102.5,1.75
50%,105.5,3.5
75%,107.75,5.5
max,110.0,8.0


### df.columns Column names Rename, inspect

In [13]:
df.columns

Index(['EmpID', 'Name', 'Dept', 'Salary', 'Experience', 'JoinDate'], dtype='object')

### df.index Index object Reset/align


In [14]:
df.index

RangeIndex(start=0, stop=10, step=1)

---
## __2. Column & Index Operations__

### df['col'] Select column (Series) 


In [15]:
df['Name']

0       Ali
1      Sara
2      None
3      Zara
4     Usman
5      Hina
6     Bilal
7      Nida
8      Omer
9    Ayesha
Name: Name, dtype: object

### df[['c1','c2']] Multiple columns 


In [16]:
df[['EmpID','Name']]


Unnamed: 0,EmpID,Name
0,101,Ali
1,102,Sara
2,102,
3,104,Zara
4,105,Usman
5,106,Hina
6,107,Bilal
7,108,Nida
8,109,Omer
9,110,Ayesha


### df.rename(columns/index={Name:NewName}) : Rename columns/index 

In [17]:
df.rename(columns={"JoinDate":"JoiningDate"})
df.rename(index={2:4})

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
0,101,Ali,IT,45000,1.0,2021-01-10
1,102,Sara,hr,52000,5.0,2018/05/20
4,102,,IT,48k,,
3,104,Zara,Finance,60000,7.0,2016-03-15
4,105,Usman,Sales,,3.0,2019-07-01
5,106,Hina,it,47000,1.0,2022-02-11
6,107,Bilal,HR,55000,4.0,2017/08/30
7,108,Nida,Marketing,62000,,2015-06-05
8,109,Omer,Finance,58000,8.0,2014-11-25
9,110,Ayesha,IT,50k,2.0,2020-09-01


### df.set_index() Set column as index 


In [18]:
df.set_index("EmpID")

Unnamed: 0_level_0,Name,Dept,Salary,Experience,JoinDate
EmpID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
101,Ali,IT,45000,1.0,2021-01-10
102,Sara,hr,52000,5.0,2018/05/20
102,,IT,48k,,
104,Zara,Finance,60000,7.0,2016-03-15
105,Usman,Sales,,3.0,2019-07-01
106,Hina,it,47000,1.0,2022-02-11
107,Bilal,HR,55000,4.0,2017/08/30
108,Nida,Marketing,62000,,2015-06-05
109,Omer,Finance,58000,8.0,2014-11-25
110,Ayesha,IT,50k,2.0,2020-09-01


### df.reset_index(drop=True) Reset index 

In [19]:
df.reset_index(drop=True)

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
0,101,Ali,IT,45000,1.0,2021-01-10
1,102,Sara,hr,52000,5.0,2018/05/20
2,102,,IT,48k,,
3,104,Zara,Finance,60000,7.0,2016-03-15
4,105,Usman,Sales,,3.0,2019-07-01
5,106,Hina,it,47000,1.0,2022-02-11
6,107,Bilal,HR,55000,4.0,2017/08/30
7,108,Nida,Marketing,62000,,2015-06-05
8,109,Omer,Finance,58000,8.0,2014-11-25
9,110,Ayesha,IT,50k,2.0,2020-09-01


### df.insert(index, "Name", value) Insert column at position 

In [20]:
age=pd.Series(np.random.randint(20,30,10))
df.insert(3,"Age",age)


### df.pop() Remove & return column

In [21]:
age=df.pop("Age")

### df.drop() Drop rows or columns

In [22]:
df.drop("Dept",axis=1)
df.drop(4,axis=0)

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
0,101,Ali,IT,45000,1.0,2021-01-10
1,102,Sara,hr,52000,5.0,2018/05/20
2,102,,IT,48k,,
3,104,Zara,Finance,60000,7.0,2016-03-15
5,106,Hina,it,47000,1.0,2022-02-11
6,107,Bilal,HR,55000,4.0,2017/08/30
7,108,Nida,Marketing,62000,,2015-06-05
8,109,Omer,Finance,58000,8.0,2014-11-25
9,110,Ayesha,IT,50k,2.0,2020-09-01


---
## __3. Selection & Filtering__
    Label / Position Based 

### df.loc[index]['Col'] Label-based (SAFE) 

In [23]:
df.loc[0]['Name']
df.loc[0]


EmpID                101
Name                 Ali
Dept                  IT
Salary            45,000
Experience           1.0
JoinDate      2021-01-10
Name: 0, dtype: object

### df.iloc[index]['Col'] Position-based 


In [24]:
df.iloc[0]['Name']
df.iloc[0]

EmpID                101
Name                 Ali
Dept                  IT
Salary            45,000
Experience           1.0
JoinDate      2021-01-10
Name: 0, dtype: object

### df.at[row-index, col-index] Fast single value 

In [25]:
df.at[0,'Name']

'Ali'

### df.iat[row-index,col-Index] Fast positional value

In [26]:
df.iat[0,1]

'Ali'


### Boolean Filtering

| Pattern | Meaning |
| --- | --- |
| `df[df['col'] > x]` | Filter rows based on value |
| `df.loc[cond, cols]` | **SAFE** filter + modify (Selects specific rows/cols) |
| `(cond1) & (cond2)` | Bitwise **AND** (Both conditions must be true) |
| `(cond1) | (cond2)` |
| `~cond` | Bitwise **NOT** (Inverses the condition) |



In [27]:
df[df['Experience']>2]

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
1,102,Sara,hr,52000.0,5.0,2018/05/20
3,104,Zara,Finance,60000.0,7.0,2016-03-15
4,105,Usman,Sales,,3.0,2019-07-01
6,107,Bilal,HR,55000.0,4.0,2017/08/30
8,109,Omer,Finance,58000.0,8.0,2014-11-25


In [28]:
df.loc[df['Experience']>2,'Salary']=50000

In [29]:
df[(df['Experience']>2) & (df['JoinDate']<'2018-01-01')]

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
3,104,Zara,Finance,50000,7.0,2016-03-15
6,107,Bilal,HR,50000,4.0,2017/08/30
8,109,Omer,Finance,50000,8.0,2014-11-25


In [30]:
df[~(df['Dept']=='IT')]


Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
1,102,Sara,hr,50000,5.0,2018/05/20
2,102,,IT,48k,,
3,104,Zara,Finance,50000,7.0,2016-03-15
4,105,Usman,Sales,50000,3.0,2019-07-01
5,106,Hina,it,47000,1.0,2022-02-11
6,107,Bilal,HR,50000,4.0,2017/08/30
7,108,Nida,Marketing,62000,,2015-06-05
8,109,Omer,Finance,50000,8.0,2014-11-25


---
## __4. Modification / Assignment__

| Method | Purpose |
| --- | --- |
| `df.loc[cond, col] = val` | **Conditional update** (Modifies specific rows/cols in place) |
| `df[col] = ...` | **Create/Overwrite** an entire column |
| `df.assign()` | **Create new column safely** (Returns a new DataFrame with added Columns; original is untouched) |
| `df.update()` | **Update** values using data from another DataFrame (Modifies directly by indexes of rows) |


---

#### ‚ö†Ô∏è Best Practice: Avoiding "Chained Assignment"

Pandas cannot always guarantee whether you are modifying a **view** or a **copy** when you chain slice operations. Always use `.loc` for conditional updates.

**Avoid (WRONG)**

> `df[df['Stock'] < 50]['Price'] = 999`
> *Risk: May raise `SettingWithCopyWarning` and fail to update the original DataFrame.*

**Correct (SAFE)**

> `df.loc[df['Stock'] < 50, 'Price'] = 99`
> *Result: Explicitly targets the rows and column in a single step.*

---

In [31]:
df.loc[df['Dept']=='IT','Salary']*=1.10

TypeError: can't multiply sequence by non-int of type 'float'

In [None]:
df['Age']=np.random.randint(20,30,10)
df['Dept']=np.random.choice(['IT','HR','FINANCE'], size=10)

In [None]:
df.assign(
    Bonus_Eligible = lambda x: x['Experience'] > 3.0
)

In [None]:

corrections = pd.DataFrame({
    'Dept': {1: 'HR', 5: 'IT'},
    'Name': {2: 'Ahmed'}
})
df.update(corrections)

---
## __5. Missing Data Handling__

### df.isna() / df.isnull() : Detect NaNs 

In [None]:
df.isna()
df.isnull()

### df.notna() : Opposite (Only checks NOT NaN)

In [None]:
df.notna()

### df.dropna() :   Remove Rows/Columns containing missing values



### `dropna()` Parameters

| Parameter | Meaning |
| --- | --- |
| `axis=0` / `1` | Drop **rows** (0) or **columns** (1) |
| `how='any'` | Drop if **any** value is `NaN` (Default) |
| `how='all'` | Drop if **all** values are `NaN` |
| `thresh=n` | Keep rows/cols with **at least n** non-NaN values |
| `subset=[cols]` | Consider only these specific columns for `NaN` checks |

---



In [None]:
df.dropna()
df.dropna(axis=1)
df.dropna(how='all')
df.dropna(how='any')
df.dropna(thresh=2)
df.dropna(subset=['JoinDate'])

### df.fillna() : Fill missing 

In [None]:
df.fillna(0)

### df.interpolate() : Estimate values 


In [None]:
df.interpolate()

### df.replace() : Replace specific values

In [None]:
df.replace(to_replace='HR',value='CS')

---
## __6. String Operations (.str)__ 


### df['col'].str.lower() :  lowercase 

In [None]:
df['Name'].str.lower()

### str.upper() :  uppercase 

In [None]:
df['Name'].str.upper()

### str.strip() : remove spaces 

In [None]:
df['Name'].str.strip()

### str.replace() : regex replace

In [None]:
df['Salary'].str.replace(',','').str.replace('k','000')

### str.contains() :  filter text 

In [None]:
df['Name'].str.contains('Ali')

### str.startswith() : prefix 

In [None]:
df['Name'].str.startswith('A')


### str.endswith() :  suffix 


In [None]:
df['Name'].str.endswith('a')

### str.split() :  split string 

In [None]:
df['Name'].str.split("")

---
## __7. Type Conversion & Cleaning__

### df.astype() : Change dtype 

In [None]:
df['Experience'].astype(str)

### pd.to_numeric() : Clean numbers 

In [None]:
pd.to_numeric(df['Experience'])

### pd.to_datetime() : Parse dates 

In [None]:
df['JoinDate']=df['JoinDate'].str.replace('/','-')
pd.to_datetime(df['JoinDate'])

### df.convert_dtypes() : Smart dtype inference 

In [None]:
df.convert_dtypes()

---
## __8. Sorting & Ranking__ 




### df.sort_values(by=) : Sort rows 

In [None]:
df.sort_values(by='Experience',ascending=False)

### df.sort_index():  Sort index 

In [None]:
df.sort_index()

### df.rank(): Ranking

In [None]:
df.select_dtypes(include='number').rank()

### df.nlargest(n, col) :  Top values 

In [None]:
df.nlargest(3,'Salary')


### df.nsmallest(n, col) : Bottom values

In [None]:
df.nsmallest(3,'Salary')

---
## __9. Grouping & Aggregation__ 

### df.groupby() Group data 

In [None]:
df.groupby('Dept')['EmpID'].count()



### Pandas Aggregate Functions

| Method | Description |
| --- | --- |
| `df.mean()` | Returns the **average** of all values |
| `df.sum()` | Returns the **sum** of values |
| `df.count()` | Returns the count of **non-null** values |
| `df.median()` | Returns the **median** (middle value) |
| `df.min()` | Returns the **minimum** value |
| `df.max()` | Returns the **maximum** value |
| `df.std()` | Returns the **standard deviation** |
| `df.var()` | Returns the **variance** |
| `df.mode()` | Returns the **mode** (most frequent value) |
| `df.describe()` | Generates summary statistics (count, mean, std, min, 25%, 50%, 75%, max) |



In [None]:
df['EmpID'].count()
df['Experience'].sum()
df['Salary'].min()
df['Salary'].max()
df['Salary'].median()
df['Salary'].mean()
df.describe()

### agg(['sum','mean']): Multiple agg

In [None]:
df['Salary'].agg([ 'sum','mean','count'])

### transform() : Per-row group calc 

In [None]:
df.groupby('Dept')['Salary'].transform('mean')

### filter() :  Filter groups 

The `df.filter()` function in Pandas is used to select columns or rows based on their **names (labels)**, not their values.

> **Crucial Distinction:** `df.filter()` looks at **Column Headers** or **Index Labels**. It does *not* filter the actual data inside the cells (for that, use Boolean Filtering like `df[df['col'] > 5]`).

### `filter()` Parameters

| Parameter | Function | Example |
| --- | --- | --- |
| `items=[list]` | Selects columns with **exact** names | `items=['Name', 'Salary']` |
| `like='string'` | Selects columns containing this **substring** | `like='Sal'` (Matches 'Salary', 'Sales') |
| `regex='pattern'` | Selects columns matching a **Regex** pattern | `regex='e$'` (Ends with 'e') |
| `axis=0/1` | **1** for Columns (Default), **0** for Rows (Index) | `axis=0` filters row labels |



In [None]:
df.filter(items=['Name','Salary'])
df.filter(like='a')
df.filter(regex=',')
df.filter(items=['Name','Dept'],axis=1)

---
## __10. Duplicates & Uniqueness__  

###   __`df.duplicated()`__

**Purpose:**
Returns a **Boolean Series** indicating whether each row is a duplicate.

### Parameters

| Parameter | Default   | Description                                                                                                                                                  |
| --------- | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `subset`  | `None`    | Columns to consider when identifying duplicates. If `None`, all columns are used.                                                                            |
| `keep`    | `'first'` | `'first'` ‚Üí First occurrence is `False`, others `True`<br>`'last'` ‚Üí Last occurrence is `False`, others `True`<br>`False` ‚Üí **All** duplicates marked `True` |


In [None]:
df.duplicated(subset=['EmpID'])

###  __`df.drop_duplicates()`__

**Purpose:**
Removes duplicate rows and returns a cleaned DataFrame.

### Parameters

| Parameter      | Default   | Description                                                                                               |
| -------------- | --------- | --------------------------------------------------------------------------------------------------------- |
| `subset`       | `None`    | Columns to check for duplicates                                                                           |
| `keep`         | `'first'` | `'first'` ‚Üí Keep first occurrence<br>`'last'` ‚Üí Keep last occurrence<br>`False` ‚Üí Drop **all** duplicates |
| `inplace`      | `False`   | `True` ‚Üí Modify DataFrame directly<br>`False` ‚Üí Return new DataFrame                                      |
| `ignore_index` | `False`   | `True` ‚Üí Reset index after removal<br>`False` ‚Üí Keep original index                                       |


In [None]:
df.drop_duplicates(subset=['EmpID'])

###  __`df.nunique()`__

**Purpose:**
Counts **unique values** along a given axis.

### Parameters

| Parameter | Default | Description                                                                       |
| --------- | ------- | --------------------------------------------------------------------------------- |
| `axis`    | `0`     | `0` ‚Üí Count unique values **per column**<br>`1` ‚Üí Count unique values **per row** |
| `dropna`  | `True`  | `True` ‚Üí Ignore `NaN` values<br>`False` ‚Üí Count `NaN` as a unique value           |


In [None]:
df.nunique(axis=0)
# df.nunique(axis=1)

###  __`Series.unique()`__

**Purpose:**
Returns **unique values** from a single column.

### Notes

* Works **only on Series**, not DataFrames
* Returns a **NumPy array**
* Includes `NaN` values

In [None]:
df['Name'].unique()

---
## __11. Merge / Join / Combine__ 
#### üîç Quick Comparison Table

| Method            | Join Type   | Based On        | Common Use            |
| ----------------- | ----------- | --------------- | --------------------- |
| `pd.merge()`      | SQL-like    | Column          | Relational data       |
| `df.join()`       | Simple join | Index           | Fast index joins      |
| `pd.concat()`     | Stack       | Axis            | Append / combine data |
| `combine_first()` | Fill        | Index + columns | Missing data recovery |




###  `pd.merge()` ‚Äî **SQL-like joins**

#### Parameters

| Parameter   | Default       | Meaning                                   |
| ----------- | ------------- | ----------------------------------------- |
| `left`      | ‚Äî             | Left DataFrame                            |
| `right`     | ‚Äî             | Right DataFrame                           |
| `on`        | `None`        | Column(s) to join on                      |
| `how`       | `'inner'`     | `'inner'`, `'left'`, `'right'`, `'outer'` |
| `left_on`   | `None`        | Left column if names differ               |
| `right_on`  | `None`        | Right column if names differ              |
| `suffixes`  | `('_x','_y')` | Rename overlapping columns                |
| `indicator` | `False`       | Adds column showing source (`_merge`)     |



In [None]:
customers = pd.DataFrame({
    'cust_id': [1, 2, 3],
    'name': ['Ali', 'Sara', 'Usman']
})

orders = pd.DataFrame({
    'cust_id': [1, 1, 2, 4],
    'order_id': [101, 102, 103, 104]
})

pd.merge(customers, orders, on='cust_id', how='inner')


###  `df.join()` ‚Äî **Index-based join**

### üëâ Important Parameters

| Parameter | Default  | Meaning                                   |
| --------- | -------- | ----------------------------------------- |
| `other`   | ‚Äî        | DataFrame to join                         |
| `on`      | `None`   | Join key if not index                     |
| `how`     | `'left'` | `'left'`, `'right'`, `'inner'`, `'outer'` |
| `lsuffix` | `''`     | Left suffix for conflicts                 |
| `rsuffix` | `''`     | Right suffix for conflicts                |

üìå **Best when index is meaningful**



In [None]:
users = pd.DataFrame({
    'name': ['Ali', 'Sara', 'Usman']
}, index=[1, 2, 3])

scores = pd.DataFrame({
    'score': [85, 90]
}, index=[1, 3])

users.join(scores)

###  `pd.concat()` ‚Äî **Stack or append data**

### üëâ Important Parameters

| Parameter      | Default   | Meaning                   |
| -------------- | --------- | ------------------------- |
| `objs`         | ‚Äî         | List of DataFrames        |
| `axis`         | `0`       | `0` ‚Üí rows, `1` ‚Üí columns |
| `ignore_index` | `False`   | Reset index               |
| `join`         | `'outer'` | `'outer'` or `'inner'`    |
| `keys`         | `None`    | Create hierarchical index |



In [None]:
df1 = pd.DataFrame({
    'id': [1, 3],
    'name': ['Ali', 'Sara']
})

df2 = pd.DataFrame({
    'id': [2],
    'name': ['Usman']
})

pd.concat([df1,df2])


###  `combine_first()` ‚Äî **Fill missing values from another DataFrame**



#### üëâ Parameters

| Parameter | Meaning                               |
| --------- | ------------------------------------- |
| `other`   | DataFrame used to fill missing values |

üìå **Works index + column wise**

üìå Does **not overwrite existing non-NaN values**


In [None]:
df_a = pd.DataFrame({
    'name': ['Ali', None, 'Usman'],
    'age': [20, None, 23]
})

df_b = pd.DataFrame({
    'name': ['Ali', 'Sara', 'Usman'],
    'age': [20, 22, 23]
})

df_a.combine_first(df_b)

---
## __12. Apply / Vectorization__



### `df.apply()` -- **Row-wise / Column-wise logic**

#### üìå When to use

* Complex logic involving **multiple columns**
* When vectorized solution is not possible

### üëâ Important Parameters

| Parameter     | Default | Meaning                               |
| ------------- | ------- | ------------------------------------- |
| `func`        | ‚Äî       | Function to apply                     |
| `axis`        | `0`     | `0` ‚Üí column-wise<br>`1` ‚Üí row-wise   |
| `result_type` | `None`  | `'expand'`, `'reduce'`, `'broadcast'` |
| `raw`         | `False` | Pass ndarray instead of Series        |

‚ö†Ô∏è **Slower than vectorized operations**

In [46]:
# column wise logic
marks=pd.Series([2,3,4])
marks.apply(lambda x: x*x)

# row wise logic  Structure : Lambda x: value_if_true if condition else value_if_false
df['Seniority']=df.apply(lambda row: 'Senior' if row['Experience']>5 else 'Junior',axis=1)
df

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate,Seniority
0,101,Ali,IT,45000,1.0,2021-01-10,Junior
1,102,Sara,HR,52000,5.0,2018-05-20,Junior
2,102,,IT,48000,1.33,NaT,Junior
3,104,Zara,FINANCE,60000,7.0,2016-03-15,Senior
4,105,Usman,,52000,3.0,2019-07-01,Junior
5,106,Hina,IT,47000,1.0,2022-02-11,Junior
6,107,Bilal,HR,55000,4.0,2017-08-30,Junior
7,108,Nida,MARKETING,62000,3.88,2015-06-05,Junior
8,109,Omer,FINANCE,58000,8.0,2014-11-25,Senior
9,110,Ayesha,IT,50000,2.0,2020-09-01,Junior



###  `df.map()` ‚Äî **Element-wise (Series only)**

#### üìå When to use

* Apply function to **each value in a single column**
* Value transformation or mapping

### üëâ Important Parameters

| Parameter   | Meaning                      |
| ----------- | ---------------------------- |
| `arg`       | Function, dict, or Series    |
| `na_action` | `'ignore'` ‚Üí Skip NaN values |

üìå **Works on Series only**
üìå Faster than `apply` (but still not fully vectorized)

In [47]:
df['Status']=df['Salary'].map(lambda x:'Rich' if x>=50000 else 'Poor' )
df

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate,Seniority,Status
0,101,Ali,IT,45000,1.0,2021-01-10,Junior,Poor
1,102,Sara,HR,52000,5.0,2018-05-20,Junior,Rich
2,102,,IT,48000,1.33,NaT,Junior,Poor
3,104,Zara,FINANCE,60000,7.0,2016-03-15,Senior,Rich
4,105,Usman,,52000,3.0,2019-07-01,Junior,Rich
5,106,Hina,IT,47000,1.0,2022-02-11,Junior,Poor
6,107,Bilal,HR,55000,4.0,2017-08-30,Junior,Rich
7,108,Nida,MARKETING,62000,3.88,2015-06-05,Junior,Rich
8,109,Omer,FINANCE,58000,8.0,2014-11-25,Senior,Rich
9,110,Ayesha,IT,50000,2.0,2020-09-01,Junior,Rich



### `df.applymap()` ‚Äî **Element-wise on whole DataFrame**

#### üìå When to use

* Apply the **same operation to every cell**
* Rarely needed (mostly replaced by vectorization)

### üëâ Parameters

| Parameter | Meaning                           |
| --------- | --------------------------------- |
| `func`    | Function to apply to each element |

‚ö†Ô∏è **Slowest of all** ‚Üí avoid on large DataFrames

In [49]:
df_numeric = pd.DataFrame({
    'A': [1, 2],
    'B': [3, 4]
})

df_numeric.applymap(lambda x:x*2)

  df_numeric.applymap(lambda x:x*2)


Unnamed: 0,A,B
0,2,6
1,4,8




### `np.where()` ‚Äî **Fast vectorized condition** 

#### üìå When to use

* Conditional logic on arrays / columns
* Best replacement for many `apply` use-cases


### üëâ Parameters

| Parameter   | Meaning                     |
| ----------- | --------------------------- |
| `condition` | Boolean condition           |
| `x`         | Value if condition is True  |
| `y`         | Value if condition is False |

üìå **Fully vectorized ‚Üí fastest option**

In [50]:
df['QP']=np.where(df['Experience'] >= 5,3,1)


---
## __13. Export / Import in Pandas__

#### üîç Quick Comparison

| Method         | Direction | File Type | Common Use          |
| -------------- | --------- | --------- | ------------------- |
| `to_csv()`     | Export    | CSV       | Lightweight storage |
| `to_excel()`   | Export    | Excel     | Reports / sharing   |
| `read_csv()`   | Import    | CSV       | Large datasets      |
| `read_excel()` | Import    | Excel     | Business data       |

---


##  `df.to_csv()` ‚Äî **Save DataFrame to CSV**

### üìå When to use

* Store cleaned / processed data
* Share data between systems

### üëâ Sample DataFrame

```python
import pandas as pd

df = pd.DataFrame({
    'id': [1, 2, 3],
    'name': ['Ali', 'Sara', 'Usman'],
    'marks': [85, 90, 78]
})
```

### üëâ Usage

```python
df.to_csv('students.csv', index=False)
```

### üëâ Important Parameters

| Parameter     | Default | Meaning                   |
| ------------- | ------- | ------------------------- |
| `path_or_buf` | ‚Äî       | File path or buffer       |
| `index`       | `True`  | Write index to file       |
| `sep`         | `','`   | Column separator          |
| `header`      | `True`  | Write column names        |
| `mode`        | `'w'`   | `'w'` write, `'a'` append |
| `encoding`    | `None`  | File encoding (`'utf-8'`) |



###  `df.to_excel()` ‚Äî **Save DataFrame to Excel**

#### üìå When to use

* Reports, business sharing
* Multi-sheet Excel files

#### üëâ Usage

```python
df.to_excel('students.xlsx', index=False, sheet_name='Results')
```

#### üëâ Important Parameters

| Parameter      | Default    | Meaning                      |
| -------------- | ---------- | ---------------------------- |
| `excel_writer` | ‚Äî          | File path or ExcelWriter     |
| `sheet_name`   | `'Sheet1'` | Excel sheet name             |
| `index`        | `True`     | Write index                  |
| `engine`       | `None`     | `'openpyxl'`, `'xlsxwriter'` |
| `startrow`     | `0`        | Row to start writing         |
| `startcol`     | `0`        | Column to start writing      |


###  `pd.read_csv()` ‚Äî **Load CSV into DataFrame**

#### üìå When to use

* Load raw datasets
* Read logs, exports, public data

#### üëâ Usage

```python
df = pd.read_csv('students.csv')
```

#### üëâ Important Parameters

| Parameter            | Default   | Meaning                    |
| -------------------- | --------- | -------------------------- |
| `filepath_or_buffer` | ‚Äî         | File path                  |
| `sep`                | `','`     | Column delimiter           |
| `header`             | `'infer'` | Row to use as column names |
| `names`              | `None`    | Custom column names        |
| `index_col`          | `None`    | Column as index            |
| `usecols`            | `None`    | Select specific columns    |
| `dtype`              | `None`    | Force data types           |
| `na_values`          | `None`    | Custom missing values      |


###  `pd.read_excel()` ‚Äî **Load Excel into DataFrame**

#### üìå When to use

* Excel-based datasets
* Multiple sheets

#### üëâ Usage

```python
df = pd.read_excel('students.xlsx', sheet_name='Results')
```

#### üëâ Important Parameters

| Parameter    | Default | Meaning                |
| ------------ | ------- | ---------------------- |
| `io`         | ‚Äî       | File path              |
| `sheet_name` | `0`     | Sheet name or index    |
| `usecols`    | `None`  | Columns to read        |
| `skiprows`   | `None`  | Rows to skip           |
| `nrows`      | `None`  | Number of rows to read |
| `dtype`      | `None`  | Force column types     |
| `engine`     | `None`  | `'openpyxl'`, `'xlrd'` |
