In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("demo_trains.csv")

df

Unnamed: 0,Train No,Speed,City
0,12001,85,Delhi
1,12951,95,Mumbai
2,12309,110,Kolkata
3,12627,100,Chennai
4,12229,105,Bangalore
5,12801,90,Bhubaneswar
6,12423,115,Lucknow
7,12101,92,Nagpur
8,12791,98,Hyderabad
9,12953,108,Ahmedabad


In [3]:
# adding column in data frame 
# we have 2 methods for that
# using the assignment operator 
# using the insert function (used more professionally)
# df.insert(location, column_name, data)

df['Engine Age'] = df['Train No'] * 0.0007

df.insert(0, 'Updated Train No', (df['Train No'] * 0.7).astype(int))

df

Unnamed: 0,Updated Train No,Train No,Speed,City,Engine Age
0,8400,12001,85,Delhi,8.4007
1,9065,12951,95,Mumbai,9.0657
2,8616,12309,110,Kolkata,8.6163
3,8838,12627,100,Chennai,8.8389
4,8560,12229,105,Bangalore,8.5603
5,8960,12801,90,Bhubaneswar,8.9607
6,8696,12423,115,Lucknow,8.6961
7,8470,12101,92,Nagpur,8.4707
8,8953,12791,98,Hyderabad,8.9537
9,9067,12953,108,Ahmedabad,9.0671


In [4]:
# dropping row/cols
# axis 0 -> row 1 -> col in 2d grid


# df.drop(['Train No'], axis=1, inplace=True)
df.drop(columns = ['Train No'], inplace=True)

df

Unnamed: 0,Updated Train No,Speed,City,Engine Age
0,8400,85,Delhi,8.4007
1,9065,95,Mumbai,9.0657
2,8616,110,Kolkata,8.6163
3,8838,100,Chennai,8.8389
4,8560,105,Bangalore,8.5603
5,8960,90,Bhubaneswar,8.9607
6,8696,115,Lucknow,8.6961
7,8470,92,Nagpur,8.4707
8,8953,98,Hyderabad,8.9537
9,9067,108,Ahmedabad,9.0671


## Handeling missing data
### We have following type of missing datas in pandas
1. NaN (Not a Number) – the most common marker for missing data (from NumPy).
2.  None – also treated as missing in object dtype columns.
3. NaT (Not a Time) – used for missing datetime values.

#### 🧰 How to Detect Missing Data
| Function            | Description                                   |
| ------------------- | --------------------------------------------- |
| `df.isnull()`       | Returns `True` for missing entries            |
| `df.notnull()`      | Returns `True` for non-missing entries        |
| `df.isna()`         | Same as `isnull()`                            |
| `df.isnull().sum()` | Returns sum of missing value exists per column |

#### 🧹 How to Handle Missing Data 
| Method                         | Code Example                         | What It Does                                |
| ------------------------------ | ------------------------------------ | ------------------------------------------- |
| Drop rows with missing data    | `df.dropna()`                        | Removes rows with any `NaN`                 |
| Drop columns with missing data | `df.dropna(axis=1)`                  | Removes columns with any `NaN`              |
| Fill missing values            | `df.fillna(0)`                       | Replaces all `NaN` with 0                   |
| Forward fill                   | `df.ffill())`                        | Fills with previous value                   |
| Backward fill                  | `df.bfill()`                         | Fills with next value                       |
| Fill with column mean          | `df['age'].fillna(df['age'].mean())` | Replaces `NaN` with the mean of that column |

> or we can use interpolate() to fill intelligently


#### When to Use and When to Avoid

✅ **Use Interpolation when:**

  * You are working with **ordered data**, especially time series (sensor readings, stock prices).
  * The missing data is likely to follow the trend of its neighbors.
  * The gaps of missing data are relatively small.

❌ **Avoid Interpolation when:**

  * You are working with **categorical data** (e.g., 'red', 'blue', 'green'). Interpolation has no meaning here.
  * The data is not ordered or has no logical sequence.
  * The gaps of missing data are very large, as the estimations become unreliable.
  * You need to fill `NaN` values at the very beginning or end of a series, as standard interpolation has no second point to use (unless you use `limit_direction='both'` or methods like `bfill`/`ffill`).


In [5]:
df = pd.DataFrame({
    'name': ['Alice', 'Bob', None],
    'age': [25, np.nan, 30],
    'city': ['Delhi', 'Mumbai', None]
})

print('Initial Data')
print(df)

# df.isnull().sum()

df['age'] = df['age'].fillna(df['age'].mean())
df

# df.dropna()
# df.ffill()
# df.bfill()

Initial Data
    name   age    city
0  Alice  25.0   Delhi
1    Bob   NaN  Mumbai
2   None  30.0    None


Unnamed: 0,name,age,city
0,Alice,25.0,Delhi
1,Bob,27.5,Mumbai
2,,30.0,


In [6]:
# using interpolate function

df = pd.DataFrame({
    'A': [1, 2, np.nan, 4, np.nan, np.nan, 7],
    'B': [10, np.nan, 30, 43, np.nan, 67, 70]
})

print('Before Interpolation')
print(df)
# methods: linear, polynomial, time etc

df_linear = df.interpolate()
print("\nLinear Interpolation:")
print(df_linear)

df_poly = df.interpolate(method='polynomial', order=2)
print("\nPolynomial Interpolation (order=2):")
print(df_poly)

# Limit to filling only one consecutive NaN at a time, going forward
df_limited = df.interpolate(limit=1, limit_direction='forward')
print("\nLimited Forward Interpolation:")
print(df_limited)


# for time series data 
time_df = pd.DataFrame({
    'Stock Price': [20.0, np.nan, 21.0, np.nan, 24.0]
}, index=pd.to_datetime(['2025-07-19 10:00', '2025-07-19 10:10',
                        '2025-07-19 10:15', '2025-07-19 10:45',
                        '2025-07-19 11:00']))

print("\nTime Series DataFrame:")
print(time_df)

print("\nTime-based Interpolation:")
print(time_df.interpolate(method='time'))

Before Interpolation
     A     B
0  1.0  10.0
1  2.0   NaN
2  NaN  30.0
3  4.0  43.0
4  NaN   NaN
5  NaN  67.0
6  7.0  70.0

Linear Interpolation:
     A     B
0  1.0  10.0
1  2.0  20.0
2  3.0  30.0
3  4.0  43.0
4  5.0  55.0
5  6.0  67.0
6  7.0  70.0

Polynomial Interpolation (order=2):
     A          B
0  1.0  10.000000
1  2.0  18.931624
2  3.0  30.000000
3  4.0  43.000000
4  5.0  56.700855
5  6.0  67.000000
6  7.0  70.000000

Limited Forward Interpolation:
     A     B
0  1.0  10.0
1  2.0  20.0
2  3.0  30.0
3  4.0  43.0
4  5.0  55.0
5  NaN  67.0
6  7.0  70.0

Time Series DataFrame:
                     Stock Price
2025-07-19 10:00:00         20.0
2025-07-19 10:10:00          NaN
2025-07-19 10:15:00         21.0
2025-07-19 10:45:00          NaN
2025-07-19 11:00:00         24.0

Time-based Interpolation:
                     Stock Price
2025-07-19 10:00:00    20.000000
2025-07-19 10:10:00    20.666667
2025-07-19 10:15:00    21.000000
2025-07-19 10:45:00    23.000000
2025-07-19 11:00:

In [7]:
# sorting

languageAge = pd.DataFrame({
    'Name': ['Java', 'Pyhton', 'Cplusplus', 'Javascript', 'Typescript'],
    'Age': [100, 30, 140, 80, 40],
})

print('Sorted by age: ')
languageAge.sort_values(by=['Name', 'Age'], ascending=True, inplace=True)
languageAge


Sorted by age: 


Unnamed: 0,Name,Age
2,Cplusplus,140
0,Java,100
3,Javascript,80
1,Pyhton,30
4,Typescript,40


In [8]:
# aggregation function - sum, mean, min, max, count, std

df2 = pd.DataFrame({
    'department': ['HR', 'HR', 'IT', 'IT', 'Finance', 'IT'],
    'employee': ['Alice', 'Kevin', 'Bob', 'Charlie', 'David', 'Eva'],
    'salary': [50000, 60000, 70000, 80000, 75000, 90000]
})

# grouped = df2.groupby(['department', 'employee'])
grouped = df2.groupby('department')

for name, group in grouped:
    print(f"Group {name}")
    print(group, end='\n\n')

grouped_salary = grouped['salary']
print('\nDepartment grouped by salaries: ')
print(grouped_salary.mean())

# Custom Aggregation - agg({ pass_colum_here_for_aggregation: list_of_aggregation_function })
# df2.groupby('department').agg({'salary': ['sum', 'mean']})
df2.groupby('department').agg({
    'salary': ['mean', 'sum'],
    'employee': 'count'
})


Group Finance
  department employee  salary
4    Finance    David   75000

Group HR
  department employee  salary
0         HR    Alice   50000
1         HR    Kevin   60000

Group IT
  department employee  salary
2         IT      Bob   70000
3         IT  Charlie   80000
5         IT      Eva   90000


Department grouped by salaries: 
department
Finance    75000.0
HR         55000.0
IT         80000.0
Name: salary, dtype: float64


Unnamed: 0_level_0,salary,salary,employee
Unnamed: 0_level_1,mean,sum,count
department,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Finance,75000.0,75000,1
HR,55000.0,110000,2
IT,80000.0,240000,3
