## Segment 2 - Treating missing values

In [1]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

### Figuring out what data is missing

In [2]:
# Load the mtcars dataset from the CSV file that sits next to this notebook.
# read_csv reference: https://www.datacamp.com/tutorial/pandas-read-csv
# NOTE: the first CSV column (car names) has no header, so pandas labels it
# 'Unnamed: 0'; passing index_col=0 would use it as the row index instead.
d1 = pd.read_csv('./mtcars.csv')
d1.tail(n=2)

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
30,Maserati Bora,15.0,8,301.0,335,3.54,3.57,14.6,0,1,5,8
31,Volvo 142E,21.4,4,121.0,109,4.11,2.78,18.6,1,1,4,2


In [13]:
# np.nan is the sentinel that pandas recognises as a missing value.
missing = np.nan
series_obj = Series([
    'row 1', 'row 2', missing, 'row 4',
    'row 5', 'row 6', missing, 'row 8',
])
series_obj

0    row 1
1    row 2
2      NaN
3    row 4
4    row 5
5    row 6
6      NaN
7    row 8
dtype: object

In [14]:
# Boolean mask: True wherever series_obj holds a missing value
# (isna is the same method as isnull under its other name).
series_obj.isna()

0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
dtype: bool

In [15]:
# True counts as 1, so summing the mask gives the number of missing entries.
series_obj.isna().sum()

np.int64(2)

In [16]:
# Drop the NaN entries. The result is not assigned, so series_obj itself
# is left unchanged; the surviving rows keep their original index labels.
series_obj.dropna()

0    row 1
1    row 2
3    row 4
4    row 5
5    row 6
7    row 8
dtype: object

### Filling in for missing values

In [17]:
# Seed NumPy's global generator so the random frame is identical on every run.
np.random.seed(25)
# 36 uniform draws arranged as a 6x6 frame with default integer row/column labels.
DF_obj = pd.DataFrame(np.reshape(np.random.rand(36), (6, 6)))
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
3,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
4,0.514244,0.559053,0.03445,0.71993,0.421004,0.436935
5,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [8]:
# Punch holes into the frame so there is something to fill.
# .loc slices by label and includes both endpoints:
# rows 3, 4 and 5 of column 0 become NaN ...
DF_obj.loc[3:5, 0] = missing
# ... and rows 1 through 4 of column 5 become NaN.
DF_obj.loc[1:4, 5] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [9]:
# Replace every NaN with 0; the result is a new frame bound to filled_DF.
filled_DF = DF_obj.fillna(value=0)
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.0
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.0
3,0.0,0.836375,0.481343,0.516502,0.383048,0.0
4,0.0,0.559053,0.03445,0.71993,0.421004,0.0
5,0.0,0.900274,0.669612,0.456069,0.289804,0.525819


In [10]:
# A dict fills per column: NaN in column 0 -> 0.1, NaN in column 5 -> 1.25.
filled_DF = DF_obj.fillna(value={0: 0.1, 5: 1.25})
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,1.25
2,0.447031,0.585445,0.161985,0.520719,0.326051,1.25
3,0.1,0.836375,0.481343,0.516502,0.383048,1.25
4,0.1,0.559053,0.03445,0.71993,0.421004,1.25
5,0.1,0.900274,0.669612,0.456069,0.289804,0.525819


In [11]:
# Display DF_obj again: it still contains the NaNs, because the fillna calls
# above stored their results in filled_DF rather than modifying DF_obj.
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [12]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated and emits a FutureWarning.
fill_DF = DF_obj.ffill()
fill_DF

  fill_DF = DF_obj.fillna(method='ffill')


Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.117376
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.117376
3,0.447031,0.836375,0.481343,0.516502,0.383048,0.117376
4,0.447031,0.559053,0.03445,0.71993,0.421004,0.117376
5,0.447031,0.900274,0.669612,0.456069,0.289804,0.525819


### Counting missing values

In [18]:
# Rebuild the same seeded 6x6 frame so this section starts from a known state.
np.random.seed(25)
DF_obj = DataFrame(np.reshape(np.random.rand(36), (6, 6)))

# Re-insert the missing values (.loc label slices include both endpoints).
DF_obj.loc[3:5, 0] = missing
DF_obj.loc[1:4, 5] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [19]:
# Per-column count of missing values (isna is the same method as isnull).
DF_obj.isna().sum()

Unnamed: 0,0
0,3
1,0
2,0
3,0
4,0
5,4


In [21]:
# Impute column 0 with its own mean (mean() skips NaN by default).
# The result goes into a new frame instead of overwriting DF_obj[0], so DF_obj
# keeps its NaNs for the filtering cells that follow and re-running this cell
# always starts from the same input.
mean_filled_DF = DF_obj.fillna({0: DF_obj[0].mean()})
mean_filled_DF


Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,0.667375,0.836375,0.481343,0.516502,0.383048,
4,0.667375,0.559053,0.03445,0.71993,0.421004,
5,0.667375,0.900274,0.669612,0.456069,0.289804,0.525819


### Filtering out missing values

In [None]:
# Keep only the rows that contain no NaN at all (axis=0, the default, drops rows).
DF_no_NaN = DF_obj.dropna(axis=0)
DF_no_NaN

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376


In [None]:
# axis=1 drops columns instead of rows: every column that contains at least
# one NaN is removed. (A stray read_csv('.csv ') call that pointed at a
# non-existent file and hid this cell's output has been removed.)
DF_no_NaN = DF_obj.dropna(axis=1)
DF_no_NaN

Unnamed: 0,1,2,3,4
0,0.582277,0.278839,0.185911,0.4111
1,0.437611,0.556229,0.36708,0.402366
2,0.585445,0.161985,0.520719,0.326051
3,0.836375,0.481343,0.516502,0.383048
4,0.559053,0.03445,0.71993,0.421004
5,0.900274,0.669612,0.456069,0.289804


# Class exercise 1:
Here's a sample dataset you can use. You can copy this directly into a DataFrame:

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', None],
    'Age': [24, None, 22, 23, None, 29],
    'Score': [85, 70, None, 88, 95, 90],
    'City': ['New York', 'Los Angeles', None, 'Chicago', 'Houston', None]
}

df = pd.DataFrame(data)

print(df)

Tasks:
1. Detecting Missing Values
Check for missing values in the dataset
2. Dropping Missing Values


*   Drop rows with any missing values.
d1=df.dropna(how='any')
*   Drop rows only if all values in the row are missing.

d1=df.dropna(how='all')

d1=df[df.isnull().sum(axis=1)!=len(data.keys())]

*   Drop columns with missing values

3. Filling Missing Values

*   Fill with a fixed value (e.g., 0 or 'Unknown').
*   Fill using forward-fill (propagate last valid value).
*   Fill using backward-fill.

4. Filling with Mean/Median/Mode

*   Fill numerical columns with the mean value.
*   Fill categorical columns with the mode.

5. Replacing Values with replace()

*   Replace specific values such as None or NaN.

















# Class exercise 2:
Use the dataset at https://www.kaggle.com/datasets/gunjanpathak/melb-data, find the missing values, and replace them with mean values.