In [1]:
import pandas as pd

In [2]:
# Read the housing CSV; the relative path is resolved against the notebook's working directory.
dataset = pd.read_csv('housing.csv')

In [3]:
# Preview the first few rows to see the available columns and typical values.
dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# Summarize dtypes and non-null counts; a non-null count below the number of entries signals missing data.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
# Tally the missing entries in each column (isna and isnull are aliases).
dataset.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

## There are 3 ways to handle missing values in a dataset:

1) **Dropna** method
2) **Drop** method
3) **Fillna** method


1) DROPNA(): removes the rows (by default) or the columns that contain missing data.

You can control how it works:

**df.dropna()**	Removes rows with any missing value

**df.dropna(axis=1)**	Removes columns with any missing value

**df.dropna(subset=["A"])**	Removes rows where column A has missing values

**df.dropna(how="all")**	Removes only rows where all values are missing

In [None]:
# Keep only the rows that have a value in total_bedrooms.
# `subset` limits the missing-value check to the listed column(s); the result
# is re-bound to `dataset` instead of mutating the frame in place.
dataset = dataset.dropna(subset=['total_bedrooms'])

In [19]:
# Re-count missing entries per column after removing the incomplete rows.
dataset.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

2) DROP: The drop() method removes rows or columns from a DataFrame by label. Here it is used to discard the entire column that contains missing values.

**df.drop(labels, axis=0 or 1, inplace=False)**

labels	The row index or column name(s) you want to drop

axis=0	Drop rows (default)

axis=1	Drop columns

inplace=True	Modify the original DataFrame

inplace=False	Return a new DataFrame

In [32]:
# Remove the whole total_bedrooms column (axis=1 targets columns).
# With the default inplace=False, drop() returns a new frame, which is
# re-bound to `dataset` here.
dataset = dataset.drop(labels=['total_bedrooms'], axis=1)

In [33]:
# Display the frame to confirm total_bedrooms is no longer among the columns.
dataset

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,741.0,349.0,1.8672,84700.0,INLAND


3) Fillna(): .fillna() is a method used to fill missing values (NaN) in a DataFrame or Series with a specific value.

In [36]:
# Start again from the raw file: the dropna()/drop() demos above removed the
# rows with missing values and then the whole total_bedrooms column, so the
# column lookup below would raise KeyError and there would be nothing to fill.
dataset = pd.read_csv('housing.csv')

# Median of the observed values (NaNs are skipped by default).
median = dataset["total_bedrooms"].median()  # option 3

# Assign the filled column back rather than calling fillna(inplace=True) on
# the selected column: recent pandas warns about that chained in-place form,
# and it does not update the frame under copy-on-write.
dataset["total_bedrooms"] = dataset["total_bedrooms"].fillna(median)

In [37]:
# Verify that no column reports missing entries after the median fill.
dataset.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

The same fill can also be written in a single line:

In [40]:
# One-line form: compute the median and fill in a single statement.
# median must be *called* -- passing the bound method `.median` would hand
# fillna the method object itself as the fill value instead of a number.
# The result is assigned back instead of using a chained inplace=True call.
dataset['total_bedrooms'] = dataset['total_bedrooms'].fillna(dataset['total_bedrooms'].median())

In [41]:
# Count missing entries per column once more after the one-line fill.
dataset.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

## Another way is to use the scikit-learn library

# SimpleImputer

In [20]:
from sklearn.impute import SimpleImputer

# The median strategy only works on numeric data, so leave out the text
# column (ocean_proximity has dtype object) before fitting.
dataset_num = dataset.drop("ocean_proximity", axis=1)

# Learn each column's median and fill the gaps in one step.
imputer = SimpleImputer(strategy="median")
X = imputer.fit_transform(dataset_num)

# The imputer returns a plain NumPy array. Restore the column names AND the
# original row labels so the result stays aligned with `dataset`; without
# index=..., the rows would be renumbered 0..n-1, which no longer matches
# once rows have been removed by an earlier dropna().
dataset_imputed = pd.DataFrame(X, columns=dataset_num.columns, index=dataset_num.index)

print("After imputation:\n", dataset_imputed.isnull().sum())

After imputation:
 longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
dtype: int64


In [22]:
# Per-column count of missing entries in the imputed frame.
dataset_imputed.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
dtype: int64