###### Here, we are performing various data cleaning techniques on the given data set. 

In [2]:
# Importing dependencies

import pandas as pd
import numpy as np

## Loading Data 

In [32]:
# Read the CSV with pandas' default missing-value detection; the bare `df`
# on the last line displays the whole frame as the cell output.
df = pd.read_csv("property data.csv")
df

Unnamed: 0,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS
0,104.0,PUTNAM,Y,3
1,197.0,LEXINGTON,N,3
2,,LEXINGTON,N,
3,201.0,BERKELEY,12,1
4,203.0,BERKELEY,Y,3
5,207.0,BERKELEY,Y,
6,,WASHINGTON,,2
7,213.0,TREMONT,Y,--
8,215.0,TREMONT,Y,na


In [8]:
# (rows, columns) of the loaded frame
df.shape

(9, 4)

### Standard Types

In [12]:
# Inspect the street-number column; its missing entries appear as NaN
df['ST_NUM']

0    104.0
1    197.0
2      NaN
3    201.0
4    203.0
5    207.0
6      NaN
7    213.0
8    215.0
Name: ST_NUM, dtype: float64

In [15]:
# Boolean mask over ST_NUM: True where the value is missing (NaN)

df['ST_NUM'].isnull()

0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
8    False
Name: ST_NUM, dtype: bool

### Non-standard Types

In [16]:
# Same check on NUM_BEDROOMS. In the recorded output only rows 2 and 5 are
# flagged; the '--' and 'na' entries in rows 7 and 8 are not.
df['NUM_BEDROOMS'].isnull()

0    False
1    False
2     True
3    False
4    False
5     True
6    False
7    False
8    False
Name: NUM_BEDROOMS, dtype: bool

The output above shows only two null values instead of four, because the placeholders `--` and `na` are not recognized as missing by default, so:

In [17]:
# Extra spellings to treat as missing while parsing the file; pandas applies
# them on top of its built-in list of NA markers.
missing_values = ["n/a", "na", "--"]
df = pd.read_csv("property data.csv", na_values=missing_values)

In [18]:
# Re-check after reloading with the custom na_values: in the recorded output
# rows 7 and 8 are now flagged as missing too.
df['NUM_BEDROOMS'].isnull()

0    False
1    False
2     True
3    False
4    False
5     True
6    False
7     True
8     True
Name: NUM_BEDROOMS, dtype: bool

### Unexpected Types

In [19]:
# Missing-value mask for OWN_OCCUPIED. In the recorded output only row 6 is
# flagged; the numeric entry 12 in row 3 still counts as a valid value.
df['OWN_OCCUPIED'].isnull()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7    False
8    False
Name: OWN_OCCUPIED, dtype: bool

The output above treats the numeric entry (`12` in row 3) as a non-null value, so:

In [20]:
# In the displayed data OWN_OCCUPIED holds 'Y'/'N' flags, so an entry that
# parses as a number (e.g. the 12 in row 3) is an unexpected type and is
# treated as missing.
#
# pd.to_numeric(errors='coerce') turns every non-numeric entry into NaN, so
# notna() is True exactly for the numeric-looking entries. Selecting rows with
# a boolean mask avoids the manual counter, which was only correct when the
# index is 0..n-1, and avoids writing to the column while iterating over it.
# Note: decimal strings such as '12.5' are flagged as well.
is_numeric = pd.to_numeric(df['OWN_OCCUPIED'], errors='coerce').notna()
df.loc[is_numeric, 'OWN_OCCUPIED'] = np.nan

df['OWN_OCCUPIED'].isnull()
        

0    False
1    False
2    False
3     True
4    False
5    False
6     True
7    False
8    False
Name: OWN_OCCUPIED, dtype: bool

### Summarizing

In [25]:
# Number of null values in each column

df.isnull().sum()

ST_NUM          2
ST_NAME         0
OWN_OCCUPIED    2
NUM_BEDROOMS    4
dtype: int64

In [39]:
# True if at least one null value is present anywhere in the frame

df.isnull().values.any()

True

In [28]:
# Total number of null values in the whole frame (sum of the per-column counts)

df.isnull().sum().sum()

8

### Replacing

In [29]:
# ST_NUM before replacement; rows 2 and 6 are NaN in the recorded output
df['ST_NUM']

0    104.0
1    197.0
2      NaN
3    201.0
4    203.0
5    207.0
6      NaN
7    213.0
8    215.0
Name: ST_NUM, dtype: float64

In [31]:
# Replace every null value in ST_NUM with a specific value (125).
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['ST_NUM']: that form works on an
# intermediate selection, emits a FutureWarning in pandas 2.2, and leaves
# `df` unchanged once Copy-on-Write is active.

df['ST_NUM'] = df['ST_NUM'].fillna(125)
df['ST_NUM']

0    104.0
1    197.0
2    125.0
3    201.0
4    203.0
5    207.0
6    125.0
7    213.0
8    215.0
Name: ST_NUM, dtype: float64

In [33]:
# Set a single cell, addressed by row label (2) and column name.
# NOTE(review): the recorded output still shows NaN at row 6, which suggests
# `df` was reloaded (the load cell is numbered In [32]) before this cell ran.
# Confirm by running the cells in order.

df.loc[2, 'ST_NUM'] = 125
df['ST_NUM']

0    104.0
1    197.0
2    125.0
3    201.0
4    203.0
5    207.0
6      NaN
7    213.0
8    215.0
Name: ST_NUM, dtype: float64