### Pandas - Missing Values

In [1]:
import pandas as pd
import numpy as np

### Functions

In [None]:
1. Detect Missing Values:
    1. df.isna()
    2. df.isnull()
    3. df.isna().sum()
    4. df.isna().mean()
    5. df.isna().mean() * 100
    
2. Drop Null Values:
    1. df.dropna():
        1. df.dropna(thresh=df.shape[1], axis = 0)
        2. df.dropna(thresh=df.shape[0], axis = 1)
        
    2. df.drop():
        df.drop('ColName', axis = 1)
        df.drop(['Col1', 'Col2', ...,'ColN'], axis = 1)
        df.drop(['Row1', 'Row2',..., 'RowN'], axis = 0)
    
    
3. Imputation:
    1. df.fillna()
        Replace Missing values by using Stats:

        Continuous Data:
            1. df['ColName'].mean()
            2. df['ColName'].median()

        Categorical Data:
            1. df['ColName'].mode()[0]

In [2]:
array = np.random.randint(5,50, size = (7,4))
array

array([[48, 35, 15, 33],
       [14, 48, 38, 31],
       [27, 13, 16, 43],
       [29, 42, 44, 38],
       [34, 25, 14, 12],
       [ 6, 26, 24, 30],
       [18, 19, 45, 27]])

In [3]:
df = pd.DataFrame(array, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
0,48,35,15,33
1,14,48,38,31
2,27,13,16,43
3,29,42,44,38
4,34,25,14,12
5,6,26,24,30
6,18,19,45,27


In [4]:
emp_df = pd.read_csv("Emp_data.csv")
emp_df

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60,13.68,168251,Denver
1,940761,Brenda,47.02,60,9.01,51063,Stonewall
2,428945,Joe,54.15,68,0.98,50155,Michigantown
3,408351,Diane,39.67,51,18.30,180294,Hydetown
4,193819,Benjamin,40.31,58,4.01,117642,Fremont
...,...,...,...,...,...,...,...
95,639892,Jose,22.82,89,1.05,129774,Biloxi
96,704709,Harold,32.61,77,5.93,156194,Carol Stream
97,461593,Nicole,52.66,60,28.53,95673,Detroit
98,392491,Theresa,29.60,57,6.99,51015,Mc Grath


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       7 non-null      int32
 1   B       7 non-null      int32
 2   C       7 non-null      int32
 3   D       7 non-null      int32
dtypes: int32(4)
memory usage: 240.0 bytes


In [6]:
emp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Emp ID          100 non-null    int64  
 1   First Name      100 non-null    object 
 2   Age in Yrs      100 non-null    float64
 3   Weight in Kgs   100 non-null    int64  
 4   Age in Company  100 non-null    float64
 5   Salary          100 non-null    int64  
 6   City            100 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 5.6+ KB


### Detect Missing Values

In [7]:
df.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False
6,False,False,False,False


In [9]:
df.isnull()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False
6,False,False,False,False


In [10]:
emp_df.isna()

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
95,False,False,False,False,False,False,False
96,False,False,False,False,False,False,False
97,False,False,False,False,False,False,False
98,False,False,False,False,False,False,False


In [12]:
df.isna().sum()

A    0
B    0
C    0
D    0
dtype: int64

In [13]:
df

Unnamed: 0,A,B,C,D
0,48,35,15,33
1,14,48,38,31
2,27,13,16,43
3,29,42,44,38
4,34,25,14,12
5,6,26,24,30
6,18,19,45,27


In [15]:
df.iloc[2:4, 1] = np.nan
df

Unnamed: 0,A,B,C,D
0,48,35.0,15,33
1,14,48.0,38,31
2,27,,16,43
3,29,,44,38
4,34,25.0,14,12
5,6,26.0,24,30
6,18,19.0,45,27


In [17]:
df.loc[[1,2,5],'D'] = np.nan
df

Unnamed: 0,A,B,C,D
0,48,35.0,15,33.0
1,14,48.0,38,
2,27,,16,
3,29,,44,38.0
4,34,25.0,14,12.0
5,6,26.0,24,
6,18,19.0,45,27.0


In [20]:
df.isna().sum()

A    0
B    2
C    0
D    3
dtype: int64

In [23]:
emp_df = pd.read_csv('Emp_data.csv').head(10)
emp_df

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
1,940761,Brenda,,60.0,,51063.0,Stonewall
2,428945,Joe,,,,,Michigantown
3,408351,Diane,,51.0,18.3,,Hydetown
4,193819,Benjamin,40.31,,,117642.0,Fremont
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
6,539712,Nancy,22.14,,0.87,,Atlanta
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
8,477616,Frances,,42.0,,,Delmita
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs


In [26]:
emp_df.isnull().sum()

Emp ID            0
First Name        0
Age in Yrs        4
Weight in Kgs     3
Age in Company    4
Salary            4
City              0
dtype: int64

In [28]:
emp_df.isnull().mean()

Emp ID            0.0
First Name        0.0
Age in Yrs        0.4
Weight in Kgs     0.3
Age in Company    0.4
Salary            0.4
City              0.0
dtype: float64

In [29]:
emp_df.isnull().mean() * 100

Emp ID             0.0
First Name         0.0
Age in Yrs        40.0
Weight in Kgs     30.0
Age in Company    40.0
Salary            40.0
City               0.0
dtype: float64

### Handling of Missing Values

In [None]:
1. drop rows or Columns
2. Imputation Techniques

In [35]:
emp_df = pd.read_csv('Emp_data.csv').head(12)
emp_df

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
1,940761,Brenda,,60.0,,51063.0,Stonewall
2,428945,Joe,,,,,Michigantown
3,408351,Diane,,51.0,18.3,,Hydetown
4,193819,Benjamin,40.31,,,117642.0,Fremont
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
6,539712,Nancy,22.14,,0.87,,Atlanta
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
8,477616,Frances,,42.0,,,Delmita
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs


In [36]:
emp_df.dropna()

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs
11,153989,Jack,22.21,61.0,0.56,82965.0,Las Vegas


In [38]:
emp_df.dropna(thresh=7)

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs
11,153989,Jack,22.21,61.0,0.56,82965.0,Las Vegas


In [40]:
emp_df.dropna(thresh=emp_df.shape[1])

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs
11,153989,Jack,22.21,61.0,0.56,82965.0,Las Vegas


In [41]:
emp_df.dropna(thresh = 6)

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs
10,231469,Ralph,42.5,,8.29,118457.0,Sabetha
11,153989,Jack,22.21,61.0,0.56,82965.0,Las Vegas


In [42]:
emp_df.dropna(thresh = 5) # 5 Non Null Values

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
1,940761,Brenda,,60.0,,51063.0,Stonewall
3,408351,Diane,,51.0,18.3,,Hydetown
4,193819,Benjamin,40.31,,,117642.0,Fremont
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
6,539712,Nancy,22.14,,0.87,,Atlanta
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs
10,231469,Ralph,42.5,,8.29,118457.0,Sabetha
11,153989,Jack,22.21,61.0,0.56,82965.0,Las Vegas


In [44]:
emp_df.dropna(thresh = 4) # 5 Non Null Values

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
1,940761,Brenda,,60.0,,51063.0,Stonewall
3,408351,Diane,,51.0,18.3,,Hydetown
4,193819,Benjamin,40.31,,,117642.0,Fremont
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
6,539712,Nancy,22.14,,0.87,,Atlanta
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
8,477616,Frances,,42.0,,,Delmita
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs
10,231469,Ralph,42.5,,8.29,118457.0,Sabetha


In [45]:
emp_df.dropna(thresh = 3) # 3 Non Null Values

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
1,940761,Brenda,,60.0,,51063.0,Stonewall
2,428945,Joe,,,,,Michigantown
3,408351,Diane,,51.0,18.3,,Hydetown
4,193819,Benjamin,40.31,,,117642.0,Fremont
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
6,539712,Nancy,22.14,,0.87,,Atlanta
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
8,477616,Frances,,42.0,,,Delmita
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs


In [47]:
emp_df.dropna(axis = 0, thresh = 7)

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs
11,153989,Jack,22.21,61.0,0.56,82965.0,Las Vegas


In [48]:
emp_df.dropna()

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs
11,153989,Jack,22.21,61.0,0.56,82965.0,Las Vegas


In [49]:
emp_df.isna().sum()

Emp ID            0
First Name        0
Age in Yrs        4
Weight in Kgs     4
Age in Company    4
Salary            4
City              0
dtype: int64

In [51]:
emp_df = pd.read_csv('Emp_data.csv').head(12)
emp_df

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
1,940761,Brenda,,60.0,,51063.0,Stonewall
2,428945,Joe,,,,,Michigantown
3,408351,Diane,,51.0,18.3,,Hydetown
4,193819,Benjamin,40.31,,,117642.0,Fremont
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
6,539712,Nancy,22.14,,0.87,72305.0,Atlanta
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
8,477616,Frances,,42.0,34.52,60918.0,Delmita
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs


In [52]:
emp_df.isna().sum()

Emp ID            0
First Name        0
Age in Yrs        4
Weight in Kgs     4
Age in Company    3
Salary            2
City              0
dtype: int64

In [54]:
emp_df.dropna(axis = 1,thresh=emp_df.shape[0])

Unnamed: 0,Emp ID,First Name,City
0,677509,Lois,Denver
1,940761,Brenda,Stonewall
2,428945,Joe,Michigantown
3,408351,Diane,Hydetown
4,193819,Benjamin,Fremont
5,499687,Patrick,Macksburg
6,539712,Nancy,Atlanta
7,380086,Carol,Blanchester
8,477616,Frances,Delmita
9,162402,Diana,Eureka Springs


In [56]:
emp_df.dropna(axis = 1,thresh=12)

Unnamed: 0,Emp ID,First Name,City
0,677509,Lois,Denver
1,940761,Brenda,Stonewall
2,428945,Joe,Michigantown
3,408351,Diane,Hydetown
4,193819,Benjamin,Fremont
5,499687,Patrick,Macksburg
6,539712,Nancy,Atlanta
7,380086,Carol,Blanchester
8,477616,Frances,Delmita
9,162402,Diana,Eureka Springs


In [58]:
emp_df.dropna(axis = 1,thresh=11)

Unnamed: 0,Emp ID,First Name,City
0,677509,Lois,Denver
1,940761,Brenda,Stonewall
2,428945,Joe,Michigantown
3,408351,Diane,Hydetown
4,193819,Benjamin,Fremont
5,499687,Patrick,Macksburg
6,539712,Nancy,Atlanta
7,380086,Carol,Blanchester
8,477616,Frances,Delmita
9,162402,Diana,Eureka Springs


In [60]:
emp_df.dropna(axis = 1,thresh=10)

Unnamed: 0,Emp ID,First Name,Salary,City
0,677509,Lois,168251.0,Denver
1,940761,Brenda,51063.0,Stonewall
2,428945,Joe,,Michigantown
3,408351,Diane,,Hydetown
4,193819,Benjamin,117642.0,Fremont
5,499687,Patrick,72305.0,Macksburg
6,539712,Nancy,72305.0,Atlanta
7,380086,Carol,60918.0,Blanchester
8,477616,Frances,60918.0,Delmita
9,162402,Diana,43010.0,Eureka Springs


In [61]:
emp_df.dropna(axis = 1,thresh=9)

Unnamed: 0,Emp ID,First Name,Age in Company,Salary,City
0,677509,Lois,13.68,168251.0,Denver
1,940761,Brenda,,51063.0,Stonewall
2,428945,Joe,,,Michigantown
3,408351,Diane,18.3,,Hydetown
4,193819,Benjamin,,117642.0,Fremont
5,499687,Patrick,12.02,72305.0,Macksburg
6,539712,Nancy,0.87,72305.0,Atlanta
7,380086,Carol,34.52,60918.0,Blanchester
8,477616,Frances,34.52,60918.0,Delmita
9,162402,Diana,3.44,43010.0,Eureka Springs


In [63]:
emp_df.dropna(axis = 1,thresh=emp_df.shape[0] - 3)

Unnamed: 0,Emp ID,First Name,Age in Company,Salary,City
0,677509,Lois,13.68,168251.0,Denver
1,940761,Brenda,,51063.0,Stonewall
2,428945,Joe,,,Michigantown
3,408351,Diane,18.3,,Hydetown
4,193819,Benjamin,,117642.0,Fremont
5,499687,Patrick,12.02,72305.0,Macksburg
6,539712,Nancy,0.87,72305.0,Atlanta
7,380086,Carol,34.52,60918.0,Blanchester
8,477616,Frances,34.52,60918.0,Delmita
9,162402,Diana,3.44,43010.0,Eureka Springs


### df.drop

In [None]:
df.drop('ColName', axis = 1)
df.drop(['Col1', 'Col2', ...,'ColN'], axis = 1)
df.drop(['Row1', 'Row2',..., 'RowN'], axis = 0)

In [64]:
emp_df

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
1,940761,Brenda,,60.0,,51063.0,Stonewall
2,428945,Joe,,,,,Michigantown
3,408351,Diane,,51.0,18.3,,Hydetown
4,193819,Benjamin,40.31,,,117642.0,Fremont
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
6,539712,Nancy,22.14,,0.87,72305.0,Atlanta
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
8,477616,Frances,,42.0,34.52,60918.0,Delmita
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs


In [65]:
emp_df.isna().mean() * 100

Emp ID             0.000000
First Name         0.000000
Age in Yrs        33.333333
Weight in Kgs     33.333333
Age in Company    25.000000
Salary            16.666667
City               0.000000
dtype: float64

In [67]:
emp_df.drop('Age in Yrs', axis = 1)

Unnamed: 0,Emp ID,First Name,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,60.0,13.68,168251.0,Denver
1,940761,Brenda,60.0,,51063.0,Stonewall
2,428945,Joe,,,,Michigantown
3,408351,Diane,51.0,18.3,,Hydetown
4,193819,Benjamin,,,117642.0,Fremont
5,499687,Patrick,58.0,12.02,72305.0,Macksburg
6,539712,Nancy,,0.87,72305.0,Atlanta
7,380086,Carol,40.0,34.52,60918.0,Blanchester
8,477616,Frances,42.0,34.52,60918.0,Delmita
9,162402,Diana,60.0,3.44,43010.0,Eureka Springs


In [71]:
emp_df.drop(['Age in Yrs','Weight in Kgs'], axis = 1)

Unnamed: 0,Emp ID,First Name,Age in Company,Salary,City
0,677509,Lois,13.68,168251.0,Denver
1,940761,Brenda,,51063.0,Stonewall
2,428945,Joe,,,Michigantown
3,408351,Diane,18.3,,Hydetown
4,193819,Benjamin,,117642.0,Fremont
5,499687,Patrick,12.02,72305.0,Macksburg
6,539712,Nancy,0.87,72305.0,Atlanta
7,380086,Carol,34.52,60918.0,Blanchester
8,477616,Frances,34.52,60918.0,Delmita
9,162402,Diana,3.44,43010.0,Eureka Springs


In [73]:
emp_df = emp_df.drop(['Emp ID','First Name'], axis = 1)
emp_df

Unnamed: 0,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,36.36,60.0,13.68,168251.0,Denver
1,,60.0,,51063.0,Stonewall
2,,,,,Michigantown
3,,51.0,18.3,,Hydetown
4,40.31,,,117642.0,Fremont
5,34.86,58.0,12.02,72305.0,Macksburg
6,22.14,,0.87,72305.0,Atlanta
7,59.12,40.0,34.52,60918.0,Blanchester
8,,42.0,34.52,60918.0,Delmita
9,29.73,60.0,3.44,43010.0,Eureka Springs


In [74]:
emp_df

Unnamed: 0,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,36.36,60.0,13.68,168251.0,Denver
1,,60.0,,51063.0,Stonewall
2,,,,,Michigantown
3,,51.0,18.3,,Hydetown
4,40.31,,,117642.0,Fremont
5,34.86,58.0,12.02,72305.0,Macksburg
6,22.14,,0.87,72305.0,Atlanta
7,59.12,40.0,34.52,60918.0,Blanchester
8,,42.0,34.52,60918.0,Delmita
9,29.73,60.0,3.44,43010.0,Eureka Springs


In [83]:
emp_df = pd.read_csv('Emp_data.csv').head(12)
emp_df

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
1,940761,Brenda,,60.0,,51063.0,Stonewall
2,428945,Joe,,,,,Michigantown
3,408351,Diane,,51.0,18.3,,Hydetown
4,193819,Benjamin,40.31,,,117642.0,Fremont
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
6,539712,Nancy,22.14,,0.87,72305.0,Atlanta
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
8,477616,Frances,,42.0,34.52,60918.0,Delmita
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs


In [84]:
emp_df.drop('Emp ID', axis = 1, inplace=False)

Unnamed: 0,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,Lois,36.36,60.0,13.68,168251.0,Denver
1,Brenda,,60.0,,51063.0,Stonewall
2,Joe,,,,,Michigantown
3,Diane,,51.0,18.3,,Hydetown
4,Benjamin,40.31,,,117642.0,Fremont
5,Patrick,34.86,58.0,12.02,72305.0,Macksburg
6,Nancy,22.14,,0.87,72305.0,Atlanta
7,Carol,59.12,40.0,34.52,60918.0,Blanchester
8,Frances,,42.0,34.52,60918.0,Delmita
9,Diana,29.73,60.0,3.44,43010.0,Eureka Springs


In [85]:
emp_df

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
1,940761,Brenda,,60.0,,51063.0,Stonewall
2,428945,Joe,,,,,Michigantown
3,408351,Diane,,51.0,18.3,,Hydetown
4,193819,Benjamin,40.31,,,117642.0,Fremont
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
6,539712,Nancy,22.14,,0.87,72305.0,Atlanta
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
8,477616,Frances,,42.0,34.52,60918.0,Delmita
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs


In [86]:
emp_df.drop('Emp ID', axis = 1, inplace=True)

In [90]:
emp_df.drop([5,7,9])

Unnamed: 0,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,Lois,36.36,60.0,13.68,168251.0,Denver
1,Brenda,,60.0,,51063.0,Stonewall
2,Joe,,,,,Michigantown
3,Diane,,51.0,18.3,,Hydetown
4,Benjamin,40.31,,,117642.0,Fremont
6,Nancy,22.14,,0.87,72305.0,Atlanta
8,Frances,,42.0,34.52,60918.0,Delmita
10,Ralph,42.5,,8.29,118457.0,Sabetha
11,Jack,22.21,61.0,0.56,82965.0,Las Vegas


In [91]:
emp_df.drop([5,7,9],axis = 0)

Unnamed: 0,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,Lois,36.36,60.0,13.68,168251.0,Denver
1,Brenda,,60.0,,51063.0,Stonewall
2,Joe,,,,,Michigantown
3,Diane,,51.0,18.3,,Hydetown
4,Benjamin,40.31,,,117642.0,Fremont
6,Nancy,22.14,,0.87,72305.0,Atlanta
8,Frances,,42.0,34.52,60918.0,Delmita
10,Ralph,42.5,,8.29,118457.0,Sabetha
11,Jack,22.21,61.0,0.56,82965.0,Las Vegas


In [93]:
list1 = [3,4,5,6,7]
x = list1.append(100)
print(x)

None


In [94]:
df2 = emp_df.drop('First Name', axis = 1, inplace=True)
print(df2)

None


In [97]:
df3 = emp_df.drop('City', axis = 1, inplace=False)
print(df3)
emp_df

    Age in Yrs  Weight in Kgs  Age in Company    Salary
0        36.36           60.0           13.68  168251.0
1          NaN           60.0             NaN   51063.0
2          NaN            NaN             NaN       NaN
3          NaN           51.0           18.30       NaN
4        40.31            NaN             NaN  117642.0
5        34.86           58.0           12.02   72305.0
6        22.14            NaN            0.87   72305.0
7        59.12           40.0           34.52   60918.0
8          NaN           42.0           34.52   60918.0
9        29.73           60.0            3.44   43010.0
10       42.50            NaN            8.29  118457.0
11       22.21           61.0            0.56   82965.0


Unnamed: 0,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,36.36,60.0,13.68,168251.0,Denver
1,,60.0,,51063.0,Stonewall
2,,,,,Michigantown
3,,51.0,18.3,,Hydetown
4,40.31,,,117642.0,Fremont
5,34.86,58.0,12.02,72305.0,Macksburg
6,22.14,,0.87,72305.0,Atlanta
7,59.12,40.0,34.52,60918.0,Blanchester
8,,42.0,34.52,60918.0,Delmita
9,29.73,60.0,3.44,43010.0,Eureka Springs


In [98]:
string = 'python'
s1 = string.upper()
print(s1)

PYTHON


In [100]:
heart_df = pd.read_csv('heart.csv')
heart_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [102]:
x = heart_df.drop('target',axis = 1)
y = heart_df['target']
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

### Imputation Techniques

In [104]:
emp_df = pd.read_csv('Emp_data.csv').head(12)
emp_df.isna().sum()

Emp ID            0
First Name        0
Age in Yrs        4
Weight in Kgs     4
Age in Company    3
Salary            2
City              0
dtype: int64

In [109]:
emp_df.fillna(1000)

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
1,940761,Brenda,1000.0,60.0,1000.0,51063.0,Stonewall
2,428945,Joe,1000.0,1000.0,1000.0,1000.0,Michigantown
3,408351,Diane,1000.0,51.0,18.3,1000.0,Hydetown
4,193819,Benjamin,40.31,1000.0,1000.0,117642.0,Fremont
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
6,539712,Nancy,22.14,1000.0,0.87,72305.0,Atlanta
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
8,477616,Frances,1000.0,42.0,34.52,60918.0,Delmita
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs


In [114]:
emp_df['Age in Yrs'] = emp_df['Age in Yrs'].fillna(35)
emp_df

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
1,940761,Brenda,35.0,60.0,,51063.0,Stonewall
2,428945,Joe,35.0,,,,Michigantown
3,408351,Diane,35.0,51.0,18.3,,Hydetown
4,193819,Benjamin,40.31,,,117642.0,Fremont
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
6,539712,Nancy,22.14,,0.87,72305.0,Atlanta
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
8,477616,Frances,35.0,42.0,34.52,60918.0,Delmita
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs


In [118]:
emp_df['Weight in Kgs'].fillna(55, inplace=True)
emp_df

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
1,940761,Brenda,35.0,60.0,,51063.0,Stonewall
2,428945,Joe,35.0,55.0,,,Michigantown
3,408351,Diane,35.0,51.0,18.3,,Hydetown
4,193819,Benjamin,40.31,55.0,,117642.0,Fremont
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
6,539712,Nancy,22.14,55.0,0.87,72305.0,Atlanta
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
8,477616,Frances,35.0,42.0,34.52,60918.0,Delmita
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs


### Statistics

In [None]:
Mean and Median values are used for continuous Data 

##### Imputation for Continuous Data

In [120]:
emp_df['Age in Company'].mean()

14.02222222222222

In [121]:
emp_df['Age in Company'].median()

12.02

In [122]:
np.mean(emp_df['Age in Company'])

14.02222222222222

In [124]:
emp_df['Age in Company'].median()

12.02

In [125]:
array = np.array([10,14,16,12,17,11,18,19,13])
np.mean(array)

14.444444444444445

In [126]:
array = np.array([10,14,16,12,170,11,18,19,13])
np.mean(array)

31.444444444444443

In [127]:
array = np.array([10,14,16,12,17,11,18,19,13])
np.median(array)

14.0

In [128]:
array = np.array([10,14,16,12,170,11,18,19,13])
np.median(array)

14.0

In [199]:
emp_df = pd.read_csv('Emp_data.csv').head(12)
emp_df.isna().sum()

Emp ID            0
First Name        0
Age in Yrs        4
Weight in Kgs     4
Age in Company    3
Salary            2
City              0
dtype: int64

In [200]:
emp_df

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
1,940761,Brenda,,60.0,,51063.0,Stonewall
2,428945,Joe,,,,,Michigantown
3,408351,Diane,,51.0,18.3,,Hydetown
4,193819,Benjamin,40.31,,,117642.0,Fremont
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
6,539712,Nancy,22.14,,0.87,72305.0,Atlanta
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
8,477616,Frances,,42.0,34.52,60918.0,Delmita
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs


In [201]:
mean_age = emp_df['Age in Yrs'].mean()
mean_age

35.903749999999995

In [202]:
emp_df['Age in Yrs'].fillna(mean_age, inplace=True)

In [203]:
emp_df

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
1,940761,Brenda,35.90375,60.0,,51063.0,Stonewall
2,428945,Joe,35.90375,,,,Michigantown
3,408351,Diane,35.90375,51.0,18.3,,Hydetown
4,193819,Benjamin,40.31,,,117642.0,Fremont
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
6,539712,Nancy,22.14,,0.87,72305.0,Atlanta
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
8,477616,Frances,35.90375,42.0,34.52,60918.0,Delmita
9,162402,Diana,29.73,60.0,3.44,43010.0,Eureka Springs


In [204]:
emp_df['Weight in Kgs'].median()

59.0

In [205]:
emp_df['Weight in Kgs'].fillna(emp_df['Weight in Kgs'].median(), inplace=True)

In [206]:
emp_df['Salary'].fillna(emp_df['Salary'].median(), inplace=True)

In [207]:
emp_df['Age in Company'].fillna(emp_df['Age in Company'].mean(), inplace=True)

In [198]:
emp_df.isna().sum()

Emp ID            0
First Name        0
Age in Yrs        0
Weight in Kgs     0
Age in Company    0
Salary            0
City              0
dtype: int64

In [158]:
emp_df = pd.read_csv('Emp_data.csv').head(12)
emp_df['Age in Yrs'].mode()[0]

22.14

In [162]:
emp_df['Age in Yrs'].iloc[9] = 59.12
emp_df['Age in Yrs']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emp_df['Age in Yrs'].iloc[9] = 59.12


0     36.36
1       NaN
2       NaN
3       NaN
4     40.31
5     34.86
6     22.14
7     59.12
8       NaN
9     59.12
10    42.50
11    22.21
Name: Age in Yrs, dtype: float64

In [164]:
emp_df['Age in Yrs'].mode()

0    59.12
Name: Age in Yrs, dtype: float64

### Imputation for Categorical Data

In [166]:
titanic_df = pd.read_csv('titanic.csv')
titanic_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Gender           2
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [170]:
titanic_df['Gender'].mode()[0]

'male'

In [172]:
titanic_df['Embarked']

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [173]:
titanic_df['Embarked'].mode()[0]

'S'

In [175]:
titanic_df['Embarked'].replace({'S':0,'C':1,'Q':2},inplace = True)

In [177]:
titanic_df['Embarked'].mean()

0.36220472440944884

In [178]:
titanic_df['Embarked'].median()

0.0

In [179]:
titanic_df = pd.read_csv('titanic.csv')
titanic_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Gender           2
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [180]:
titanic_df['Embarked'].fillna(titanic_df['Embarked'].mode()[0], inplace=True)

In [182]:
titanic_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Gender           2
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [183]:
titanic_df['Gender'].fillna(titanic_df['Gender'].mode()[0], inplace=True)

In [185]:
titanic_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Gender           0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [187]:
emp_df

Unnamed: 0,Emp ID,First Name,Age in Yrs,Weight in Kgs,Age in Company,Salary,City
0,677509,Lois,36.36,60.0,13.68,168251.0,Denver
1,940761,Brenda,,60.0,,51063.0,Stonewall
2,428945,Joe,,,,,Michigantown
3,408351,Diane,,51.0,18.3,,Hydetown
4,193819,Benjamin,40.31,,,117642.0,Fremont
5,499687,Patrick,34.86,58.0,12.02,72305.0,Macksburg
6,539712,Nancy,22.14,,0.87,72305.0,Atlanta
7,380086,Carol,59.12,40.0,34.52,60918.0,Blanchester
8,477616,Frances,,42.0,34.52,60918.0,Delmita
9,162402,Diana,59.12,60.0,3.44,43010.0,Eureka Springs


  emp_df.std()


Emp ID            230983.385602
Age in Yrs            14.198590
Weight in Kgs          8.635475
Age in Company        13.047018
Salary             38753.299000
dtype: float64

In [210]:
emp_df['Age in Yrs'].std()

9.621435290301262

In [212]:
emp_df['Age in Yrs'].var()

92.57201704545453

In [214]:
emp_df['Age in Yrs'].min()

22.14

In [216]:
emp_df['Age in Yrs'].max()

59.12

In [217]:
emp_df.describe()

Unnamed: 0,Emp ID,Age in Yrs,Weight in Kgs,Age in Company,Salary
count,12.0,12.0,12.0,12.0,12.0
mean,424528.833333,35.90375,55.666667,14.022222,82703.666667
std,230983.385602,9.621435,7.315405,11.126534,35388.597412
min,153989.0,22.14,40.0,0.56,43010.0
25%,222056.5,33.5775,56.25,7.0775,60918.0
50%,418648.0,35.90375,59.0,13.851111,72305.0
75%,509693.25,37.3475,60.0,15.091667,91634.25
max,940761.0,59.12,61.0,34.52,168251.0
