In [2]:
import pandas as pd 
import numpy as np


In [3]:
from sklearn.impute import SimpleImputer

An imputer is used to handle missing data in a dataset. Missing values can skew analysis and lead to incorrect conclusions, so imputers help maintain data integrity. Here's why they're essential:

Data Completeness: Imputers fill in the missing values, ensuring the dataset is complete and usable.

Consistency: They provide a consistent, repeatable way of handling missing data, which helps limit the bias that ad-hoc handling can introduce.

Efficiency: By imputing missing values, you avoid the loss of valuable data that occurs when you drop rows or columns containing missing values.

Algorithm Requirements: Many machine learning algorithms require complete datasets to function properly.

Common imputation strategies include:

Mean/Median/Mode Imputation: Replacing missing values with the mean, median, or mode of the column.

Forward/Backward Fill: Using adjacent values to fill in missing entries.

Predictive Modeling: Using algorithms to predict and fill missing values based on other data.

Tools like `SimpleImputer` from scikit-learn make this process straightforward; the cells below apply it to a small example dataset.



In [4]:
!pip install scikit-learn



In [6]:
# Load the example dataset; the relative path is resolved against the
# notebook's working directory.
csv_path = "Data.csv"
df = pd.read_csv(csv_path)

# NUMERIC NULL VALUES HANDLING

In [7]:
# Display the frame exactly as loaded, before any missing-value handling.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
2,,30.0,54000.0,
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# 1st Method (SIMPLE DROPNA)






In [9]:
# First approach: drop every row that contains at least one missing value.
# dropna returns a new frame for display; df itself is not reassigned here.
df.dropna(axis=0, how="any")

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# 2nd Method (IMPUTER)

In [10]:
# Build an imputer that treats NaN as the missing-value marker and uses the
# 'mean' strategy to fill it.
imputer = SimpleImputer(
    strategy="mean",
    missing_values=np.nan,
)

In [11]:
# Inspect the Salary column on its own (all rows, selected by label).
df.loc[:, "Salary"]

0    72000.0
1    48000.0
2    54000.0
3    61000.0
4        NaN
5    58000.0
6    52000.0
7    79000.0
8    83000.0
9    67000.0
Name: Salary, dtype: float64

## The `iloc` indexer in pandas is used for integer-location based indexing, i.e. selecting rows and columns by position.

Here’s what you can do with iloc:

Select Rows: df.iloc[0] selects the first row of the DataFrame.

Select Columns: df.iloc[:, 1] selects the second column of the DataFrame.

Select Specific Rows and Columns: df.iloc[0, 1] selects the element at the first row and second column.

Slice DataFrames: df.iloc[0:3, 1:4] selects the first three rows and columns from the second to the fourth.

It’s powerful for precise control over your data selection, especially when you need to work based on positions rather than labels.



In [12]:
# Snapshot of the columns at positions 1 and 2, taken before any imputation.
# .values may or may not share memory with df depending on the pandas version,
# so take an explicit copy: later in-place changes to df can then never leak
# into this array.
x = df.iloc[:, 1:3].values.copy()

In [13]:
# Mean-impute the columns at positions 1 and 2 and write the result back
# into df in place.
numeric_values = df.iloc[:, 1:3].values

imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
imputer.fit(numeric_values)  # learn the per-column fill values

df.iloc[:, 1:3] = imputer.transform(numeric_values)

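## What `fit` and `transform` do

The `fit` method computes, for each selected column, the statistic named by `strategy`. In the cell above (`strategy='mean'`, columns `df.iloc[:, 1:3]`) that is the column mean. In the categorical section below (`strategy='most_frequent'`) it is the most frequent value in the first column of your DataFrame `df`: the `df.iloc[:, :1].values` part selects all rows and the first column, and `.values` converts it to a NumPy array. Essentially, `fit` prepares the imputer, and `transform` then replaces the missing entries with the statistic it found.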



In [14]:
# Display df after the mean imputation of the columns at positions 1 and 2.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
2,,30.0,54000.0,
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [15]:
# Display x, the array captured from df before the imputation step ran;
# the recorded output still contains nan entries.
x

array([[4.4e+01, 7.2e+04],
       [2.7e+01, 4.8e+04],
       [3.0e+01, 5.4e+04],
       [3.8e+01, 6.1e+04],
       [4.0e+01,     nan],
       [3.5e+01, 5.8e+04],
       [    nan, 5.2e+04],
       [4.8e+01, 7.9e+04],
       [5.0e+01, 8.3e+04],
       [3.7e+01, 6.7e+04]])

# CATEGORICAL NULL VALUES HANDLING


In [23]:
# Fill missing entries of the first column with that column's most frequent
# value and write the result back into df in place.
# NOTE(review): only column 0 is imputed here; the recorded output still shows
# a missing 'Purchased' value in row 2 — confirm whether that is intentional.
first_column_values = df.iloc[:, :1].values

imputer = SimpleImputer(strategy="most_frequent", missing_values=np.nan)
imputer.fit(first_column_values)

df.iloc[:, :1] = imputer.transform(first_column_values)

In [24]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
2,France,30.0,54000.0,
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes
