# Lab 2 : Data Preprocessing tools

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Import Dataset

In [2]:
dataset = pd.read_csv("Data.csv")

In [3]:
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [4]:
dataset.tail()

Unnamed: 0,Country,Age,Salary,Purchased
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [5]:
dataset.shape

(10, 4)

In [6]:
dataset.columns

Index(['Country', 'Age', 'Salary', 'Purchased'], dtype='object')

In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [8]:
dataset.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


## Preprocessing Steps

### Step 1: Divide dataframe into Independent variable/input and dependent/output features

In [9]:
X = dataset.iloc[:,:-1]
Y = dataset.iloc[:,-1]

In [10]:
print(X)

   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
5   France  35.0  58000.0
6    Spain   NaN  52000.0
7   France  48.0  79000.0
8  Germany  50.0  83000.0
9   France  37.0  67000.0


In [11]:
print(Y)

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


### Step 2: Handle the missing values in Dataset

In [12]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer.fit(X.iloc[:,1:3])
X.iloc[:,1:3] = imputer.transform(X.iloc[:,1:3])

In [13]:
print(X)

   Country        Age        Salary
0   France  44.000000  72000.000000
1    Spain  27.000000  48000.000000
2  Germany  30.000000  54000.000000
3    Spain  38.000000  61000.000000
4  Germany  40.000000  63777.777778
5   France  35.000000  58000.000000
6    Spain  38.777778  52000.000000
7   France  48.000000  79000.000000
8  Germany  50.000000  83000.000000
9   France  37.000000  67000.000000


### Step 3: Encoding Categorical Data

In [14]:
dataset['Country'].value_counts()

France     4
Spain      3
Germany    3
Name: Country, dtype: int64

In [15]:
dataset['Purchased'].value_counts()

No     5
Yes    5
Name: Purchased, dtype: int64

**A. Encoding the Independent Variable (i/p feature/X)**

In X we have Country as categorical feature

It has 3 categories

Hence used One hot encoder

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers = [('encoder',OneHotEncoder(),[0])], remainder = 'passthrough')

X = np.array(ct.fit_transform(X))


In [17]:
print(X)

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01
  7.20000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01
  4.80000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01
  5.40000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01
  6.10000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01
  6.37777778e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01
  5.80000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01
  5.20000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01
  7.90000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01
  8.30000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01
  6.70000000e+04]]


**B. Encoding the Dependent Variable (o/p feature/target/X)**

Here are only two categories in Y i.e. Yes/No

Hence used Label encoder

In [18]:
from sklearn.preprocessing import LabelEncoder

le =LabelEncoder()

Y = le.fit_transform(Y)

In [19]:
print(Y)

[0 1 0 0 1 1 0 1 0 1]


### Step 4: Splitting Data into Training and Testing

In [20]:
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2, random_state = 3)

In [21]:
print(X_train)

[[0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01
  4.80000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01
  5.40000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01
  6.70000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01
  5.20000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01
  7.90000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01
  7.20000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01
  6.10000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01
  8.30000000e+04]]


### Step 5: Feature Scaling

In [22]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:,3:] = sc.fit_transform(X_train[:,3:])
X_test[:,3:] = sc.fit_transform(X_test[:,3:])

In [23]:
print(X_test)

[[ 1.  0.  0. -1. -1.]
 [ 0.  1.  0.  1.  1.]]


# Test your Knowledge

### Q.2	Write a code to Feature Scaling our dataset numerical variable using MinMaxScaler 

In [24]:
from sklearn.preprocessing import MinMaxScaler

# Create MinMaxScaler object
scaler = MinMaxScaler()

# Fit the scaler on X_train
scaler.fit(X_train)

# Transform X_train and X_test using the fitted scaler
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Print the scaled data
print("MinMaxScaler Scaled X_train:")
print(scaled_X_train)
print("\n=========================================================== \n")
print("MinMaxScaler Scaled X_test:")
print(scaled_X_test)

MinMaxScaler Scaled X_train:
[[0.         0.         1.         0.         0.        ]
 [0.         1.         0.         0.13043478 0.17142857]
 [1.         0.         0.         0.43478261 0.54285714]
 [0.         0.         1.         0.51207729 0.11428571]
 [1.         0.         0.         0.91304348 0.88571429]
 [1.         0.         0.         0.73913043 0.68571429]
 [0.         0.         1.         0.47826087 0.37142857]
 [0.         1.         0.         1.         1.        ]]


MinMaxScaler Scaled X_test:
[[1.         0.         0.         0.19736221 0.12649439]
 [0.         1.         0.         0.85457015 0.81636276]]


### Q. 3 For the below student dataset remove missing values from column ‘gender’ and ‘marks’

In [25]:
import pandas as pd 
import numpy as np 
students = [[85, 'M', 'verygood'], 
            [95, 'F', 'excellent'], 
            [75, None,'good'], 
            [np.NaN, 'M', 'average'], 
            [70, 'M', 'good'], 
            [np.NaN, None, 'verygood'], 
            [92, 'F', 'verygood'],
            [98, 'M', 'excellent']] 
dfstd = pd.DataFrame(students) 
dfstd.columns = ['marks', 'gender', 'result'] 

print(dfstd)


   marks gender     result
0   85.0      M   verygood
1   95.0      F  excellent
2   75.0   None       good
3    NaN      M    average
4   70.0      M       good
5    NaN   None   verygood
6   92.0      F   verygood
7   98.0      M  excellent


##### Using Step 2: Handle the missing values in Dataset

In [26]:
from sklearn.impute import SimpleImputer

In [27]:
# Handling the missing values 
# for Marks column adding mean values to missing places.

imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer.fit(dfstd.iloc[:,0:1])
dfstd.iloc[:,0:1] = imputer.transform(dfstd.iloc[:,0:1])

In [28]:
print(dfstd)

       marks gender     result
0  85.000000      M   verygood
1  95.000000      F  excellent
2  75.000000   None       good
3  85.833333      M    average
4  70.000000      M       good
5  85.833333   None   verygood
6  92.000000      F   verygood
7  98.000000      M  excellent


In [29]:
# Handling the missing values 
# for gender column adding most_frequent values to missing places.

imputer1 = SimpleImputer(missing_values = None, strategy = 'most_frequent') 
imputer1.fit(dfstd.iloc[:,1:2])
dfstd.iloc[:,1:2] = imputer1.transform(dfstd.iloc[:,1:2])

In [30]:
print(dfstd)

       marks gender     result
0  85.000000      M   verygood
1  95.000000      F  excellent
2  75.000000      M       good
3  85.833333      M    average
4  70.000000      M       good
5  85.833333      M   verygood
6  92.000000      F   verygood
7  98.000000      M  excellent


### Q. 4 For dataset given in above Apply the proper Categorical Encoder to encode column ‘gender’ and ‘result’.

##### Using Step 3: Encoding Categorical Data

**for Gender:** 

Here we have only two categories i.e. Male/Female

Hence used Label encoder


In [31]:
dfstd['gender'].value_counts()

M    6
F    2
Name: gender, dtype: int64

In [32]:
from sklearn.preprocessing import LabelEncoder

label =LabelEncoder()

dfstd['gender'] = label.fit_transform(dfstd['gender'])

**for Result:**

Here we have Four categories i.e. verygood,excellent,good,average

Hence used OneHot encoder

0 => average ; 1 => excellent ; 2 => good ; 3 => verygood

In [33]:
dfstd['result'].value_counts()

verygood     3
excellent    2
good         2
average      1
Name: result, dtype: int64

In [34]:
# from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Apply OneHotEncoder to 'result' column
encode = OneHotEncoder(sparse=False)
encoded_result = encode.fit_transform(dfstd[['result']])
dfstd = dfstd.drop(columns=['result'])  
dfstd = pd.concat([dfstd, pd.DataFrame(encoded_result)], axis=1)  

**Output**

In [35]:
dfstd

Unnamed: 0,marks,gender,0,1,2,3
0,85.0,1,0.0,0.0,0.0,1.0
1,95.0,0,0.0,1.0,0.0,0.0
2,75.0,1,0.0,0.0,1.0,0.0
3,85.833333,1,1.0,0.0,0.0,0.0
4,70.0,1,0.0,0.0,1.0,0.0
5,85.833333,1,0.0,0.0,0.0,1.0
6,92.0,0,0.0,0.0,0.0,1.0
7,98.0,1,0.0,1.0,0.0,0.0
