<a href="https://colab.research.google.com/github/Sanjana060101/ML-/blob/main/2_1_data_preprocessing_tools_NOTEBOOK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing Tools

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Load the raw table. Every column except the last is a feature;
# the last column is the target we want to predict.
dataset = pd.read_csv('Data.csv')

# .to_numpy() is the accessor pandas recommends over .values: it always
# returns a NumPy ndarray regardless of the underlying column dtype.
X = dataset.iloc[:, :-1].to_numpy()  # feature matrix (all rows, all but the last column)
y = dataset.iloc[:, -1].to_numpy()   # target vector (all rows, last column only)

*Notes for the two cells above are in the Python notebook.*

In [None]:
# Preview the first rows of the loaded table (features first, target `Purchased` last).
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [None]:
# Raw target vector: still 'Yes'/'No' strings at this point.
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Taking care of missing data

In [None]:
from sklearn.impute import SimpleImputer

# `imputer` is a SimpleImputer instance configured to treat np.nan as "missing"
# and to replace each missing entry with the mean of its own column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Only columns 1 and 2 of X (Age and Salary, i.e. the slice 1:3) are numeric;
# column 0 (Country) is text and has no mean, so it is left out of the slice.
#
# fit()       -> computes the mean of every selected column (Age AND Salary).
# transform() -> returns a copy of those columns with each NaN replaced by
#                the corresponding column mean.
# fit_transform() performs both steps on the same data in one call.
#
# The result is a new array, so it has to be written back into X to update
# the feature matrix.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

***`imputer` is an object of the class `SimpleImputer`.***
***`missing_values=np.nan`: here we specify which values we want to deal with as missing — the empty entries, which are represented as `np.nan`.***


In [None]:
# Check the imputation: no missing values should remain in columns 1-2 of X.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [None]:
from sklearn.compose import ColumnTransformer   # applies transformers to selected columns
from sklearn.preprocessing import OneHotEncoder  # turns a categorical column into 0/1 dummy columns

# ColumnTransformer parameters used here:
#   transformers : list of (name, transformer, columns) tuples. We apply a
#                  OneHotEncoder, labelled 'encoder', to column index 0 (Country).
#   remainder    : what to do with the columns NOT listed in `transformers`.
#                  The default 'drop' would discard Age and Salary, so we use
#                  'passthrough' to keep them unchanged.
#   sparse_threshold=0 : always return a dense array. With the default (0.3)
#                  the result can come back as a SciPy sparse matrix when the
#                  one-hot block is large and mostly zeros; np.array() would
#                  then wrap that matrix in a 0-d object array instead of
#                  producing a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

# fit_transform() learns the categories and encodes the column in one step.
# The encoded dummy columns come first in the output, followed by the
# passthrough columns. Store the result back into X so later cells see the
# encoded feature matrix.
X = np.array(ct.fit_transform(X))

In [None]:
# The one-hot columns for column 0 now come first, followed by the passthrough columns.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [None]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder maps each distinct target label to an integer code.
# With the two classes here ('No' / 'Yes') the codes are simply 0 and 1.
le = LabelEncoder()
le.fit(y)            # learn the set of distinct labels
y = le.transform(y)  # replace each label with its integer code

In [None]:
# Encoded target: integer class labels instead of strings.
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state fixes the shuffle so
# the same split is produced on every run.
#
# train_test_split accepts lists, NumPy arrays, SciPy sparse matrices or
# pandas DataFrames and returns pieces of the same type as its inputs, so
# all four results here are NumPy arrays.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:

# Training features produced by the split.
X_train

array([[0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [None]:
# Held-out test features produced by the split.
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [None]:
# Training labels, row-aligned with X_train.
print(y_train)

[0 1 0 0 1 1 0 1]


In [None]:
# Test labels, row-aligned with X_test.
print(y_test)

[0 1]


## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Two common ways to scale features:
#   standardisation - subtract the mean and divide by the standard deviation
#                     (most values end up roughly between -3 and 3);
#   normalisation   - rescale to the range 0 to 1.
# StandardScaler implements standardisation.
#
# Only the columns from index 3 onward (Age and Salary) are scaled. Columns
# 0-2 are the one-hot dummy variables for the country; scaling them would
# destroy their 0/1 coding.
sc = StandardScaler()

# Learn the mean and standard deviation of each selected column from the
# TRAINING data only, then standardise the training columns with them.
sc.fit(X_train[:, 3:])
X_train[:, 3:] = sc.transform(X_train[:, 3:])

# The test set stands in for new, unseen data, so it must not be fitted on:
# reuse the scaler fitted above and only call transform().
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [None]:
# Scaled training features: columns 3 onward are standardised, the one-hot columns are unchanged.
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [None]:
# Test features scaled with the statistics learned from the training set.
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
