# Machine Learning - Data Preprocessing

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Preparation step for supervised learning: load the raw table, then split it
# into a feature matrix (X) and a target vector (y).

# 1. Read the CSV file into a DataFrame.
dataset = pd.read_csv('Data.csv')

# 2. Features (independent variables): all rows, every column except the last.
#    `.values` returns the selection as a NumPy array.
feature_frame = dataset.iloc[:, :-1]
X = feature_frame.values

# 3. Target (dependent variable): all rows, last column only, again taken
#    out of pandas via `.values`.
target_series = dataset.iloc[:, -1]
y = target_series.values

In [None]:
# Show the feature matrix as extracted from the dataset.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [None]:
# Show the target vector as extracted from the dataset.
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [None]:
# Fill the missing (NaN) entries in the numeric feature columns.
from sklearn.impute import SimpleImputer  # scikit-learn

# missing_values=np.nan -> NaN is what counts as a missing entry.
# strategy='mean'       -> each gap is replaced by the mean of its own column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# X[:, 1:3] selects columns 1 and 2 (0-based; the slice end is exclusive).
# fit_transform learns the per-column means from that slice and applies them
# in one call; the filled values are written back into the same columns of X.
# NOTE(review): the means are computed from all rows, i.e. before the
# train/test split performed later in this file - confirm that is intended.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [None]:
# Show X after the NaN entries in columns 1-2 were replaced by the column means.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data
Encoding is performed on the categorical values only.

### Encoding the Independent Variable

In [None]:
# One-hot encode the categorical feature in column 0 of X and keep the
# remaining columns unchanged.
from sklearn.compose import ColumnTransformer    # per-column transformations
from sklearn.preprocessing import OneHotEncoder  # category -> 0/1 indicator columns

# transformers: a list of (name, transformer, columns) tuples. The name
#   'encoder' is an arbitrary label; [0] selects the first column.
# remainder='passthrough': columns not listed above are kept as they are
#   instead of being dropped.
# sparse_threshold=0: always return a dense array. With the default
#   threshold the stacked result can come back as a SciPy sparse matrix when
#   it is mostly zeros (e.g. a column with many categories), and np.array()
#   would then wrap that matrix in a 0-d object array instead of producing a
#   2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

# fit_transform learns the categories present in column 0 and returns the
# transformed data; np.array(...) keeps X a plain NumPy array.
X = np.array(ct.fit_transform(X))

In [None]:
# Show X after one-hot encoding column 0.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [None]:
# Encode the target labels in y as integers in the range 0 .. n_classes-1.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# fit_transform learns the distinct labels in y and replaces each one by its
# integer code. Codes follow the sorted label order, so for this dataset
# 'No' -> 0 and 'Yes' -> 1. The fitted `le` is kept so the codes can be mapped
# back to the original labels later.
y = le.fit_transform(y)

# NOTE: the explanation above is written as `#` comments on purpose. A bare
# triple-quoted string placed last in a notebook cell is an expression, so
# the notebook echoes it back as the cell's output.


'\nOutcome: After this transformation, your target variable y will consist solely of integer labels. \nThis is particularly necessary for machine learning models in Scikit-learn that require the target input \nto be numeric. It’s commonly used in classification tasks where the target labels are nominal \n(e.g., types of species, categories of products). This approach simplifies handling categorical labels \nand is a prerequisite for most Scikit-learn classifiers that do not natively handle categorical data.\n'

In [None]:
# Show the integer-encoded target vector.
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [None]:
# Hold out part of the data so the model can be evaluated on rows it was not
# trained on.
from sklearn.model_selection import train_test_split

# test_size=0.2  -> 20% of the rows form the test set, the other 80% the
#                   training set.
# random_state=0 -> fixed seed, so the same split is produced on every run.
# Returned, in order: training features, test features, training targets,
# test targets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Show the training-set features.
print(X_train)

[[0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 37.0 67000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [None]:
# Show the test-set features.
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 50.0 83000.0]]


In [None]:
# Show the training-set targets.
print(y_train)

[1 1 1 0 1 0 0 1]


In [None]:
# Show the test-set targets.
print(y_test)

[0 0]


## Feature Scaling

In [None]:
# Standardise the numeric feature columns (subtract the mean, divide by the
# standard deviation, column by column). Scale-sensitive algorithms such as
# support vector machines and k-nearest neighbours benefit from this.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# [:, 3:] selects the columns from index 3 onward, i.e. the FOURTH column to
# the last; the columns before it are left unscaled.
# NOTE(review): the offset 3 assumes the one-hot step produced exactly three
# indicator columns, as in the X printed earlier - adjust if that changes.
#
# Training set: fit_transform learns each column's mean and standard
# deviation from the training rows and writes the standardised values back.
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])

# Test set: transform only, reusing the statistics learned from the training
# rows, so both sets are scaled consistently. The training columns end up
# with zero mean and unit variance; the test columns are only approximately
# so, because their own statistics are not used.
X_test[:, 3:] = sc.transform(X_test[:, 3:])

# NOTE: the explanation is written as `#` comments on purpose. A bare
# triple-quoted string placed last in a notebook cell is an expression, so
# the notebook echoes it back as the cell's output.

'\nOutcome: Both the training and test datasets are scaled so that each feature (from the third column onward) \nnow has zero mean and unit variance. \nThis makes the algorithm less likely to be skewed by features with larger ranges \nand improves the performance and stability of many machine learning algorithms.\n'

In [None]:
# Show the training features after scaling the columns from index 3 onward.
print(X_train)

[[0.0 1.0 0.0 0.2630675731713538 0.1238147854838185]
 [1.0 0.0 0.0 -0.25350147960148617 0.4617563176278856]
 [0.0 0.0 1.0 -1.9753983221776195 -1.5309334063940294]
 [0.0 0.0 1.0 0.05261351463427101 -1.1114197802841526]
 [1.0 0.0 0.0 1.6405850472322605 1.7202971959575162]
 [0.0 0.0 1.0 -0.08131179534387283 -0.16751412153692966]
 [1.0 0.0 0.0 0.9518263102018072 0.9861483502652316]
 [1.0 0.0 0.0 -0.5978808481167128 -0.48214934111933727]]


In [None]:
# Show the test features after scaling the columns from index 3 onward.
print(X_test)

[[0.0 1.0 0.0 -1.4588292694047795 -0.9016629672292141]
 [0.0 1.0 0.0 1.984964415747487 2.139810822067393]]
