## Step 1: Importing the libraries

In [104]:
import numpy as np
from ml_code.utils import load_data

## Step 2: Importing the dataset

In [105]:
# Load the raw data through the project helper imported from ml_code.utils.
# NOTE(review): presumably this returns a pandas DataFrame (it is used with
# .head() and .iloc in this notebook) — confirm against the helper.
dataset = load_data("basic_data.csv")
# Preview the first rows to check the columns and spot missing values.
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


### Step 2.1: Extracting features and labels

In [106]:
# Feature matrix: all rows, every column except the last one, as a NumPy array.
features = dataset.iloc[:, :-1].to_numpy()
features

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [107]:
# Target vector: the last column only (the 'Purchased' Yes/No flag), as a NumPy array.
labels = dataset.iloc[:, -1].to_numpy()
labels

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Step 3: Preprocessing the data

### Step 3.1: Handling missing data

In [108]:
from sklearn.impute import SimpleImputer

# Fill the NaN entries in columns 1 and 2 (Age and Salary) with the column mean.
# fit_transform learns the means and fills the gaps in a single call; the result
# is written back into `features`, so that array is modified in place.
# NOTE(review): the means are computed over all rows, before the train/test
# split in Step 4, so rows that end up in the test set contribute to them.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
features[:, 1:3] = imputer.fit_transform(features[:, 1:3])

features

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### Step 3.2: Encoding the features

In [109]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column 0 (Country) and carry columns 1 and 2
# (Age, Salary) through unchanged. The result lists the one-hot columns first,
# followed by the passthrough columns, matching the order of `transformers`.
preprocessor = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(), [0]),
        ("passthrough", "passthrough", [1, 2]),
    ]
)

# NOTE(review): the encoder is fit on all rows, before the train/test split in Step 4.
features_encoded = preprocessor.fit_transform(features)
features_encoded

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

### Step 3.3: Encoding the labels

In [110]:
from sklearn.preprocessing import LabelEncoder

# Convert the string labels into integer codes; for this data 'No' becomes 0
# and 'Yes' becomes 1. The fitted encoder is kept in `label_encoder`.
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

In [111]:
# Display the integer-encoded labels.
labels_encoded

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Step 4: Splitting the dataset into the Training set and Test set

In [112]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. random_state=0 seeds the shuffle so
# the same split is produced on every run.
features_train, features_test, labels_train, labels_test = train_test_split(
    features_encoded, labels_encoded, test_size=0.2, random_state=0
)

# Sanity-check the shapes of the resulting feature and label arrays.
print(
    f"Training data shape: {features_train.shape}\nTest data shape: {features_test.shape}"
)
print(
    f"Training labels shape: {labels_train.shape}\nTest labels shape: {labels_test.shape}"
)

# Show one encoded training row: three one-hot columns followed by Age and Salary.
print(f"Example of training data: {features_train[0]}")

Training data shape: (8, 5)
Test data shape: (2, 5)
Training labels shape: (8,)
Test labels shape: (2,)
Example of training data: [0.0 1.0 0.0 40.0 63777.77777777778]


## Step 5: Feature Scaling

The 'Age' and 'Salary' columns have very different ranges of values: 'Age' ranges from 27 to 50, while 'Salary' ranges from 48000 to 83000. Min-Max scaling rescales each feature to a common range (0 to 1 by default), making the two features comparable and preventing 'Salary' from dominating 'Age' purely because of its larger magnitude. The scaler is fit on the training set only and then applied to the test set, so scaled test values may fall slightly outside the 0–1 range.

In [114]:
from sklearn.preprocessing import MinMaxScaler

# Rescale columns 3 and 4 (Age, Salary) with Min-Max scaling and leave the
# one-hot columns 0-2 untouched. The transformed matrix lists the outputs in
# the order of `transformers`, so the column layout changes to
# [Age, Salary, one-hot x3].
scaler = ColumnTransformer(
    transformers=[
        ("scaler", MinMaxScaler(), [3, 4]),
        ("passthrough", "passthrough", [0, 1, 2]),
    ]
)

# Learn the min/max from the training rows only, then reuse those statistics on
# the test rows so that no test information leaks into the scaling.
features_train_scaled = scaler.fit_transform(features_train)
features_test_scaled = scaler.transform(features_test)

# End the cell with the expression to display: an assignment alone produces no
# output, so without this line a fresh Restart & Run All shows nothing here.
features_train_scaled

array([[0.6190476190476191, 0.5089605734767026, 0.0, 1.0, 0.0],
       [0.4761904761904763, 0.612903225806452, 1.0, 0.0, 0.0],
       [0.0, 0.0, 0.0, 0.0, 1.0],
       [0.5608465608465609, 0.12903225806451624, 0.0, 0.0, 1.0],
       [1.0, 1.0, 1.0, 0.0, 0.0],
       [0.5238095238095237, 0.4193548387096775, 0.0, 0.0, 1.0],
       [0.8095238095238093, 0.774193548387097, 1.0, 0.0, 0.0],
       [0.38095238095238093, 0.3225806451612905, 1.0, 0.0, 0.0]],
      dtype=object)