In [1]:
# Importing pandas for data manipulation and analysis
import pandas as pd

# Importing numpy for numerical operations and array handling
import numpy as np

# Importing matplotlib for data visualization (e.g., plotting graphs)
import matplotlib.pyplot as plt


In [2]:
# Loading the dataset from a CSV file to begin data processing and analysis
dataset = pd.read_csv('data.csv')


In [3]:
dataset


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
# Selecting all rows and all columns except the last one as input features (independent variables) for model training
x = dataset.iloc[:, :-1].values

In [5]:
# Selecting all rows and only the last column as the target variable (dependent variable)
y = dataset.iloc[:, -1].values


In [6]:
x


array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [7]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [8]:
# Importing SimpleImputer to handle missing data by replacing them with statistical values like mean, median, or most frequent
from sklearn.impute import SimpleImputer


In [9]:
# Creating an imputer object to replace missing values (NaN) with the mean of the respective column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')


In [10]:
# Fitting the imputer on columns 1 and 2 to calculate their mean values for replacing missing data
imputer.fit(x[:, 1:3])


In [11]:
# Applying the calculated mean values to replace missing data in columns 1 and 2
x[:, 1:3] = imputer.transform(x[:, 1:3])


In [13]:
#new values are added in the missing values
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [23]:
# Importing LabelEncoder to convert categorical text data into numeric labels
# Importing OneHotEncoder to create binary columns for each category (one-hot encoding)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [31]:
# Creating a LabelEncoder to convert the 'Country' column (categorical) into numerical labels
label_encoder_x = LabelEncoder()

# Applying label encoding to the first column (e.g., 'Country') in the dataset
x[:,0] = label_encoder_x.fit_transform(x[:,0])

# Creating a OneHotEncoder to convert the 'Country' column into binary (one-hot) encoded format
onehotencoder = OneHotEncoder()

# Applying one-hot encoding to the 'Country' column from the original dataset and converting it into an array
x = onehotencoder.fit_transform(dataset.Country.values.reshape(-1, 1)).toarray().astype(int)


In [32]:
x

array([[1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [1, 0, 0]])

In [30]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [33]:
# Importing train_test_split to split the dataset into training and testing sets for model evaluation
from sklearn.model_selection import train_test_split


In [34]:
# Splitting the dataset into training (80%) and testing (20%) sets to evaluate model performance on unseen data
# random_state=0 ensures reproducibility of the split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)


In [35]:
x_train


array([[0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0]])

In [36]:
x

array([[1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [1, 0, 0]])

In [37]:
# Importing StandardScaler to standardize features by removing the mean and scaling to unit variance
from sklearn.preprocessing import StandardScaler


In [38]:
# Creating a StandardScaler object to standardize feature values
scaler = StandardScaler()

# Fitting the scaler on training data and transforming it to have mean=0 and variance=1
x_train = scaler.fit_transform(x_train)  # Fit scaler on training data and transform it

# Transforming the test data using the same scaler parameters (mean and variance) from training data
x_test = scaler.transform(x_test)  # Use the same scaler to transform test data


In [39]:
x_train

array([[-1.        ,  2.64575131, -0.77459667],
       [ 1.        , -0.37796447, -0.77459667],
       [-1.        , -0.37796447,  1.29099445],
       [-1.        , -0.37796447,  1.29099445],
       [ 1.        , -0.37796447, -0.77459667],
       [-1.        , -0.37796447,  1.29099445],
       [ 1.        , -0.37796447, -0.77459667],
       [ 1.        , -0.37796447, -0.77459667]])

In [40]:
x_test

array([[-1.        ,  2.64575131, -0.77459667],
       [-1.        ,  2.64575131, -0.77459667]])