In [6]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [11]:
# Load the dataset
dataset = pd.read_csv(r"C:\Users\user\Data.csv")  # Use raw string to avoid escape issues

# Separate features (X) and target variable (y)
#slicing
P = dataset.iloc[:, :-1].values  # All rows, all columns except the last
Q = dataset.iloc[:, -1].values   # All rows, only the last column
R = dataset.iloc[:, :].values   # All rows, all columns

In [12]:
print(P)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [13]:
print(Q)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [14]:
print(R)

[['France' 44.0 72000.0 'No']
 ['Spain' 27.0 48000.0 'Yes']
 ['Germany' 30.0 54000.0 'No']
 ['Spain' 38.0 61000.0 'No']
 ['Germany' 40.0 nan 'Yes']
 ['France' 35.0 58000.0 'Yes']
 ['Spain' nan 52000.0 'No']
 ['France' 48.0 79000.0 'Yes']
 ['Germany' 50.0 83000.0 'No']
 ['France' 37.0 67000.0 'Yes']]


In [15]:
# Import SimpleImputer for handling missing data
from sklearn.impute import SimpleImputer
X = np.array([
    ['France', 44.0, 72000.0, 'No'],
    ['Spain', 27.0, 48000.0, 'Yes'],
    ['Germany', 30.0, 54000.0, 'No'],
    ['Spain', 38.0, 61000.0, 'No'],
    ['Germany', 40.0, np.nan, 'Yes'],
    ['France', 35.0, 58000.0, 'Yes'],
    ['Spain', np.nan, 52000.0, 'No'],
    ['France', 48.0, 79000.0, 'Yes'],
    ['Germany', 50.0, 83000.0, 'No'],
    ['France', 37.0, 67000.0, 'Yes']
], dtype=object)
# dtype=object allows you to store strings, floats, integers, and other data types together in one array.

# Replace missing values with the mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# missing_values=np.nan: Specifies that missing values in the dataset are represented as np.nan (NumPy's "Not a Number").
# strategy='mean': Specifies that missing values should be replaced with the mean of the corresponding column


imputer.fit(X[:, 1:3])  # Fit imputer on the 2nd and 3rd columns (Age, Salary)
# :: Refers to all rows (select all rows of the matrix).
# 1:3: Refers to the 2nd and 3rd columns (column indices 1 and 2).
# Example:
# Index-Column
# 0-Country
# 1-Age
# 2-Salary
# 3-Purchased
# The syntax 1:3 is a slice that includes:
# Start index (1): The slicing starts at column index 1 (inclusive).
# End index (3): The slicing stops just before column index 3 (exclusive).

X[:, 1:3] = imputer.transform(X[:, 1:3])  # Apply transformation
# .transform(): This replaces the missing values in the selected columns with the mean (or the value calculated during the .fit() step).
# Assignment to X[:, 1:3]: Updates the original dataset X with the imputed values.

In [16]:
print(X)

[['France' 44.0 72000.0 'No']
 ['Spain' 27.0 48000.0 'Yes']
 ['Germany' 30.0 54000.0 'No']
 ['Spain' 38.0 61000.0 'No']
 ['Germany' 40.0 63777.77777777778 'Yes']
 ['France' 35.0 58000.0 'Yes']
 ['Spain' 38.77777777777778 52000.0 'No']
 ['France' 48.0 79000.0 'Yes']
 ['Germany' 50.0 83000.0 'No']
 ['France' 37.0 67000.0 'Yes']]


In [17]:
type(X) #X is in array format

numpy.ndarray

In [18]:
#missing numerical data can be easily replaced but what if, the country name is missing then how will the python package will know there is a missing value in country, we will use countries as categorical data, we will encode the 3 countries(germany, spain ,france) into categories
#encoding categorical data
#encoding independent variable
from sklearn.compose import ColumnTransformer
# ColumnTransformer: This class helps in applying different preprocessing steps to specific columns in the dataset.

from sklearn.preprocessing import OneHotEncoder
# one-hot encoding converts each category (e.g., 'France', 'Spain', 'Germany') into a separate column and assigns binary values (0 or 1) based on the presence of that category for each row.
# If you have a "Country" column with three possible values: 'France', 'Spain', 'Germany', it will be converted into three columns:
# One column for France, One column for Spain, One column for Germany
# For a row where the country is France, it would look like:
# [1, 0, 0]
# For a row where the country is Spain, it would look like:
# [0, 1, 0]


ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
# ColumnTransformer: This class helps in applying different preprocessing steps to specific columns in the dataset.
# transformers=[('encoder', OneHotEncoder(), [0])]:
# The first argument, encoder, is a name for the transformer. It’s used for reference and doesn’t affect the process.
# OneHotEncoder() specifies that the OneHotEncoder should be applied to the column(s).
# [0]: This is the index of the column to which the encoder should be applied. The value 0 refers to the first column (in your case, the "Country" column).
# remainder='passthrough':
# This means that all columns other than the ones selected for transformation (the "Country" column in this case) will be left unchanged and passed through without any transformation

X = np.array(ct.fit_transform(X))
# ct.fit_transform(X):
# This fits the one-hot encoder on the data (learns how to encode the country column), then transforms the dataset (X).The result is an array where the first column (Country) is encoded into several binary columns, and the rest of the data remains the same.

# np.array(ct.fit_transform(X)):
# A sparse matrix is a matrix where most of the elements are zero. It is a memory-efficient way of storing such matrices. Instead of storing all zeros, sparse matrices only store the non-zero elements along with their indices.eg{row: [column], value: [value]}
# libraries like scikit-learn store the result of transformations like one-hot encoding as a sparse matrix by default to save memory.
# When you call np.array() on the result of fit_transform(), it converts the sparse matrix into a dense matrix (a standard NumPy array)


print(X)

[[1.0 0.0 0.0 44.0 72000.0 'No']
 [0.0 0.0 1.0 27.0 48000.0 'Yes']
 [0.0 1.0 0.0 30.0 54000.0 'No']
 [0.0 0.0 1.0 38.0 61000.0 'No']
 [0.0 1.0 0.0 40.0 63777.77777777778 'Yes']
 [1.0 0.0 0.0 35.0 58000.0 'Yes']
 [0.0 0.0 1.0 38.77777777777778 52000.0 'No']
 [1.0 0.0 0.0 48.0 79000.0 'Yes']
 [0.0 1.0 0.0 50.0 83000.0 'No']
 [1.0 0.0 0.0 37.0 67000.0 'Yes']]


In [22]:
#how will the python package will know there is a missing value in yes/no column, we will use yes/no as categorical data
#encoding dependent variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Initialize the LabelEncoder
# The LabelEncoder automatically assigns values as follows:
# "No" → 0
# "Yes" → 1

Q = le.fit_transform(Q)
# The fit() function identifies the unique labels in the input data and assigns each label a numeric value.
# The transform() function converts the input labels into their corresponding numeric values based on the mapping created by fit().
# fit_transform() combines both steps into one operation.

print(Q)

[0 1 0 0 1 1 0 1 0 1]


In [27]:
#splitting dataset into training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, Q_train, Q_test = train_test_split(X, Q, test_size=0.2, random_state=1)
# random_state=1, randomizes the data and then split into training and testing
# test_size=0.2 , generally we do 80% training set and 20% testing set
#X_train, X_test- traineng and testing of independent variables
#y_train, y_test- traineng and testing of dependent variables
print('X_train:', X_train,'\n')
print('X_test:',X_test,'\n')
print('Q_train:',Q_train,'\n')
print('Q_test:',Q_test,'\n')


X_train: [[0.0 0.0 1.0 38.77777777777778 52000.0 'No']
 [0.0 1.0 0.0 40.0 63777.77777777778 'Yes']
 [1.0 0.0 0.0 44.0 72000.0 'No']
 [0.0 0.0 1.0 38.0 61000.0 'No']
 [0.0 0.0 1.0 27.0 48000.0 'Yes']
 [1.0 0.0 0.0 48.0 79000.0 'Yes']
 [0.0 1.0 0.0 50.0 83000.0 'No']
 [1.0 0.0 0.0 35.0 58000.0 'Yes']] 

X_test: [[0.0 1.0 0.0 30.0 54000.0 'No']
 [1.0 0.0 0.0 37.0 67000.0 'Yes']] 

Q_train: [0 1 0 0 1 1 0 1] 

Q_test: [0 1] 



In [35]:
# feature scaling
#applying standarddization
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Scale only the numeric columns (Age and Salary, indexes 1 and 2)
X_train[:, 1:3] = sc.fit_transform(X_train[:, 1:3])  # Columns at index 1 and 2 (Age and Salary)
X_test[:, 1:3] = sc.transform(X_test[:, 1:3])  # Apply the same transformation to X_test

In [32]:
print("X_train after scaling:\n", X_train)

X_train after scaling:
 [[0.0 -0.5773502691896258 1.2909944487358058 38.77777777777778 52000.0
  'No']
 [0.0 1.7320508075688774 -0.7745966692414835 40.0 63777.77777777778 'Yes']
 [1.0 -0.5773502691896258 -0.7745966692414835 44.0 72000.0 'No']
 [0.0 -0.5773502691896258 1.2909944487358058 38.0 61000.0 'No']
 [0.0 -0.5773502691896258 1.2909944487358058 27.0 48000.0 'Yes']
 [1.0 -0.5773502691896258 -0.7745966692414835 48.0 79000.0 'Yes']
 [0.0 1.7320508075688774 -0.7745966692414835 50.0 83000.0 'No']
 [1.0 -0.5773502691896258 -0.7745966692414835 35.0 58000.0 'Yes']]


In [33]:
print("X_test after scaling:\n", X_test)

X_test after scaling:
 [[0.0 1.7320508075688774 -0.7745966692414835 30.0 54000.0 'No']
 [1.0 -0.5773502691896258 -0.7745966692414835 37.0 67000.0 'Yes']]
