# Importing the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Importing the dataset

In [None]:
# Read the raw table from disk.
dataset = pd.read_csv('Data.csv')

# Feature matrix X: all rows, every column except the final one.
# Target vector y: all rows, final column only.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [None]:
# Inspect the raw feature matrix (every dataset column except the last) before any preprocessing.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [None]:
# Inspect the raw target vector (the last column of the dataset).
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# HANDLING MISSING DATA

In [None]:
from sklearn.impute import SimpleImputer

# Replace every NaN in the two numeric columns (indices 1 and 2) with the mean
# of the non-missing values in that column. X is updated in place.
# NOTE(review): the means are learned from all rows, before the train/test split
# performed later in the notebook, so test rows influence the imputed values.
# Consider fitting the imputer on the training rows only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [None]:
# Show the features after imputation, followed by the (still unencoded) target.
print(X, y, sep='\n')

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# ENCODING CATEGORICAL DATA

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column at index 0 and pass the remaining
# columns through unchanged. The encoded columns come first in the result,
# followed by the passthrough columns.
#
# sparse_threshold=0 forces a dense result. With the default threshold (0.3),
# a categorical column with many distinct values can make ColumnTransformer
# return a SciPy sparse matrix, and np.array() on a sparse matrix produces a
# 0-d object array instead of the expected 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [None]:
# Inspect X after one-hot encoding column 0 (encoded columns first, then the passthrough columns).
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [None]:

# Convert the string class labels in y to integer codes.
# The fitted encoder is kept in `le`.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)


In [None]:
# Inspect the integer-encoded target produced by the LabelEncoder.
print(y)

[0 1 0 0 1 1 0 1 0 1]


# SPLITTING DATASET INTO TRAINING AND TEST SET

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. random_state=1 pins the shuffle so
# the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
# Inspect the training features returned by train_test_split.
# NOTE(review): the saved output of this cell still shows raw country strings and
# NaN values, although X is imputed and one-hot encoded in earlier cells. The output
# looks stale (cell executed out of order); re-run with Restart & Run All to confirm.
print(X_train)

[['Spain' nan 52000.0]
 ['Germany' 40.0 nan]
 ['France' 44.0 72000.0]
 ['Spain' 38.0 61000.0]
 ['Spain' 27.0 48000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 35.0 58000.0]]


In [None]:
# Inspect the test features returned by train_test_split.
# NOTE(review): the saved output of this cell shows raw country strings, although X
# is one-hot encoded in an earlier cell. The output looks stale (cell executed out of
# order); re-run with Restart & Run All to confirm.
print(X_test)

[['Germany' 30.0 54000.0]
 ['France' 37.0 67000.0]]


In [None]:
# Inspect the encoded training labels returned by train_test_split.
print(y_train)

[0 1 0 0 1 1 0 1]


In [None]:
# Inspect the encoded test labels returned by train_test_split.
print(y_test)

[0 1]


# FEATURE SCALING

In [None]:
# Sanity check ahead of scaling: the scaling cell indexes columns 3 and 4, so
# X_train must have five columns at this point.
# NOTE(review): the saved output of this cell shows already-standardised values in
# the first row, so it appears to have been executed after the scaling cell.
# Confirm with Restart & Run All.
print("Final X_train shape:", X_train.shape)  # Should be (8, 5)
print("First row:", X_train[0])

Final X_train shape: (8, 5)
First row: [0.0 0.0 1.0 -1.483597921805437e-17 -1.088527238171987]


In [None]:
from sklearn.preprocessing import StandardScaler

# Column layout after one-hot encoding:
#   0-2 -> one-hot countries, 3 -> Age, 4 -> Salary.
# Only the two numeric columns are standardised; the one-hot columns stay as 0/1.
numeric_cols = slice(3, 5)

# Learn mean and standard deviation from the training rows only, then apply the
# same statistics to the test rows.
# NOTE: both arrays are overwritten in place, so running this cell a second time
# re-scales values that are already scaled.
sc = StandardScaler()
X_train[:, numeric_cols] = sc.fit_transform(X_train[:, numeric_cols])
X_test[:, numeric_cols] = sc.transform(X_test[:, numeric_cols])

In [None]:
# Inspect the training features after columns 3 and 4 have been standardised.
print(X_train)

[[0.0 0.0 1.0 8.230853543664683e-17 -1.0885272381719873]
 [0.0 1.0 0.0 -0.04159676556125997 -1.2797770728314922e-16]
 [1.0 0.0 0.0 0.5407579522963807 0.6237627994019252]
 [0.0 0.0 1.0 -0.33277412449008037 -0.3179967212637265]
 [0.0 0.0 1.0 -1.9342495985985921 -1.4309852456867695]
 [1.0 0.0 0.0 1.123112670154021 1.2230643125527945]
 [0.0 1.0 0.0 1.4142900290828415 1.565522320067577]
 [1.0 0.0 0.0 -0.7695401628833108 -0.5748402268998133]]


In [None]:
# Inspect the test features after columns 3 and 4 were transformed with the scaler fitted on the training rows.
print(X_test)

[[0.0 1.0 0.0 -1.4974835602053618 -0.9172982344145957]
 [1.0 0.0 0.0 -0.47836280395449055 0.19569029000844712]]
