In [1]:
# importing libraries
import numpy as np
import pandas as pd

In [2]:
# Load the raw preprocessing dataset from disk into a DataFrame.
# The path is relative to the directory the notebook is run from.
DATA_PATH = 'data_files/day_1_data_preprocessing_Data.csv'
dataset = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to check that the columns were parsed as expected.
dataset.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [4]:
# iloc is indexed as [row_selector, column_selector]:
# ":" keeps every row, ":-1" keeps every column except the last one.
dataset.iloc[:,:-1]

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [5]:
# .to_numpy() converts the selected DataFrame columns into a NumPy array;
# it is the accessor pandas recommends over the older .values attribute.
# The selected columns mix text and numbers, so the array has dtype=object.
dataset.iloc[:, :-1].to_numpy()

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [6]:
# Feature matrix: every column except the last, as a NumPy array.
# Uses .to_numpy() (the accessor pandas recommends over .values),
# the same way the target vector Y is extracted.
X = dataset.iloc[:, :-1].to_numpy()
type(X)

numpy.ndarray

In [7]:
# Target vector: the last column as a NumPy array.
# The frame has four columns, so position -1 and position 3 select the same one.
target_column = dataset.iloc[:, -1]
Y = target_column.to_numpy()

In [8]:
# Display the raw target labels before they are encoded.
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [9]:
# Handling the missing data
from sklearn.impute import SimpleImputer

# SimpleImputer works column-wise, so no axis needs to be specified:
# each NaN is replaced by the mean of the column it sits in.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

# Columns 1 and 2 of X are Age and Salary (column 0 is Country, which is text).
# NOTE(review): the means are learned from all rows, before the train/test
# split further down, so test rows influence the imputed values.
numeric_columns = slice(1, 3)
imputer.fit(X[:, numeric_columns])
X[:, numeric_columns] = imputer.transform(X[:, numeric_columns])

In [10]:
# Encoding categorical data
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode column 0 (Country) and pass the remaining columns through unchanged.
# sparse_threshold=0 forces a dense ndarray result: OneHotEncoder emits a sparse
# matrix, and with the default threshold ColumnTransformer returns a sparse matrix
# once the combined output's density falls low enough (e.g. many categories),
# which the scaling cell later in the notebook does not expect.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)

# NOTE: X is overwritten here, so this cell should be run once per fresh kernel;
# running it again would one-hot encode the first indicator column.
X = ct.fit_transform(X)

# Encode target (Yes/No) as integer class labels
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)


In [11]:
# Country is now three indicator columns in the order France, Germany, Spain,
# so France is [1, 0, 0] rather than an integer code. Codes such as 0, 1, 2
# would let a model read an ordering (0 < 1 < 2) into the countries.
# Age and Salary follow as the last two columns, passed through by the encoder.
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [12]:
# LabelEncoder has mapped "No" to 0 and "Yes" to 1.
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [13]:
# Split the dataset into a training set and a test set.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle
# so the same split is produced on every run.
split_arrays = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train, X_test, Y_train, Y_test = split_arrays


In [14]:
# Feature scaling: scale when the model relies on distances or gradient steps;
# skip it when the model only chops the data into branches.
from sklearn.preprocessing import StandardScaler

# NOTE(review): every column of X_train is standardised here, including the
# three Country indicator columns - confirm that this is intended.
sc_X = StandardScaler()
sc_X.fit(X_train)                  # learn the scaling statistics from the training rows only
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)    # reuse the training statistics on the test rows
