In [24]:
import numpy as np
import pandas as pd

In [2]:
X_train = np.array([[1000.0, 2.0], [1500.0, 3.0]])

In [3]:
X_train

array([[1000.,    2.],
       [1500.,    3.]])

In [4]:
# grab all rows for the first column
X_train[:, 0]

array([1000., 1500.])

In [5]:
# grab all rows for the second column
X_train[:, 1]

array([2., 3.])

In [9]:
# minimum of first column
X_train[:, 0].min()
# or 
# np.min(X_train[:, 0])

1000.0

In [13]:
# average value of first column
X_train[:, 0].mean()
# or 
# np.mean(X_train[:, 0])

1250.0

In [14]:
# doing the same for the second column
np.mean(X_train[:, 1])

2.5

In [15]:
np.min(X_train[:, 1]) # all rows, in column 1 

2.0

In [17]:
# Activity for Data Scaling: MinMaz Scaler
def max_min_s(X):
    return (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

max_min_s(X_train)

array([[0., 0.],
       [1., 1.]])

In [18]:
# Let's use the Sklearn preprocessing package to do the same thing as our above function:

from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
X_minmax = min_max_scaler.fit_transform(X_train)
print(X_minmax)

[[0. 0.]
 [1. 1.]]


In [22]:
def standard_s(X):
    '''A two dimensional array (Matrix) X is given, write a function that for each column:

Obtain the mean for the column (mean)
Obtain the standard deviation for the column (std)
For each value in the column, calculate the following: (value - mean)/std

    return (X - X.mean(axis=0)) /X.std(axis=0) 

print(standard_s(X_train))
'''
    cols = len(X[0])
    for i in range(cols):
        col = X[:, i]
        mean = col.mean()
        std_dev = col.std()
        for j in range(len(col)):
            val = col[j]
            col[j] = (val - mean)/std_dev       
    return X
standard_s(X_train)
            

array([[-1., -1.],
       [ 1.,  1.]])

In [23]:
# Lets use Sklearn preprecoessing package to do the same thing:
standard_scaler = preprocessing.StandardScaler()
X_ss = standard_scaler.fit_transform(X_train)
print(X_ss)

[[-1. -1.]
 [ 1.  1.]]


In [25]:
churn = pd.read_csv('Datasets/Churn_Modelling.csv')

In [26]:
churn.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [34]:
churn['Geography'].unique

<bound method Series.unique of 0        France
1         Spain
2        France
3        France
4         Spain
         ...   
9995     France
9996     France
9997     France
9998    Germany
9999     France
Name: Geography, Length: 10000, dtype: object>

In [35]:
churn['Gender'].nunique()

2

In [36]:
churn['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [38]:
# Label Encoding
# feature matrix is X, it has the data samples, for the columns we actually care about
X = churn.iloc[:, 3:13].values
# target is the y column, the column for the value we are predicting (whether they churned or not)
y = churn.iloc[:, 13].values

In [41]:
# print first 10 rows of feature matrix
print(X[0:10, :])

[[619 'France' 'Female' 42 2 0.0 1 1 1 101348.88]
 [608 'Spain' 'Female' 41 1 83807.86 1 0 1 112542.58]
 [502 'France' 'Female' 42 8 159660.8 3 1 0 113931.57]
 [699 'France' 'Female' 39 1 0.0 2 0 0 93826.63]
 [850 'Spain' 'Female' 43 2 125510.82 1 1 1 79084.1]
 [645 'Spain' 'Male' 44 8 113755.78 2 1 0 149756.71]
 [822 'France' 'Male' 50 7 0.0 2 1 1 10062.8]
 [376 'Germany' 'Female' 29 4 115046.74 4 1 0 119346.88]
 [501 'France' 'Male' 44 4 142051.07 2 0 1 74940.5]
 [684 'France' 'Male' 27 2 134603.88 1 1 1 71725.73]]


In [42]:
# Apply Label Encoder to the second and third columns

from sklearn.preprocessing import LabelEncoder

label_encoder_X_1 = LabelEncoder()
X[:, 1] = label_encoder_X_1.fit_transform(X[:, 1])
label_encoder_X_2 = LabelEncoder()
X[:, 2] = label_encoder_X_2.fit_transform(X[:, 2])
print(X[0:10,:])
print(X.shape)

[[619 0 0 42 2 0.0 1 1 1 101348.88]
 [608 2 0 41 1 83807.86 1 0 1 112542.58]
 [502 0 0 42 8 159660.8 3 1 0 113931.57]
 [699 0 0 39 1 0.0 2 0 0 93826.63]
 [850 2 0 43 2 125510.82 1 1 1 79084.1]
 [645 2 1 44 8 113755.78 2 1 0 149756.71]
 [822 0 1 50 7 0.0 2 1 1 10062.8]
 [376 1 0 29 4 115046.74 4 1 0 119346.88]
 [501 0 1 44 4 142051.07 2 0 1 74940.5]
 [684 0 1 27 2 134603.88 1 1 1 71725.73]]
(10000, 10)


In [44]:
# We can do Label encoding and one-hot encoding at the same time in Pandas¶
X = churn.iloc[:, 3:13]
y = churn.iloc[:, 13]
pd.get_dummies(X).head(10)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,1,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,1,0
3,699,39,1,0.0,2,0,0,93826.63,1,0,0,1,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,1,0
5,645,44,8,113755.78,2,1,0,149756.71,0,0,1,0,1
6,822,50,7,0.0,2,1,1,10062.8,1,0,0,0,1
7,376,29,4,115046.74,4,1,0,119346.88,0,1,0,1,0
8,501,44,4,142051.07,2,0,1,74940.5,1,0,0,0,1
9,684,27,2,134603.88,1,1,1,71725.73,1,0,0,0,1


In [46]:

from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder()
X = one_hot_encoder.fit_transform(X).toarray()
print(pd.DataFrame(X[0:10,:]))

   0      1      2      3      4      5      6      7      8      9      ...  \
0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   
1    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   
2    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   
3    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   
4    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   
5    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   
6    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   
7    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    1.0    0.0  ...   
8    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   
9    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   

   16925  16926  16927  16928  16929  16930  16931  16932  16933  16934  
0    0.0    0.0    0.0    0.0    0.0    0.0  