# Importing the Dataset

In [63]:
#Import library for reading the dataset  
import pandas as pd
import numpy as np

In [64]:
#Import the dataset and store it in a variable named dataset
dataset = pd.read_csv('Data.csv')

In [65]:
#Now we peek at the top of the datset. Notice that missing values are stored as NaN.
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [66]:
dataset.tail()

Unnamed: 0,Country,Age,Salary,Purchased
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes
10,Spain,-18.0,80000.0,No
11,France,-24.0,100000.0,Yes
12,London,24.0,60000.0,No


In [67]:
# or else we can print the whole dataset
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


Now we do a general check to see if there are any missing values and then 
count their number

In [68]:
dataset.isnull().any()

Country      False
Age           True
Salary        True
Purchased    False
dtype: bool

In [69]:
#Now this will count the number of missing values in each column
dataset.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

# Handling Missing Data

In [70]:
from sklearn.impute import SimpleImputer

In [71]:
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')

In [72]:
imputer = imputer.fit(dataset[['Age','Salary']])

In [73]:
dataset[['Age','Salary']] = imputer.transform(dataset[['Age','Salary']])

In [74]:
#Now we check the dataset for the missing values.
dataset.isnull().any()

Country      False
Age          False
Salary       False
Purchased    False
dtype: bool

In [75]:
#Now this will count the number of missing values in each column
dataset.isnull().sum()

Country      0
Age          0
Salary       0
Purchased    0
dtype: int64

In [76]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,67833.333333,Yes
5,France,35.0,58000.0,Yes
6,Spain,27.583333,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


#*Handling noicy value*

In [77]:
#checking ouliers
dataset[(dataset.Age<0)]

Unnamed: 0,Country,Age,Salary,Purchased
10,Spain,-18.0,80000.0,No
11,France,-24.0,100000.0,Yes


In [78]:
# Remove the outliers from the Dataset points
dataset= dataset[~(dataset.Age<0)]

In [79]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,67833.333333,Yes
5,France,35.0,58000.0,Yes
6,Spain,27.583333,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# Separating the independent and dependent (i.e. target) features

In [80]:
#Independent Features
X = dataset.iloc[:,0:3].values
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 67833.33333333333],
       ['France', 35.0, 58000.0],
       ['Spain', 27.583333333333332, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0],
       ['London', 24.0, 60000.0]], dtype=object)

In [81]:
# target
Y = dataset.iloc[:,[3]].values
Y

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No']], dtype=object)

# Categorical Encoding

In [82]:
from sklearn.preprocessing import LabelEncoder

In [83]:
#The 0th column of X has non-ordinal or nominal data
X[:,0]

array(['France', 'Spain', 'Germany', 'Spain', 'Germany', 'France',
       'Spain', 'France', 'Germany', 'France', 'London'], dtype=object)

In [84]:
#first label encode 0th column in X
labelencoder_X = LabelEncoder()
X[:,0] = labelencoder_X.fit_transform(X[:,0])
X[:,0]

array([0, 3, 1, 3, 1, 0, 3, 0, 1, 0, 2], dtype=object)

In [85]:
X

array([[0, 44.0, 72000.0],
       [3, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [3, 38.0, 61000.0],
       [1, 40.0, 67833.33333333333],
       [0, 35.0, 58000.0],
       [3, 27.583333333333332, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0],
       [2, 24.0, 60000.0]], dtype=object)

In [86]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [87]:
# again read the data as per the input
X = dataset.iloc[:,0:3].values
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 67833.33333333333],
       ['France', 35.0, 58000.0],
       ['Spain', 27.583333333333332, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0],
       ['London', 24.0, 60000.0]], dtype=object)

In [88]:
#Now we one-hot encode 0th column of X
onehotencoder = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), [0])],    # The column numbers to be transformed (here is [0] but can be [0, 1, 3])
    remainder='passthrough'                         # Leave the rest of the columns untouched
)

In [89]:
# transform the data to get the encoding done
X = onehotencoder.fit_transform(X.tolist())
X

array([[1.0, 0.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 0.0, 40.0, 67833.33333333333],
       [1.0, 0.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 0.0, 1.0, 27.583333333333332, 52000.0],
       [1.0, 0.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 0.0, 1.0, 0.0, 24.0, 60000.0]], dtype=object)

In [90]:
# before doing cateforical encoding just analyse the data Y
Y.shape

(11, 1)

In [91]:
# flatten the whole two dimentional list into an single dimentional list
Y.ravel().shape

(11,)

In [92]:
# Now categorical encoding the target variable
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y.ravel())
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0])

#Data Normalization

In [93]:
#Feature scaling refers to putting the values in the same range or same scale so that no variable is dominated by the other.

# Numerical data in the dataset can have a varied range i.e. one parameter may lie between 1 to 10 for all records whereas another parameter can lie between 1000 to 5000. Though data is logically correct but after passing to a particular algorithm, the features with higher magnitude become key parameters for that algorithm.

# To avoid such situations feature scaling is performed using some statistical techniques like Min-Max scaling & Mean normalization. This creates a common range for all the parameters and thus removes Algorithmic bias.

# Normalization is a scaling technique in which values are shifted and rescaled so that they end up ranging between 0 and 1. It is also known as Min-Max scaling.

#normalization on age and salary

X

array([[1.0, 0.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 0.0, 40.0, 67833.33333333333],
       [1.0, 0.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 0.0, 1.0, 27.583333333333332, 52000.0],
       [1.0, 0.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 0.0, 1.0, 0.0, 24.0, 60000.0]], dtype=object)

In [94]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()

df = dataset[['Salary']]
dataset_minmax = min_max_scaler.fit_transform(df)
pd.DataFrame(dataset_minmax)

Unnamed: 0,0
0,0.685714
1,0.0
2,0.171429
3,0.371429
4,0.566667
5,0.285714
6,0.114286
7,0.885714
8,1.0
9,0.542857


#Data Reduction

**Dimensionality Reduction**

Any Country having less than 3 data points should be tagged as "other" Country. 

This way number of categories can be reduced by huge amount. Later on when we do one hot encoding, it will help us with having fewer dummy columns

In [96]:
df = dataset
location_stats = df['Country'].value_counts(ascending=False)
location_stats

France     4
Spain      3
Germany    3
London     1
Name: Country, dtype: int64

In [97]:
len(dataset.Country.unique())

4

In [98]:
location_stats_less_than_3 = location_stats[location_stats<3]
location_stats_less_than_3

London    1
Name: Country, dtype: int64