# Data proprocessing - Data preprocessing in machine learning is a technique of preparing data
(Cleaning and organizing) to make it easier to built machine learning models.

# 1. Importing libraries -  

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as pld
import seaborn as sns

# 2. Importing the dataset - 

In [2]:
df = pd.read_csv("Data2.csv")

In [3]:
df.head(7)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No


**Here Country, Age, Salary is Independent variable and PUrchased is Dependent variable because it's value is depending on independent variable.**

# 3. Handling the missing values - 

<img src="https://lh3.googleusercontent.com/proxy/6qg6-KlwXQ56_rViLvRkhRvVZJ9iQ5yOuNmyoI-CNR-_WwrDOpX6YzM73izXWMnM5WB705TdCylUy9vyvVOJyCoLYnfn6aYrg5rP5q9XduSHQT0piefVRy1ooTj3R5KYx1A" alt="Variables" width="600" height="300">

In [4]:
x = df[["Country","Age","Salary"]].values
y = df[["Purchased"]].values

In [5]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [6]:
y

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes']], dtype=object)

In [7]:
x = df.fillna(df.mean())

  x = df.fillna(df.mean())


**There are two ways to handling missing values  - 
1. Remove all the rows and columns which is contains null values.
2. Fill the mean and other values on that particular row and columns.**

In [8]:
df.head(10)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [9]:
x

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# 4. Encoding categorical data - 

**The meaning of Encoding categorial data is convert the label string data into the numeric data**

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
y = df["Purchased"] = LabelEncoder().fit_transform(df["Purchased"])

In [12]:
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,,1


In [13]:
label_encoder2 = LabelEncoder()
y = df["Country"] = label_encoder2.fit_transform(df["Country"])

In [14]:
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,27.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,,1


**We will dommy encoding here according to our condition** 

In [15]:
x = pd.get_dummies(df,columns = ["Country"],prefix = "Country")

In [16]:
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,27.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,,1


In [17]:
x

Unnamed: 0,Age,Salary,Purchased,Country_0,Country_1,Country_2
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
4,40.0,,1,0,1,0
5,35.0,58000.0,1,1,0,0
6,,52000.0,0,0,0,1
7,48.0,79000.0,1,1,0,0
8,50.0,83000.0,0,0,1,0
9,37.0,67000.0,1,1,0,0


In [18]:
y

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0])

# 5. train_test_split data - 

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Assuming x and y are your feature and target variables
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=0)

In [20]:
x_train

Unnamed: 0,Age,Salary,Purchased,Country_0,Country_1,Country_2
4,40.0,,1,0,1,0
9,37.0,67000.0,1,1,0,0
1,27.0,48000.0,1,0,0,1
6,,52000.0,0,0,0,1
7,48.0,79000.0,1,1,0,0
3,38.0,61000.0,0,0,0,1
0,44.0,72000.0,0,1,0,0
5,35.0,58000.0,1,1,0,0


In [21]:
x_test

Unnamed: 0,Age,Salary,Purchased,Country_0,Country_1,Country_2
2,30.0,54000.0,0,0,1,0
8,50.0,83000.0,0,0,1,0


In [22]:
y_train

array([1, 0, 2, 2, 0, 2, 0, 0])

In [24]:
y_test

array([1, 1])

# 6. Feature scallig - 

In [26]:
from sklearn.preprocessing import StandardScaler

In [28]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)

In [29]:
x_train

array([[ 0.25315802,         nan,  0.77459667, -1.        ,  2.64575131,
        -0.77459667],
       [-0.23014365,  0.44897083,  0.77459667,  1.        , -0.37796447,
        -0.77459667],
       [-1.84114924, -1.41706417,  0.77459667, -1.        , -0.37796447,
         1.29099445],
       [        nan, -1.0242147 , -1.29099445, -1.        , -0.37796447,
         1.29099445],
       [ 1.54196248,  1.62751925,  0.77459667,  1.        , -0.37796447,
        -0.77459667],
       [-0.0690431 , -0.14030338, -1.29099445, -1.        , -0.37796447,
         1.29099445],
       [ 0.89756025,  0.94003267, -1.29099445,  1.        , -0.37796447,
        -0.77459667],
       [-0.55234477, -0.43494049,  0.77459667,  1.        , -0.37796447,
        -0.77459667]])

In [30]:
x_test = sc.transform(x_test)

In [31]:
x_test

array([[-1.35784756, -0.82778996, -1.29099445, -1.        ,  2.64575131,
        -0.77459667],
       [ 1.8641636 ,  2.02036872, -1.29099445, -1.        ,  2.64575131,
        -0.77459667]])