## IMPORTING LIBRARIES

In [16]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


## READING DATASET

In [2]:
# Location of the input CSV. The default is the original author's local path;
# set the DATA_CSV_PATH environment variable to run the notebook elsewhere
# without editing this cell.
DATA_CSV_PATH = os.environ.get("DATA_CSV_PATH", r"C:\Users\pk161\OneDrive\DATA\Data.csv")

dataset = pd.read_csv(DATA_CSV_PATH)
dataset

Unnamed: 0,State,Age,Salary,Purchased
0,Mumbai,44.0,72000.0,No
1,Bangalore,27.0,48000.0,Yes
2,Hyderabad,30.0,54000.0,No
3,Bangalore,38.0,61000.0,No
4,Hyderabad,40.0,,Yes
5,Mumbai,35.0,58000.0,Yes
6,Bangalore,,52000.0,No
7,Mumbai,48.0,79000.0,Yes
8,Hyderabad,50.0,83000.0,No
9,Mumbai,37.0,67000.0,Yes


## SEPARATING THE DATASET

In [3]:
# Independent variables (features): every column except the last one.
# to_numpy() is the pandas-recommended replacement for the .values attribute.
x = dataset.iloc[:, :-1].to_numpy()
x

array([['Mumbai', 44.0, 72000.0],
       ['Bangalore', 27.0, 48000.0],
       ['Hyderabad', 30.0, 54000.0],
       ['Bangalore', 38.0, 61000.0],
       ['Hyderabad', 40.0, nan],
       ['Mumbai', 35.0, 58000.0],
       ['Bangalore', nan, 52000.0],
       ['Mumbai', 48.0, 79000.0],
       ['Hyderabad', 50.0, 83000.0],
       ['Mumbai', 37.0, 67000.0]], dtype=object)

In [4]:
# Dependent variable (target): the last column only.
# to_numpy() always yields a NumPy array, whereas .values may return an
# extension array depending on the column dtype.
y = dataset.iloc[:, -1].to_numpy()
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## DATA CLEANING

In [5]:
# Imputer that replaces NaN entries with the column median
# (left unspecified, SimpleImputer would use the mean instead)
imputer = SimpleImputer(missing_values=np.nan, strategy="median")

In [6]:
# Learn the fill statistics from columns 1 and 2 of x
# (Age and Salary in the dataset displayed above).
imputer.fit(x[:, 1:3])

# Write the imputed values back into the same two columns of x.
x[:, 1:3] = imputer.transform(x[:, 1:3])

In [7]:
# Show x again to check that the NaN entries in columns 1-2 were filled in
x

array([['Mumbai', 44.0, 72000.0],
       ['Bangalore', 27.0, 48000.0],
       ['Hyderabad', 30.0, 54000.0],
       ['Bangalore', 38.0, 61000.0],
       ['Hyderabad', 40.0, 61000.0],
       ['Mumbai', 35.0, 58000.0],
       ['Bangalore', 38.0, 52000.0],
       ['Mumbai', 48.0, 79000.0],
       ['Hyderabad', 50.0, 83000.0],
       ['Mumbai', 37.0, 67000.0]], dtype=object)

## CONVERTING CATEGORICAL DATA INTO NUMERIC

In [9]:
# dealing with x: encoder for the categorical column of the feature array
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# values (y); for feature columns it points to OrdinalEncoder / OneHotEncoder.
# The integer codes produced here also impose an arbitrary order on the states —
# confirm that is acceptable for whatever model consumes x.

labelencoder_x = LabelEncoder()

In [11]:
# Replace the state names in column 0 of x with integer codes.
# One fit_transform call both fits the encoder and returns the encoded column,
# so the encoder is fitted exactly once.
x[:, 0] = labelencoder_x.fit_transform(x[:, 0])

In [12]:
# Show x with column 0 now holding the integer codes
x

array([[2, 44.0, 72000.0],
       [0, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [0, 38.0, 61000.0],
       [1, 40.0, 61000.0],
       [2, 35.0, 58000.0],
       [0, 38.0, 52000.0],
       [2, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [2, 37.0, 67000.0]], dtype=object)

In [13]:
# dealing with y: a separate encoder instance for the target labels

labelencoder_y = LabelEncoder()

In [14]:
# y is a single 1D array, so no column index is needed; the encoded result
# is assigned straight back to y

y = labelencoder_y.fit_transform(y)

In [15]:
# Show the encoded target values
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## DATA SPLITTING

In [18]:
# Hold out 20% of the rows for testing and keep the remaining 80% for training.
# random_state is fixed so that the shuffle, and therefore the split, is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

In [19]:
# Features of the training split
print(x_train)

[[1 40.0 61000.0]
 [2 37.0 67000.0]
 [0 27.0 48000.0]
 [0 38.0 52000.0]
 [2 48.0 79000.0]
 [0 38.0 61000.0]
 [2 44.0 72000.0]
 [2 35.0 58000.0]]


In [20]:
# Features of the test split
print(x_test)

[[1 30.0 54000.0]
 [1 50.0 83000.0]]


In [21]:
# Encoded target values of the training split
print(y_train)

[1 1 1 0 1 0 0 1]


In [22]:
# Encoded target values of the test split
print(y_test)

[0 0]
