In [140]:
# IMPORTING LIBRARY
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer  # to deal with missing values
from sklearn.preprocessing import LabelEncoder   # used to convert categorical data into numerial values
from sklearn.model_selection import train_test_split  # used to split the data into training set & testing set

In [141]:
# LOAD THE DATASET
# The CSV location lives in one named constant so it is easy to find and change.
# NOTE(review): this is an absolute, machine-specific path; it must be adjusted
# before the notebook can run on another machine.
DATA_PATH = r"C:\Users\pk161\OneDrive\DATA\Data.csv"

dataset = pd.read_csv(DATA_PATH)
dataset

Unnamed: 0,State,Age,Salary,Purchased
0,Mumbai,44.0,72000.0,No
1,Bangalore,27.0,48000.0,Yes
2,Hyderabad,30.0,54000.0,No
3,Bangalore,38.0,61000.0,No
4,Hyderabad,40.0,,Yes
5,Mumbai,35.0,58000.0,Yes
6,Bangalore,,52000.0,No
7,Mumbai,48.0,79000.0,Yes
8,Hyderabad,50.0,83000.0,No
9,Mumbai,37.0,67000.0,Yes


## SEPARATING THE DATASET INTO DEPENDENT & INDEPENDENT VARIABLES

In [143]:
# INDEPENDENT VARIABLES (features)
# Every column except the last one is a feature.
feature_frame = dataset.iloc[:, :-1]

# Convert the pandas DataFrame into a NumPy array
x = feature_frame.to_numpy()
x

array([['Mumbai', 44.0, 72000.0],
       ['Bangalore', 27.0, 48000.0],
       ['Hyderabad', 30.0, 54000.0],
       ['Bangalore', 38.0, 61000.0],
       ['Hyderabad', 40.0, nan],
       ['Mumbai', 35.0, 58000.0],
       ['Bangalore', nan, 52000.0],
       ['Mumbai', 48.0, 79000.0],
       ['Hyderabad', 50.0, 83000.0],
       ['Mumbai', 37.0, 67000.0]], dtype=object)

In [144]:
# DEPENDENT VARIABLE (target)
# The last column of the dataset is the label to predict.
target_column = dataset.iloc[:, -1]

# Pull the underlying array out of the pandas Series
y = target_column.values
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## DATA CLEANING

### MISSING VALUES IN THE X (Independent variable)

In [147]:
# Build the imputer that will fill missing entries (NaN) with the column median.
# SimpleImputer uses strategy="mean" when no strategy is given, so the median
# has to be requested explicitly.
imputer = SimpleImputer(missing_values=np.nan, strategy="median")

In [148]:
# Columns 1 and 2 of x are the numeric columns that contain missing values.
numeric_cols = slice(1, 3)

# fit_transform() learns each column's median and fills the NaN entries in one
# step; the filled values are written back into x in place.
# NOTE(review): the medians are learned from every row of x, including rows that
# later end up in the test split -- consider fitting on the training rows only.
x[:, numeric_cols] = imputer.fit_transform(x[:, numeric_cols])

In [149]:
# Display x after imputation: the NaN entries have been replaced by the column medians.
x

array([['Mumbai', 44.0, 72000.0],
       ['Bangalore', 27.0, 48000.0],
       ['Hyderabad', 30.0, 54000.0],
       ['Bangalore', 38.0, 61000.0],
       ['Hyderabad', 40.0, 61000.0],
       ['Mumbai', 35.0, 58000.0],
       ['Bangalore', 38.0, 52000.0],
       ['Mumbai', 48.0, 79000.0],
       ['Hyderabad', 50.0, 83000.0],
       ['Mumbai', 37.0, 67000.0]], dtype=object)

## CONVERTING CATEGORICAL DATA INTO NUMERIC

In [151]:
# Independent variable x: its first column holds text categories.
# Create the LabelEncoder instance that will map each distinct category to an integer code.
labelencoder_x = LabelEncoder()

In [152]:
# Fit the encoder on the first column of x and replace that column with the
# resulting integer codes in a single step.
# (The original cell called fit_transform() twice and threw the first result
# away; one call is enough.)
# NOTE(review): LabelEncoder is documented for target labels. The integer codes
# it produces imply an order between the categories; for an input feature,
# OrdinalEncoder or OneHotEncoder is usually the better fit.
x[:, 0] = labelencoder_x.fit_transform(x[:, 0])

In [153]:
# Display x after encoding: the first column now holds integer codes instead of text.
x

array([[2, 44.0, 72000.0],
       [0, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [0, 38.0, 61000.0],
       [1, 40.0, 61000.0],
       [2, 35.0, 58000.0],
       [0, 38.0, 52000.0],
       [2, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [2, 37.0, 67000.0]], dtype=object)

In [154]:
# Dependent variable y: convert its text labels into integer codes.
labelencoder_y = LabelEncoder()

# y is a single 1-D array, so no column index is needed: learn the label set
# first, then map every label to its code and rebind y to the result.
y = labelencoder_y.fit(y).transform(y)

In [155]:
# Display y after encoding: the text labels are now an integer array.
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## SPLITTING THE DATA

In [157]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): no stratify= argument is passed, so the class balance of the
# test split is not guaranteed.
TEST_FRACTION = 0.2
SPLIT_SEED = 0

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [158]:
# Show the feature rows of each split:
# x_train holds 80% of the data (training), x_test holds 20% (testing).
for split_label, split_rows in (("x_train:", x_train), ("x_test:", x_test)):
    print(split_label, split_rows)

x_train: [[1 40.0 61000.0]
 [2 37.0 67000.0]
 [0 27.0 48000.0]
 [0 38.0 52000.0]
 [2 48.0 79000.0]
 [0 38.0 61000.0]
 [2 44.0 72000.0]
 [2 35.0 58000.0]]
x_test: [[1 30.0 54000.0]
 [1 50.0 83000.0]]


In [159]:
# Show the target values of each split:
# y_train holds 80% of the data (training), y_test holds 20% (testing).
for split_label, split_targets in (("y_train:", y_train), ("y_test:", y_test)):
    print(split_label, split_targets)

y_train: [1 1 1 0 1 0 0 1]
y_test: [0 0]
