In [None]:
#Data.csv

**Step 1: Importing the libraries**

In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

**Step 2: Importing dataset**

In [3]:
dataset = pd.read_csv('Data.csv')
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 3: Handling the missing data**

In [4]:
#checking for null values
dataset.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [5]:
# Fill the Salary column's NaN values with average value of the column 'Salary' .
dataset['Salary'].fillna(dataset['Salary'].mean(),inplace=True)


In [7]:
# Fill the Age column's  NaN values with average value of the column 'Age'.
dataset['Age'].fillna(dataset['Age'].mean(),inplace=True)
dataset['Age'] = dataset['Age'].astype(int)

In [9]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44,72000.0,No
1,Spain,27,48000.0,Yes
2,Germany,30,54000.0,No
3,Spain,38,61000.0,No
4,Germany,40,63777.777778,Yes
5,France,35,58000.0,Yes
6,Spain,38,52000.0,No
7,France,48,79000.0,Yes
8,Germany,50,83000.0,No
9,France,37,67000.0,Yes


**Step 4: Encoding categorical data**

In [10]:
# Label Encoding Purchased data
label_encoder = LabelEncoder()
dataset['Purchased']= label_encoder.fit_transform(dataset['Purchased'])

**Step 5: Creating a dummy variable**

In [11]:
dataset = pd.get_dummies(dataset, columns=['Country'])
dataset

Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Spain
0,44,72000.0,0,1,0,0
1,27,48000.0,1,0,0,1
2,30,54000.0,0,0,1,0
3,38,61000.0,0,0,0,1
4,40,63777.777778,1,0,1,0
5,35,58000.0,1,1,0,0
6,38,52000.0,0,0,0,1
7,48,79000.0,1,1,0,0
8,50,83000.0,0,0,1,0
9,37,67000.0,1,1,0,0


**Step 6: Splitting the datasets into training sets and Test sets**

In [12]:
X = dataset.drop(['Purchased'],axis=1)
Y = dataset['Purchased']

In [13]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.3)

**Step 7: Feature Scaling**

In [14]:
s_scale = StandardScaler()
x_train = s_scale.fit_transform(x_train)
x_test = s_scale.transform(x_test)

In [15]:
x_train

array([[ 0.14698618, -0.17298898, -1.15470054, -0.40824829,  1.58113883],
       [-0.29397237, -0.47571969,  0.8660254 , -0.40824829, -0.63245553],
       [ 1.02890329,  0.93702364,  0.8660254 , -0.40824829, -0.63245553],
       [-1.46986184, -1.48482207, -1.15470054, -0.40824829,  1.58113883],
       [ 0.        ,  0.43247245,  0.8660254 , -0.40824829, -0.63245553],
       [ 1.61684802,  1.64339531,  0.8660254 , -0.40824829, -0.63245553],
       [-1.02890329, -0.87936065, -1.15470054,  2.44948974, -0.63245553]])

In [16]:
x_test

array([[ 0.14698618, -1.08118112, -1.15470054, -0.40824829,  1.58113883],
       [ 0.44095855,  0.10731724, -1.15470054,  2.44948974, -0.63245553],
       [ 1.91082039,  2.04703626, -1.15470054,  2.44948974, -0.63245553]])