In [131]:
#Data.csv

**Step 1: Importing the libraries**

In [132]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

**Step 2: Importing dataset**

In [133]:
df = pd.read_csv("/content/Data.csv")
df.shape

(10, 4)

In [134]:
df.head(10)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 3: Handling the missing data**

In [135]:
ndf = df.dropna(axis=0)
ndf.reset_index(drop = True)
ndf

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 4: Encoding categorical data**

In [136]:
ndf.dtypes

Country       object
Age          float64
Salary       float64
Purchased     object
dtype: object

In [137]:
ndf['Purchased'] = ndf['Purchased'].map({'Yes':1, 'No':0})
ndf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
5,France,35.0,58000.0,1
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


In [138]:
ndf['Country'].value_counts()

France     4
Spain      2
Germany    2
Name: Country, dtype: int64

**Step 5: Creating a dummy variable**

In [139]:
ndf = pd.get_dummies(ndf,columns=['Country'])
ndf

Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
5,35.0,58000.0,1,1,0,0
7,48.0,79000.0,1,1,0,0
8,50.0,83000.0,0,0,1,0
9,37.0,67000.0,1,1,0,0


**Step 6: Splitting the datasets into training sets and Test sets**

In [140]:
#purchased is our target variable so we will assign it to y.

X = ndf.drop(columns=['Purchased'])
y = ndf['Purchased']
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size = 0.3)
x_train

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain
7,48.0,79000.0,1,0,0
9,37.0,67000.0,1,0,0
8,50.0,83000.0,0,1,0
1,27.0,48000.0,0,0,1
5,35.0,58000.0,1,0,0


**Step 7: Feature Scaling**

In [141]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_train

array([[ 1.00627821,  0.9247199 ,  0.81649658, -0.5       , -0.5       ],
       [-0.28082183,  0.        ,  0.81649658, -0.5       , -0.5       ],
       [ 1.2402964 ,  1.23295986, -1.22474487,  2.        , -0.5       ],
       [-1.45091277, -1.46413984, -1.22474487, -0.5       ,  2.        ],
       [-0.51484001, -0.69353992,  0.81649658, -0.5       , -0.5       ]])