In [102]:
#Data.csv

**Step 1: Importing the libraries**

In [103]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

**Step 2: Importing dataset**

In [104]:
data = pd.read_csv("Data.csv")
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


**Step 3: Handling the missing data**

In [105]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [106]:
data.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [107]:
data.dropna(inplace=True)
print(data)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
5   France  35.0  58000.0       Yes
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


**Step 4: Encoding categorical data**

In [108]:

enc = OrdinalEncoder()
data['Purchased']=enc.fit_transform(data[['Purchased']])
data['Purchased'] = data['Purchased'].astype(int)


**Step 5: Creating a dummy variable**

In [109]:
# Dummy variable creation is not needed for this dataset
print(data['Country'].value_counts())
one_hot_encoded_data = pd.get_dummies(data, columns = ['Country'],drop_first=True)

#one_hot_encoded_data=one_hot_encoded_data.drop("")
one_hot_encoded_data.insert(len(one_hot_encoded_data.columns)-1, 'Purchased', one_hot_encoded_data.pop('Purchased'))
print(one_hot_encoded_data)

France     4
Germany    2
Spain      2
Name: Country, dtype: int64
    Age   Salary  Country_Germany  Country_Spain  Purchased
0  44.0  72000.0                0              0          0
1  27.0  48000.0                0              1          1
2  30.0  54000.0                1              0          0
3  38.0  61000.0                0              1          0
5  35.0  58000.0                0              0          1
7  48.0  79000.0                0              0          1
8  50.0  83000.0                1              0          0
9  37.0  67000.0                0              0          1


**Step 6: Splitting the datasets into training sets and Test sets**

In [110]:
X=one_hot_encoded_data.iloc[:,:-1]
print(X)
y=one_hot_encoded_data.iloc[:,-1]
print(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

    Age   Salary  Country_Germany  Country_Spain
0  44.0  72000.0                0              0
1  27.0  48000.0                0              1
2  30.0  54000.0                1              0
3  38.0  61000.0                0              1
5  35.0  58000.0                0              0
7  48.0  79000.0                0              0
8  50.0  83000.0                1              0
9  37.0  67000.0                0              0
0    0
1    1
2    0
3    0
5    1
7    1
8    0
9    1
Name: Purchased, dtype: int32


**Step 7: Feature Scaling**

In [114]:

scaler = MinMaxScaler()

X_train[['Age','Salary']]=scaler.fit_transform(X_train[['Age','Salary']])
X_test[['Age','Salary']]=scaler.transform(X_test[['Age','Salary']])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[['Age','Salary']]=scaler.fit_transform(X_train[['Age','Salary']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[['Age','Salary']]=scaler.transform(X_test[['Age','Salary']])
A va

In [112]:
print(X_train)

        Age    Salary  Country_Germany  Country_Spain
3  0.478261  0.371429                0              1
8  1.000000  1.000000                1              0
1  0.000000  0.000000                0              1
9  0.434783  0.542857                0              0
0  0.739130  0.685714                0              0


In [113]:
print(X_test)

        Age  Salary  Country_Germany  Country_Spain
5  0.277778    0.16                0              0
7  1.000000    1.00                0              0
2  0.000000    0.00                1              0
