In [1]:
#Data.csv

**Step 1: Importing the libraries**

In [2]:
import pandas as pd 
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [3]:
from sklearn.impute import SimpleImputer

**Step 2: Importing dataset**

In [4]:
df = pd.read_csv("Data_1.csv")

In [5]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 3: Handling the missing data**

* the following dataframe seems to have no outliers 
* hence mean imputation seems valid choice

In [6]:
print(df["Age"].mean())
print(df["Salary"].mean())

38.77777777777778
63777.77777777778


### normal method of filling missing values in dataframe

In [7]:
df.fillna({"Age":round(df["Age"].mean(),1),
           "Salary":round(df["Salary"].mean(),1)},inplace=True)

In [8]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.8,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.8,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### using sklearn SimpleImputer technique

In [9]:
# mean_impute = SimpleImputer(strategy="mean")
# df["Age"] = mean_impute.fit_transform(df[["Age"]])
# df["Salary"]=mean_impute.fit_transform(df[["Salary"]])

In [10]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.8,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.8,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### lets try with Knn imputer --> neighbours of all elements in dataframe results in mean imputation

In [11]:
# num = [i for i in df.columns if df[i].dtypes != "O"]
# num

In [12]:
# from sklearn.impute import KNNImputer
# mean_knn = KNNImputer(n_neighbors=9)
# df1 = pd.DataFrame(mean_knn.fit_transform(df[num]),columns=["Age","Salary"])

In [13]:
# df1

In [14]:
# df_clear = df.drop(["Age","Salary"],axis=1)

In [15]:
# df_clear.insert(1,"Age",df1["Age"])
# df_clear.insert(2,"Salary",df1["Salary"])

**Step 4: Encoding categorical data**

In [16]:
cat=[col for col in df.columns if df[col].dtypes =="O"]
cat

['Country', 'Purchased']

* encoding the data using one hot encoder method

In [17]:
from sklearn.preprocessing import OneHotEncoder

In [18]:
one_cat = OneHotEncoder(sparse=False)
one_cat.fit_transform(df[cat])
print(one_cat.get_feature_names_out())

['Country_France' 'Country_Germany' 'Country_Spain' 'Purchased_No'
 'Purchased_Yes']


In [19]:
df_encoded = pd.DataFrame(one_cat.fit_transform(df[cat]),columns=one_cat.get_feature_names_out())

In [20]:
df_1encoded = pd.concat([df.drop(["Country","Purchased"],axis=1),df_encoded],axis=1)

**Step 5: Creating a dummy variable**

using get_dummies

In [21]:
df_d = pd.get_dummies(data=df,drop_first=True)

In [22]:
df_d

Unnamed: 0,Age,Salary,Country_Germany,Country_Spain,Purchased_Yes
0,44.0,72000.0,0,0,0
1,27.0,48000.0,0,1,1
2,30.0,54000.0,1,0,0
3,38.0,61000.0,0,1,0
4,40.0,63777.8,1,0,1
5,35.0,58000.0,0,0,1
6,38.8,52000.0,0,1,0
7,48.0,79000.0,0,0,1
8,50.0,83000.0,1,0,0
9,37.0,67000.0,0,0,1


In [23]:
x = df_d.drop("Salary",axis=1)
y = df_d["Salary"]

**Step 6: Splitting the datasets into training sets and Test sets**

In [24]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [25]:
x_train

Unnamed: 0,Age,Country_Germany,Country_Spain,Purchased_Yes
6,38.8,0,1,0
3,38.0,0,1,0
9,37.0,0,0,1
0,44.0,0,0,0
4,40.0,1,0,1
7,48.0,0,0,1
5,35.0,0,0,1
2,30.0,1,0,0


In [26]:
x_test

Unnamed: 0,Age,Country_Germany,Country_Spain,Purchased_Yes
1,27.0,0,1,1
8,50.0,1,0,0


**Step 7: Feature Scaling**

In [27]:
# age column can be feature scaled

In [28]:
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
scale.fit(df[["Age"]])

In [29]:
df_ss = pd.concat([pd.DataFrame(scale.transform(df[["Age"]]),
                        columns=scale.get_feature_names_out()),df_d]
          ,axis=1)
df_ss

Unnamed: 0,Age,Age.1,Salary,Country_Germany,Country_Spain,Purchased_Yes
0,0.758551,44.0,72000.0,0,0,0
1,-1.711826,27.0,48000.0,0,1,1
2,-1.275877,30.0,54000.0,1,0,0
3,-0.113347,38.0,61000.0,0,1,0
4,0.177286,40.0,63777.8,1,0,1
5,-0.549296,35.0,58000.0,0,0,1
6,0.002906,38.8,52000.0,0,1,0
7,1.339816,48.0,79000.0,0,0,1
8,1.630449,50.0,83000.0,1,0,0
9,-0.258663,37.0,67000.0,0,0,1


In [30]:
df_ss.columns=['Age', 'Age1', 'Salary', 'Country_Germany', 'Country_Spain',
       'Purchased_Yes']

In [31]:
df_ss.drop("Age1",axis=1,inplace=True)

In [32]:
df_ss

Unnamed: 0,Age,Salary,Country_Germany,Country_Spain,Purchased_Yes
0,0.758551,72000.0,0,0,0
1,-1.711826,48000.0,0,1,1
2,-1.275877,54000.0,1,0,0
3,-0.113347,61000.0,0,1,0
4,0.177286,63777.8,1,0,1
5,-0.549296,58000.0,0,0,1
6,0.002906,52000.0,0,1,0
7,1.339816,79000.0,0,0,1
8,1.630449,83000.0,1,0,0
9,-0.258663,67000.0,0,0,1
