In [None]:
# Dataset used in this notebook: Data.csv

In [None]:
pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**Step 1: Importing the libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import category_encoders as ce

**Step 2: Importing dataset**

In [None]:
# Load the CSV that sits next to the notebook into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='Data.csv')

**Step 3: Handling the missing data**

In [None]:
# Display the DataFrame as loaded, before any imputation.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [None]:
# Replace missing numeric values with the mean of their column.
# numeric_only=True restricts the mean to numeric columns: the frame still
# contains the string columns Country and Purchased at this step, and
# pandas >= 2.0 raises TypeError when asked for the mean of object columns.
# Reassigning (instead of inplace=True) keeps the step an explicit expression.
df = df.fillna(df.mean(numeric_only=True))

In [None]:
# Display the DataFrame again to check the result of the fillna step.
df

Unnamed: 0,Age,Salary,Purchased,Con_1,Con_2
0,44.0,72000.0,0,0,0
1,27.0,48000.0,1,0,1
2,30.0,54000.0,0,1,0
3,38.0,61000.0,0,0,1
4,40.0,63777.777778,1,1,0
5,35.0,58000.0,1,0,0
6,38.777778,52000.0,0,0,1
7,48.0,79000.0,1,0,0
8,50.0,83000.0,0,1,0
9,37.0,67000.0,1,0,0


In [None]:
# (rows, columns) of the current DataFrame.
df.shape

(10, 4)

In [None]:
# Per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [None]:
# Boolean mask of the whole frame: True wherever a value is missing.
df.isna()

Unnamed: 0,Age,Salary,Purchased,Con_1,Con_2
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
5,False,False,False,False,False
6,False,False,False,False,False
7,False,False,False,False,False
8,False,False,False,False,False
9,False,False,False,False,False


**Step 4: Encoding categorical data**

In [None]:
# Display the DataFrame before encoding the categorical columns.
df

Unnamed: 0,Age,Salary,Purchased,Con_1,Con_2
0,44.0,72000.0,0,0,0
1,27.0,48000.0,1,0,1
2,30.0,54000.0,0,1,0
3,38.0,61000.0,0,0,1
4,40.0,63777.777778,1,1,0
5,35.0,58000.0,1,0,0
6,38.777778,52000.0,0,0,1
7,48.0,79000.0,1,0,0
8,50.0,83000.0,0,1,0
9,37.0,67000.0,1,0,0


In [None]:
# Turn the Purchased column into integer category codes in one chained step:
# cast to pandas' categorical dtype, then keep only the per-category codes.
df['Purchased'] = df['Purchased'].astype('category').cat.codes
print(df)

         Age        Salary  Purchased  Con_1  Con_2
0  44.000000  72000.000000          0      0      0
1  27.000000  48000.000000          1      0      1
2  30.000000  54000.000000          0      1      0
3  38.000000  61000.000000          0      0      1
4  40.000000  63777.777778          1      1      0
5  35.000000  58000.000000          1      0      0
6  38.777778  52000.000000          0      0      1
7  48.000000  79000.000000          1      0      0
8  50.000000  83000.000000          0      1      0
9  37.000000  67000.000000          1      0      0


**Step 5: Creating dummy variables**

In [None]:
# One-hot encode Country into "Con_*" indicator columns.
# drop_first=True omits the first category's column, so that category is
# represented by all remaining indicators being zero.
df = pd.get_dummies(
    data=df,
    columns=['Country'],
    prefix=['Con'],
    drop_first=True,
)

In [None]:
# Display the DataFrame after the dummy-variable step.
df

Unnamed: 0,Age,Salary,Purchased,Con_1,Con_2
0,44.0,72000.0,0,0,0
1,27.0,48000.0,1,0,1
2,30.0,54000.0,0,1,0
3,38.0,61000.0,0,0,1
4,40.0,63777.777778,1,1,0
5,35.0,58000.0,1,0,0
6,38.777778,52000.0,0,0,1
7,48.0,79000.0,1,0,0
8,50.0,83000.0,0,1,0
9,37.0,67000.0,1,0,0


**Step 6: Splitting the dataset into training and test sets**

In [None]:
# Features: only the Age and Salary columns. Target: the Purchased column.
X = df.loc[:, ['Age', 'Salary']].to_numpy()
y = df.loc[:, 'Purchased'].to_numpy()

# Hold out 25% of the rows for testing. random_state is fixed so the split,
# and every model fitted on it, is reproducible across kernel restarts;
# without it each run shuffles the rows differently.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Display the training and test feature arrays produced by the split.
X_train,X_test

(array([[3.87777778e+01, 5.20000000e+04],
        [4.40000000e+01, 7.20000000e+04],
        [3.70000000e+01, 6.70000000e+04],
        [4.80000000e+01, 7.90000000e+04],
        [3.00000000e+01, 5.40000000e+04],
        [2.70000000e+01, 4.80000000e+04],
        [3.50000000e+01, 5.80000000e+04]]),
 array([[4.00000000e+01, 6.37777778e+04],
        [5.00000000e+01, 8.30000000e+04],
        [3.80000000e+01, 6.10000000e+04]]))

In [None]:
# Display the training and test target arrays produced by the split.
y_train,y_test

(array([0, 0, 1, 1, 0, 1, 1], dtype=int8), array([1, 0, 0], dtype=int8))

In [None]:
# Shapes of the training and test feature arrays.
X_train.shape,X_test.shape

((7, 2), (3, 2))

**Step 7: Feature Scaling**

In [None]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Summary statistics of the DataFrame's numeric columns
# (computed on df itself, not on the scaled X_train / X_test arrays).
df.describe()

Unnamed: 0,Age,Salary,Purchased,Con_1,Con_2
count,10.0,10.0,10.0,10.0,10.0
mean,38.777778,63777.777778,0.5,0.3,0.3
std,7.253777,11564.099406,0.527046,0.483046,0.483046
min,27.0,48000.0,0.0,0.0,0.0
25%,35.5,55000.0,0.0,0.0,0.0
50%,38.388889,62388.888889,0.5,0.0,0.0
75%,43.0,70750.0,1.0,0.75,0.75
max,50.0,83000.0,1.0,1.0,1.0


In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the training split.
# Note: despite the variable name, `regressor` holds a classifier here.
regressor = LogisticRegression()
regressor.fit(X=X_train, y=y_train)

LogisticRegression()

In [None]:
# Display the feature arrays currently bound to X_train and X_test.
X_train,X_test

(array([[ 0.24419661, -0.88881332],
        [ 1.009346  ,  0.99654827],
        [-0.01627977,  0.52520787],
        [ 1.59541787,  1.65642482],
        [-1.04190555, -0.70027716],
        [-1.48145945, -1.26588564],
        [-0.30931571, -0.32320484]]), array([[4.00000000e+01, 6.37777778e+04],
        [5.00000000e+01, 8.30000000e+04],
        [3.80000000e+01, 6.10000000e+04]]))

In [None]:
# Display the target arrays currently bound to y_train and y_test.
y_train,y_test

(array([0, 0, 1, 1, 0, 1, 1], dtype=int8), array([1, 0, 0], dtype=int8))

In [None]:
# Print the fitted intercept followed by the fitted coefficients of `regressor`.
print(regressor.intercept_, regressor.coef_)

[0.28957757] [[-0.28547033  0.37619867]]


In [None]:
# Predict on the held-out features with the current `regressor` and show the result.
y_pred = regressor.predict(X=X_test)
y_pred

array([1, 1, 1], dtype=int8)

In [None]:
from sklearn.linear_model import LinearRegression

# Rebind `regressor` to a linear regression fitted on the same training split.
# The logistic model fitted earlier is no longer reachable through this name.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression()

In [None]:
# Print the fitted intercept followed by the fitted coefficients of `regressor`.
print(regressor.intercept_, regressor.coef_) 

0.5714285714285714 [-0.44655668  0.47557159]


In [None]:
# Predict on the held-out features with the current `regressor` and show the result.
y_pred = regressor.predict(X=X_test)
y_pred

array([30313.60853046, 39450.68580487, 28993.46944134])