# Polynomial Regression
What is a polymial regression??  
Answer here: 

In [79]:
import numpy as np
import matplotlib.pyplot as plt # for plotting, for visualizations
import pandas as pd # for data manipulation and analysis

## Importing the dataset

In [80]:
dataset = pd.read_csv('Data - Copy.csv')

#dataset.iloc[row_index, column_index]
X = dataset.iloc[:, :-1].values
#iloc[:, :-1].values means that we are taking all the rows and all the columns except the last one

Y = dataset.iloc[:, 1].values
#[:, 1] means that we are taking all the rows and the second column

In [81]:
print(dataset)
print(type(X))

    Country   Age   Salary PurchasedProduct
0   Finland  42.0  45000.0              yes
1   Finland  45.0  39000.0              yes
2   Finland  30.0  30000.0              yes
3   Finland  34.0  41000.0              yes
4    Sweden  49.0  48000.0              yes
5    Sweden  40.0  38000.0              yes
6    Sweden  30.0  31000.0               no
7    Sweden  35.0  35000.0               no
8    Sweden  31.0  40000.0              yes
9    Sweden  35.0  37000.0               no
10   Sweden  36.0  35000.0               no
11   Sweden  36.0      NaN               no
12   Norway  45.0  46000.0              yes
13   Norway  39.0  33000.0               no
14   Norway  33.0  31000.0               no
15   Norway  32.0  42000.0              yes
16   Norway  37.0  35000.0               no
17   Norway  34.0  36000.0               no
18   Norway   NaN  39000.0               no
<class 'numpy.ndarray'>


In [82]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') #missing_values=np.nan means that we are replacing the missing values with NaN
imputer = imputer.fit(X[:, 1:3]) #fitting the imputer object to the matrix X
X[:, 1:3] = imputer.transform(X[:, 1:3]) #transforming the matrix X

In [83]:
print(X)
print(type(X))

[['Finland' 42.0 45000.0]
 ['Finland' 45.0 39000.0]
 ['Finland' 30.0 30000.0]
 ['Finland' 34.0 41000.0]
 ['Sweden' 49.0 48000.0]
 ['Sweden' 40.0 38000.0]
 ['Sweden' 30.0 31000.0]
 ['Sweden' 35.0 35000.0]
 ['Sweden' 31.0 40000.0]
 ['Sweden' 35.0 37000.0]
 ['Sweden' 36.0 35000.0]
 ['Sweden' 36.0 37833.333333333336]
 ['Norway' 45.0 46000.0]
 ['Norway' 39.0 33000.0]
 ['Norway' 33.0 31000.0]
 ['Norway' 32.0 42000.0]
 ['Norway' 37.0 35000.0]
 ['Norway' 34.0 36000.0]
 ['Norway' 36.833333333333336 39000.0]]
<class 'numpy.ndarray'>


## Data preparation

### Encoding the independent variable (X)

In [84]:
from sklearn.compose import ColumnTransformer #ColumnTransformer is a class that helps in transforming the columns
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough') 
# transformer means that we are transforming the data, 
# encoding mean that we are converting the data into a form that can be used by the machine learning model
# OneHotEncoder is a class that helps in encoding the data
# [0] means that we are encoding the first column, 
# remainder='passthrough' means that we are keeping the remaining columns as they are

X = np.array(ct.fit_transform(X))
#fit_transform means that we are fitting the data and transforming it

In [85]:
print(X)

[[1.0 0.0 0.0 42.0 45000.0]
 [1.0 0.0 0.0 45.0 39000.0]
 [1.0 0.0 0.0 30.0 30000.0]
 [1.0 0.0 0.0 34.0 41000.0]
 [0.0 0.0 1.0 49.0 48000.0]
 [0.0 0.0 1.0 40.0 38000.0]
 [0.0 0.0 1.0 30.0 31000.0]
 [0.0 0.0 1.0 35.0 35000.0]
 [0.0 0.0 1.0 31.0 40000.0]
 [0.0 0.0 1.0 35.0 37000.0]
 [0.0 0.0 1.0 36.0 35000.0]
 [0.0 0.0 1.0 36.0 37833.333333333336]
 [0.0 1.0 0.0 45.0 46000.0]
 [0.0 1.0 0.0 39.0 33000.0]
 [0.0 1.0 0.0 33.0 31000.0]
 [0.0 1.0 0.0 32.0 42000.0]
 [0.0 1.0 0.0 37.0 35000.0]
 [0.0 1.0 0.0 34.0 36000.0]
 [0.0 1.0 0.0 36.833333333333336 39000.0]]


### Encoding the dependent variable (Y)

In [86]:
from sklearn.preprocessing import LabelEncoder #LabelEncoder is a class that helps in encoding the labels
le = LabelEncoder() 
Y = le.fit_transform(Y) #fit_transform means that we are fitting the data and transforming it

In [87]:
print(Y)

[10 11  0  4 12  9  0  5  1  5  6  6 11  8  3  2  7  4 13]


## Train & Test the model

In [88]:
from sklearn.model_selection import train_test_split #train_test_split is a function that helps in splitting the data into training and testing sets

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
#create the training and testing sets
#random state = 1 means that the data is split in a random manner
#test_size = 0.2 means that 20% of the data is used for testing and 80% of the data is used for training

In [89]:
print(X_train)

[[1.0 0.0 0.0 30.0 30000.0]
 [0.0 1.0 0.0 33.0 31000.0]
 [0.0 0.0 1.0 49.0 48000.0]
 [0.0 1.0 0.0 37.0 35000.0]
 [0.0 0.0 1.0 35.0 35000.0]
 [1.0 0.0 0.0 45.0 39000.0]
 [0.0 1.0 0.0 39.0 33000.0]
 [1.0 0.0 0.0 42.0 45000.0]
 [0.0 1.0 0.0 36.833333333333336 39000.0]
 [0.0 1.0 0.0 34.0 36000.0]
 [0.0 0.0 1.0 35.0 37000.0]
 [0.0 0.0 1.0 31.0 40000.0]
 [0.0 1.0 0.0 45.0 46000.0]
 [0.0 0.0 1.0 36.0 37833.333333333336]
 [0.0 0.0 1.0 40.0 38000.0]]
