<a href="https://colab.research.google.com/github/RakeshBB08/MachineLearning/blob/master/MachineLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



---



# **DATA PREPROCESSING TOOLS:**


---





1) Importing the libraries

---



In [None]:
# NumPy: array data and vectorised numerical operations.
import numpy as np
# Matplotlib: plotting charts.
import matplotlib.pyplot as plt
# pandas: loading and working with the tabular dataset.
import pandas as pd

Importing the Dataset

---



In [None]:
# Read the raw table; 'Data.csv' is resolved relative to the working directory.
dataset = pd.read_csv('Data.csv')

# iloc selects by position: [row_start:row_end, col_start:col_end], end exclusive.
# Features: every row, every column except the last one.
X = dataset.iloc[:, :-1].values
# Target: every row, last column only.
y = dataset.iloc[:, -1].values

In [None]:
# Show the feature matrix (all columns except the last) to check the load.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [None]:
# Show the target vector (the last column of the dataset).
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


Taking care of missing data

---



In [None]:
# Two common ways to deal with missing values:
#   1) drop the rows that contain them, or
#   2) replace each missing value with the mean of its column (done here).

# `mean` is imported from NumPy's public namespace. The previous path,
# numpy.core.fromnumeric, is a private module that is deprecated in NumPy 2.
# Nothing in this cell uses `mean`; the name is kept in scope in case another
# cell relies on it.
from numpy import mean
from sklearn.impute import SimpleImputer

# Choose the imputation strategy first: NaN entries are replaced by the column mean.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# The slice 1:3 selects columns 1 and 2 (the end index is exclusive); column 0
# holds text and cannot be averaged. fit_transform learns the column means and
# fills the gaps in one step.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# **Encoding categorical data**

---



Encoding the independent variable

---



In [None]:
# One-hot encoding replaces a categorical column with one 0/1 column per
# category, so the model cannot read a numeric order or distance into the
# category labels.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    # Apply a OneHotEncoder instance to column 0 only.
    transformers=[('encoder', OneHotEncoder(), [0])],
    # Keep every other column unchanged; they are appended after the encoded ones.
    remainder='passthrough',
    # Always return a dense array. With the default threshold (0.3) a sparse
    # matrix is returned when the stacked output is mostly zeros, and np.array()
    # on a sparse matrix produces a useless 0-d object array.
    sparse_threshold=0,
)
# np.array() guarantees a plain ndarray for the later cells that slice X.
X = np.array(ct.fit_transform(X))
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

Encoding the Dependent variable

---



In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping, then apply it to y.
# In the output below, 'No' becomes 0 and 'Yes' becomes 1.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

# Splitting the dataset into the Training set and Test set

---



In [None]:
from sklearn.model_selection import train_test_split

# Produces four arrays: X_train, X_test, y_train, y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,   # hold out 20% of the rows for testing
    random_state=0,  # seed for the shuffle, so the split is reproducible
)


In [None]:
# Training features (the rows not held out by test_size=0.2).
X_train

array([[0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [None]:
# Test features (the held-out 20% of the rows).
X_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0]], dtype=object)

In [None]:
# Encoded labels for the training rows.
y_train

array([1, 1, 1, 0, 1, 0, 0, 1])

In [None]:
# Encoded labels for the held-out test rows.
y_test

array([0, 0])

# Feature Scaling

---



In [None]:
# Feature scaling is applied AFTER splitting the dataset: the scaler's
# statistics (mean and standard deviation) must be learned from the training
# set only. Computing them on data that includes the test rows would leak
# information about the test set into training.
# Standardisation, (x - mean) / std, typically gives values in roughly [-3, 3];
# min-max normalisation, (x - min) / (max - min), gives values in [0, 1].
# Standardisation works well in most cases, which is why it is used here.

from sklearn.preprocessing import StandardScaler

# Only columns 3 onward are scaled; columns 0-2 are the one-hot columns,
# which are already 0/1.
sc = StandardScaler()
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
# Reuse the mean/std learned from the training rows. Fitting a second scaler on
# the test rows would put them on a different scale from the training data.
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [None]:
# Training features after scaling: columns 3 onward are standardised,
# the one-hot columns 0-2 are unchanged.
X_train

array([[0.0, 1.0, 0.0, 0.2630675731713538, 0.1238147854838185],
       [1.0, 0.0, 0.0, -0.25350147960148617, 0.4617563176278856],
       [0.0, 0.0, 1.0, -1.9753983221776195, -1.5309334063940294],
       [0.0, 0.0, 1.0, 0.05261351463427101, -1.1114197802841526],
       [1.0, 0.0, 0.0, 1.6405850472322605, 1.7202971959575162],
       [0.0, 0.0, 1.0, -0.08131179534387283, -0.16751412153692966],
       [1.0, 0.0, 0.0, 0.9518263102018072, 0.9861483502652316],
       [1.0, 0.0, 0.0, -0.5978808481167128, -0.48214934111933727]],
      dtype=object)

In [None]:
# Test features after scaling.
# NOTE(review): the saved output below still shows unscaled values, so it looks
# stale. Re-run this cell after the feature-scaling cell.
X_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0]], dtype=object)

**Simple Linear Regression**
1. Ordinary Least Squares

$y = b_0 + b_1 x$
In OLS, the coefficients $b_0$ and $b_1$ are chosen so that the sum of the squared differences between the actual and the predicted values of $y$ is as small as possible.