In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file under the Kaggle input directory,
# then print one path per line.
input_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Social Network Ads CSV and discard its first two columns
# (selected by position), keeping everything from the third column onward.
data = pd.read_csv("/kaggle/input/social-network-ads/Social_Network_Ads.csv").iloc[:, 2:]

In [None]:
# Features are every column except the label; the label is "Purchased".
X = data.drop("Purchased", axis=1)
y = data.loc[:, "Purchased"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

**Applying standardization using scikit-learn's `StandardScaler`**

In [None]:
# Learn the per-column mean and standard deviation from the training split only.
scaler = StandardScaler().fit(X_train)

# Apply the statistics learned from the training data to both splits,
# so the test data is scaled with the training mean and sd as well.
scaled_X_train, scaled_X_test = (
    scaler.transform(part) for part in (X_train, X_test)
)

# Note: sklearn returns the scaled columns as arrays, not DataFrames.

In [None]:
# The scaler returned plain arrays, so rebuild DataFrames with the original
# column names. The original index is passed as well: train_test_split shuffles
# the rows, and a fresh 0..n-1 index would no longer match the index of
# y_train / y_test for any index-based operation (joins, boolean masks, concat).
X_train_scaled = pd.DataFrame(scaled_X_train, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaled_X_test, columns=X_test.columns, index=X_test.index)

In [None]:
# Summary statistics of the unscaled training features, for comparison
# with the scaled version.
X_train.describe()

In [None]:
# Summary statistics of the scaled training features, rounded to whole numbers:
# mean = 0 and sd = 1, thus standardization has been applied here.
X_train_scaled.describe().round(0)

**EFFECTS**

In [None]:
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

# Before scaling: features in their original units.
ax1.scatter(X_train['Age'], X_train['EstimatedSalary'])
ax1.set_title("Before Scaling")
ax1.set_xlabel("Age")
ax1.set_ylabel("EstimatedSalary")

# After scaling: the same points, expressed relative to the training mean and sd.
ax2.scatter(X_train_scaled['Age'], X_train_scaled['EstimatedSalary'], color='red')
ax2.set_title("After Scaling")
ax2.set_xlabel("Age (standardized)")
ax2.set_ylabel("EstimatedSalary (standardized)")
plt.show()

# The relative positions of the points are unchanged; only the axes differ.
# After scaling, both features are centred on 0 and measured in standard deviations.

In [None]:
# KDE plot: both features drawn on shared axes, before and after scaling
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

# before scaling
ax1.set_title('Before Scaling')
sns.kdeplot(X_train['Age'], ax=ax1, label='Age')
sns.kdeplot(X_train['EstimatedSalary'], ax=ax1, label='EstimatedSalary')
# Two different features share this axis, so use a neutral x-label and a legend
# instead of the name of whichever series happened to be plotted last.
ax1.set_xlabel('Value')
ax1.legend()

# after scaling
ax2.set_title('After Standard Scaling')
sns.kdeplot(X_train_scaled['Age'], ax=ax2, label='Age')
sns.kdeplot(X_train_scaled['EstimatedSalary'], ax=ax2, label='EstimatedSalary')
ax2.set_xlabel('Standardized value')
ax2.legend()
plt.show()

# Before scaling: the two features are on very different scales, so their
# densities cannot be compared on one axis.
# After scaling: both are centred on 0 with unit standard deviation, so the
# two curves are directly comparable.

*Note*: Standardization does not change the shape of a feature's distribution; it only shifts the mean to 0 and rescales the standard deviation to 1.

In [None]:
# Age distribution, raw versus standardized, side by side
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

# One entry per panel: (axes, source frame, panel title).
age_panels = [
    (ax1, X_train, 'Age Distribution Before Scaling'),
    (ax2, X_train_scaled, 'Age Distribution After Standard Scaling'),
]
for axis, frame, panel_title in age_panels:
    sns.kdeplot(frame['Age'], ax=axis)
    axis.set_title(panel_title)
plt.show()

In [None]:
# Salary distribution, raw versus standardized, side by side
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

# before scaling
ax1.set_title('Salary Distribution Before Scaling')
sns.kdeplot(X_train['EstimatedSalary'], ax=ax1)

# after scaling (title now reads "After Standard Scaling", matching the Age plot)
ax2.set_title('Salary Distribution After Standard Scaling')
sns.kdeplot(X_train_scaled['EstimatedSalary'], ax=ax2)
plt.show()

**Application of standardization to ML algorithms**<br>
1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Two default-configured classifiers: one to be trained on the raw features,
# the other on the standardized features.
lr, lr_scaled = LogisticRegression(), LogisticRegression()


In [None]:
# Train each model on its own version of the training features; labels are shared.
lr.fit(X=X_train, y=y_train)
lr_scaled.fit(X=X_train_scaled, y=y_train)


In [None]:
# Predict the test labels, giving each model the feature version it was trained on.
scaled_y_pred = lr_scaled.predict(X_test_scaled)
y_pred = lr.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Score both logistic-regression models against the same test labels.
lr_acc_raw = accuracy_score(y_test, y_pred)
lr_acc_scaled = accuracy_score(y_test, scaled_y_pred)

print("Normal Accuracy = ", lr_acc_raw)
print("Scaled Accuracy = ", lr_acc_scaled)

# The gap between the two scores is the effect of scaling on logistic regression.

2. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fix the seed on both trees. scikit-learn permutes the features at every split
# and breaks ties between equally good splits at random, so without a seed the
# raw-vs-scaled comparison could differ from run to run for reasons that have
# nothing to do with scaling.
dt = DecisionTreeClassifier(random_state=0)
dt_scaled = DecisionTreeClassifier(random_state=0)

In [None]:
# Train one tree on the raw features and one on the standardized features.
dt.fit(X=X_train, y=y_train)
dt_scaled.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict the test labels, giving each tree the feature version it was trained on.
y_pred_scaled = dt_scaled.predict(X_test_scaled)
y_pred = dt.predict(X_test)

In [None]:
# Score both decision trees against the same test labels.
dt_acc_raw = accuracy_score(y_test, y_pred)
dt_acc_scaled = accuracy_score(y_test, y_pred_scaled)

print("Normal Accuracy score = ", dt_acc_raw)
print("Scaled Accuracy score = ", dt_acc_scaled)

# Decision tree is not affected by scaling.

**Effect on Outlier**

In [None]:
# Inserting some outliers in the data.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# so the extra rows are attached with pd.concat instead.
outlier_rows = pd.DataFrame({
    'Age': [5, 90, 95],
    'EstimatedSalary': [1000, 250000, 350000],
    'Purchased': [0, 1, 1],
})
new_data = pd.concat([data, outlier_rows], ignore_index=True)


In [None]:
# Scatter of all rows, including the inserted ones, so the outliers are visible.
ages, salaries = new_data["Age"], new_data["EstimatedSalary"]
plt.scatter(ages, salaries)

In [None]:
# Split the data.
# The label is "Purchased". "EstimatedSalary" is a feature (it is still present
# in X after dropping "Purchased"), so it must not be used as the target.
X_train, X_test, y_train, y_test = train_test_split(
    new_data.drop(columns="Purchased"),
    new_data["Purchased"],
    random_state=0,
    test_size=0.2,
)

# Apply standardization to the new data
scaler = StandardScaler()

# fit() saves the mean and sd values of the training split
scaler.fit(X_train)

scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Convert the numpy arrays back to DataFrames. The original index is kept so the
# scaled rows stay aligned with y_train / y_test, whose index was shuffled by the split.
X_train_scaled = pd.DataFrame(scaled_X_train, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaled_X_test, columns=X_test.columns, index=X_test.index)

In [None]:
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

# Before scaling: features in their original units.
ax1.scatter(X_train['Age'], X_train['EstimatedSalary'])
ax1.set_title("Before Scaling")
ax1.set_xlabel("Age")
ax1.set_ylabel("EstimatedSalary")

# After scaling: the same points, expressed relative to the training mean and sd.
ax2.scatter(X_train_scaled['Age'], X_train_scaled['EstimatedSalary'], color='red')
ax2.set_title("After Scaling")
ax2.set_xlabel("Age (standardized)")
ax2.set_ylabel("EstimatedSalary (standardized)")
plt.show()

# As the plot shows, the outliers are still present after scaling.
# Standardization does not reduce the effect of outliers.