<a href="https://colab.research.google.com/github/Shashank975/Practice-Projects-ML/blob/main/Standardization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the Necessary Libraries

In [158]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler


# Fetching the Data

In [159]:
# Read the Social Network Ads CSV from /content into a DataFrame,
# then display five randomly chosen rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="/content/Social_Network_Ads.csv")
df.sample(n=5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
64,15605000,Female,59,83000,0
289,15713912,Female,37,78000,1
122,15724423,Female,40,75000,0
283,15663249,Female,52,21000,1
392,15748589,Female,45,45000,1


In [160]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [161]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [162]:
# List the column labels of the frame.
df.columns

Index(['User ID', 'Gender', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')

In [163]:
# Count missing values per column (axis=0 sums down each column).
df.isnull().sum(axis=0)

Unnamed: 0,0
User ID,0
Gender,0
Age,0
EstimatedSalary,0
Purchased,0


In [164]:
# Remove the "User ID" identifier column from the feature table.
# errors="ignore" keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been dropped.
df.drop(columns="User ID", inplace=True, errors="ignore")

# ```Note:```
**Gender: This is a binary categorical variable that is encoded as 0 and 1 in the next cell. Even though it is represented as numbers, it does not need to be scaled, because the numbers are only category labels. Scaling could obscure that meaning.**

In [165]:
# Encode Gender as integers: Male -> 1, Female -> 0.
# The guard makes the cell idempotent: calling .map() again on the already
# encoded 0/1 column would match no dictionary key and turn every value into NaN.
gender_codes = {"Male": 1, "Female": 0}
if df["Gender"].isin(list(gender_codes)).any():
    # Raw string labels are still present, so apply the encoding.
    df["Gender"] = df["Gender"].map(gender_codes)
df

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,1,19,19000,0
1,1,35,20000,0
2,0,26,43000,0
3,0,27,57000,0
4,1,19,76000,0
...,...,...,...,...
395,0,46,41000,1
396,1,51,23000,1
397,0,50,20000,1
398,1,36,33000,0


# Now we are splitting the data

In [166]:
from sklearn.model_selection import train_test_split
# Separate the target column from the predictors.
# X holds every column except "Purchased"; y is the "Purchased" column itself.
y = df["Purchased"]
X = df.drop(columns="Purchased")

In [167]:
# (rows, columns) of the feature matrix.
X.shape

(400, 3)

In [168]:
# Length of the target vector; should match the row count of X.
y.shape

(400,)

In [169]:
# Hold out 20% of the rows for testing. stratify=y keeps the class ratio of
# "Purchased" the same in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [170]:
# Print the shape of each split, in the order: X_train, X_test, y_train, y_test.
for i in (X_train, X_test, y_train, y_test):
    print(i.shape)

(320, 3)
(80, 3)
(320,)
(80,)


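# ```Note:```
**Age & EstimatedSalary: These are continuous numerical features measured on very different ranges (Age is 18–60, EstimatedSalary is 15,000–150,000).
Scaling: These are the features you typically want to standardize, so that both end up on a comparable scale.**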

In [171]:
# Create the (not yet fitted) standardizer used for the numeric features below.
scaler = StandardScaler()

In [172]:
# Names of the continuous columns that will be standardized.
# Gender is left out on purpose: it is a 0/1 category label.
feature_to_scaled = [
    "Age",
    "EstimatedSalary",
]
feature_to_scaled

['Age', 'EstimatedSalary']

In [173]:
# Learn the scaling statistics from the training rows only, so that no
# information from the test set leaks into the scaler.
scaler.fit(X_train[feature_to_scaled])
# Apply the same fitted transformation to both splits.
X_train_scaled_numerical = scaler.transform(X_train[feature_to_scaled])
X_test_scaled_numerical = scaler.transform(X_test[feature_to_scaled])

In [174]:
# Work on copies so the unscaled X_train / X_test stay available unchanged;
# the copies will carry the scaled numeric columns next to the untouched Gender column.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()

In [175]:
# Overwrite the numeric columns in the copies with their standardized values.
# The arrays come from scaler.transform on the same column list, so their
# column order matches feature_to_scaled.
X_train_scaled[feature_to_scaled] = X_train_scaled_numerical
X_test_scaled[feature_to_scaled] = X_test_scaled_numerical

In [176]:
# Preview the first five rows of the scaled training features (head() defaults to 5).
X_train_scaled.head()

Unnamed: 0,Gender,Age,EstimatedSalary
65,1,-1.234462,-0.367992
179,0,-0.577646,-1.058314
109,0,0.07917,0.264803
379,0,1.955786,-1.374712
325,0,0.360662,-0.310465


In [177]:
# Print both scaled feature tables, each under its own heading.
for heading, frame in (
    ("Scaled Training Features:\n", X_train_scaled),
    ("\nScaled Test Features:\n", X_test_scaled),
):
    print(heading, frame)

Scaled Training Features:
      Gender       Age  EstimatedSalary
65        1 -1.234462        -0.367992
179       0 -0.577646        -1.058314
109       0  0.079170         0.264803
379       0  1.955786        -1.374712
325       0  0.360662        -0.310465
..      ...       ...              ...
4         1 -1.703616         0.149750
18        1  0.829816        -1.230895
314       0  0.173001         0.236040
170       1 -1.515954         0.494911
106       0 -1.046800        -1.029551

[320 rows x 3 columns]

Scaled Test Features:
      Gender       Age  EstimatedSalary
331       0  1.017478         1.386577
92        1 -1.046800        -1.604819
1         1 -0.202323        -1.461002
234       0  0.079170         1.185233
136       0 -1.609785         0.322330
..      ...       ...              ...
393       1  2.143448        -0.828207
151       1  0.360662        -0.741916
126       1  0.454493        -0.166648
265       0  0.454493         1.070179
286       0 -0.014661       

# Now we use our model for prediction

In [178]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [179]:
# Logistic regression classifier with the library's default settings.
model = LogisticRegression()

In [180]:
# Train the model on the scaled training features and their labels.
model.fit(X_train_scaled,y_train)

In [181]:
# Make predictions on the test set; X_test_scaled was transformed with the
# scaler fitted on the training data, so it matches what the model was trained on.
y_pred = model.predict(X_test_scaled)

In [182]:
# Score the test-set predictions against the true labels:
# overall accuracy, the confusion matrix, and the per-class text report.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
class_report = classification_report(y_true=y_test, y_pred=y_pred)

In [183]:
# Report accuracy as a percentage, followed by the confusion matrix and the per-class report.
print("Accuracy:", accuracy*100)
for title, metric in (
    ("\nConfusion Matrix:\n", conf_matrix),
    ("\nClassification Report:\n", class_report),
):
    print(title, metric)

Accuracy: 81.25

Confusion Matrix:
 [[47  4]
 [11 18]]

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.92      0.86        51
           1       0.82      0.62      0.71        29

    accuracy                           0.81        80
   macro avg       0.81      0.77      0.78        80
weighted avg       0.81      0.81      0.81        80



In [195]:
# New data that you want to predict on. The columns must be the same, and in the
# same order, as the features the model was trained on (Gender, Age, EstimatedSalary).
new_data = pd.DataFrame({
    'Gender': [0],
    'Age': [50],
    # NOTE(review): the training salaries shown by df.describe() range from 15000
    # to 150000, so 50 looks like it may have been meant as 50000 — confirm.
    'EstimatedSalary': [50]
})

# Step 1: Scale the numerical features in the new data.
# Only transform() is used, so the statistics learned from the training set are reused.
new_data_scaled = new_data.copy()
new_data_scaled[feature_to_scaled] = scaler.transform(new_data[feature_to_scaled])

# Step 2: Use the trained model to predict (one label per input row)
prediction = model.predict(new_data_scaled)

# Step 3: Output the result.
# Iterate over the labels instead of testing `prediction == 1` directly: that
# comparison yields an array, and using it in `if` raises ValueError as soon as
# new_data contains more than one row.
for label in prediction:
    if label == 1:
        print("The person will buy the product")
    else:
        print("The person will not buy the product")


The person will not buy the product
