## Importing Libraries

In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
import scipy.stats as stats

## Loading Dataset

In [42]:
# Read the customer data from a CSV file located relative to the notebook's
# working directory, then display the frame as the cell output.
df = pd.read_csv("Customers.csv")
df

Unnamed: 0,CustomerID,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
0,1,Male,19,15000,39,Healthcare,1,4
1,2,Male,21,35000,81,Engineer,3,3
2,3,Female,20,86000,6,Engineer,1,1
3,4,Female,23,59000,77,Lawyer,0,2
4,5,Female,31,38000,40,Entertainment,2,6
...,...,...,...,...,...,...,...,...
1995,1996,Female,71,184387,40,Artist,8,7
1996,1997,Female,91,73158,32,Doctor,7,7
1997,1998,Male,87,90961,14,Healthcare,9,2
1998,1999,Male,77,182109,4,Executive,7,2


In [43]:
# Print each column's dtype and non-null count; a non-null count below the
# row count indicates missing values in that column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              2000 non-null   int64 
 1   Gender                  2000 non-null   object
 2   Age                     2000 non-null   int64 
 3   Annual Income ($)       2000 non-null   int64 
 4   Spending Score (1-100)  2000 non-null   int64 
 5   Profession              1965 non-null   object
 6   Work Experience         2000 non-null   int64 
 7   Family Size             2000 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 125.1+ KB


In [44]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the reported minimum for Age and Annual Income ($) is 0 —
# possibly placeholder or invalid entries; verify before modelling.
df.describe()

Unnamed: 0,CustomerID,Age,Annual Income ($),Spending Score (1-100),Work Experience,Family Size
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1000.5,48.96,110731.8215,50.9625,4.1025,3.7685
std,577.494589,28.429747,45739.536688,27.934661,3.922204,1.970749
min,1.0,0.0,0.0,0.0,0.0,1.0
25%,500.75,25.0,74572.0,28.0,1.0,2.0
50%,1000.5,48.0,110045.0,50.0,3.0,4.0
75%,1500.25,73.0,149092.75,75.0,7.0,5.0
max,2000.0,99.0,189974.0,100.0,17.0,9.0


## Finding Null Values

In [45]:
# Count the missing entries in every column.
df.isna().sum()

CustomerID                 0
Gender                     0
Age                        0
Annual Income ($)          0
Spending Score (1-100)     0
Profession                35
Work Experience            0
Family Size                0
dtype: int64

In [46]:
# Number of missing values in the Profession column alone.
df["Profession"].isna().sum()

35

## Filling Null Values

In [47]:
# Forward-fill missing professions with the value from the preceding row.
# The result is assigned back to the column instead of calling an in-place
# method on `df["Profession"]`: that chained in-place form operates on an
# intermediate Series and does not update `df` under pandas Copy-on-Write.
# `.ffill()` also replaces the deprecated `fillna(method="ffill")` spelling.
# Note: forward fill cannot fill a missing value in the very first row.
df["Profession"] = df["Profession"].ffill()

# Confirm that no missing professions remain.
df["Profession"].isnull().sum()

0

## Standardization (Manual Z-Score using the Sample Standard Deviation)

In [48]:
# Select the annual-income column to use as the input for the scaling cells.
# Despite the name, `test_df` is a single-column pandas Series, not a DataFrame.
test_df = df['Annual Income ($)']
test_df

0        15000
1        35000
2        86000
3        59000
4        38000
         ...  
1995    184387
1996     73158
1997     90961
1998    182109
1999    110610
Name: Annual Income ($), Length: 2000, dtype: int64

In [49]:
# Standardize income: subtract the mean, then divide by the standard deviation.
# pandas' Series.std() uses the sample standard deviation (ddof=1) by default.
normalized_df = test_df.sub(test_df.mean()).div(test_df.std())
print(normalized_df)

0      -2.092978
1      -1.655719
2      -0.540710
3      -1.131009
4      -1.590130
          ...   
1995    1.610318
1996   -0.821474
1997   -0.432248
1998    1.560514
1999   -0.002663
Name: Annual Income ($), Length: 2000, dtype: float64


## Turning Categorical Feature into Numeric Type

In [50]:
# Encode Gender as integers: Male -> 0, Female -> 1.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df['Gender']`: that chained in-place form
# operates on an intermediate Series and does not update `df` under pandas
# Copy-on-Write. Values other than 'Male'/'Female' are left unchanged, so
# re-running this cell is harmless.
# infer_objects() restores an integer dtype when replace leaves the column
# as object dtype.
df['Gender'] = df['Gender'].replace({'Male': 0, 'Female': 1}).infer_objects()

In [51]:
# Display the frame to confirm that Gender now holds 0/1 codes.
df

Unnamed: 0,CustomerID,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
0,1,0,19,15000,39,Healthcare,1,4
1,2,0,21,35000,81,Engineer,3,3
2,3,1,20,86000,6,Engineer,1,1
3,4,1,23,59000,77,Lawyer,0,2
4,5,1,31,38000,40,Entertainment,2,6
...,...,...,...,...,...,...,...,...
1995,1996,1,71,184387,40,Artist,8,7
1996,1997,1,91,73158,32,Doctor,7,7
1997,1998,0,87,90961,14,Healthcare,9,2
1998,1999,0,77,182109,4,Executive,7,2


## Z-Score using SciPy

In [52]:
# Z-score via SciPy. ddof=0 (SciPy's default, spelled out here) divides by the
# population standard deviation, so results differ slightly from a z-score
# computed with pandas' Series.std(), which defaults to ddof=1.
stats.zscore(test_df, ddof=0)

0      -2.093501
1      -1.656133
2      -0.540845
3      -1.131292
4      -1.590528
          ...   
1995    1.610720
1996   -0.821679
1997   -0.432356
1998    1.560904
1999   -0.002664
Name: Annual Income ($), Length: 2000, dtype: float64