# Assignment 3

### Importing libraries

In [1]:
import pandas as pd
import numpy as np

### Importing the dataset

In [2]:
# Read the customer CSV into a DataFrame and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer="AWCustomers.csv")
dataset.head(n=5)

Unnamed: 0,CustomerID,Title,FirstName,MiddleName,LastName,Suffix,AddressLine1,AddressLine2,City,StateProvinceName,...,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,LastUpdated
0,21173,,Chad,C,Yuan,,7090 C. Mount Hood,,Wollongong,New South Wales,...,Bachelors,Clerical,M,M,1,3,0,1,81916,2017-03-06
1,13249,,Ryan,,Perry,,3651 Willow Lake Rd,,Shawnee,British Columbia,...,Partial College,Clerical,M,M,1,2,1,2,81076,2017-03-06
2,29350,,Julia,,Thompson,,1774 Tice Valley Blvd.,,West Covina,California,...,Bachelors,Clerical,F,S,0,3,0,0,86387,2017-03-06
3,13503,,Theodore,,Gomez,,2103 Baldwin Dr,,Liverpool,England,...,Partial College,Skilled Manual,M,M,1,2,1,2,61481,2017-03-06
4,22803,,Marshall,J,Shan,,Am Gallberg 234,,Werne,Nordrhein-Westfalen,...,Partial College,Skilled Manual,M,S,1,1,0,0,51804,2017-03-06


### Counting null values per feature

In [3]:
# Per-column count of missing values in the raw frame.
dataset.isnull().sum(axis=0)

CustomerID                  0
Title                   18260
FirstName                   0
MiddleName               7789
LastName                    0
Suffix                  18358
AddressLine1                0
AddressLine2            18050
City                        0
StateProvinceName           0
CountryRegionName           0
PostalCode                  0
PhoneNumber                 0
BirthDate                   0
Education                   0
Occupation                  0
Gender                      0
MaritalStatus               0
HomeOwnerFlag               0
NumberCarsOwned             0
NumberChildrenAtHome        0
TotalChildren               0
YearlyIncome                0
LastUpdated                 0
dtype: int64

In [4]:
# (rows, columns) of the raw frame as loaded from the CSV.
dataset.shape

(18361, 24)

## Part - I

### (a)

In [5]:
# Feature selection: build a new frame without the columns listed below.
# `dataset` itself is left unchanged because `drop` returns a copy.
# NOTE(review): 'Education' is dropped here although the attribute table in
# part (c) lists it, while 'MiddleName' is kept — confirm which is intended.
filtered_dataset = dataset.drop(
    columns=[
        'Title',
        'FirstName',
        'LastName',
        'Suffix',
        'AddressLine2',
        'City',
        'PhoneNumber',
        'Education',
        'LastUpdated',
    ]
)

### (b)

In [6]:
# Display the frame that remains after the column drop in part (a).
filtered_dataset

Unnamed: 0,CustomerID,MiddleName,AddressLine1,StateProvinceName,CountryRegionName,PostalCode,BirthDate,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome
0,21173,C,7090 C. Mount Hood,New South Wales,Australia,2500,1987-11-13,Clerical,M,M,1,3,0,1,81916
1,13249,,3651 Willow Lake Rd,British Columbia,Canada,V9B 2C3,1972-07-21,Clerical,M,M,1,2,1,2,81076
2,29350,,1774 Tice Valley Blvd.,California,United States,91791,1985-11-09,Clerical,F,S,0,3,0,0,86387
3,13503,,2103 Baldwin Dr,England,United Kingdom,L4 4HB,1977-10-18,Skilled Manual,M,M,1,2,1,2,61481
4,22803,J,Am Gallberg 234,Nordrhein-Westfalen,Germany,59368,1975-02-05,Skilled Manual,M,S,1,1,0,0,51804
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18356,25414,C,6627 Camelback Ct.,California,United States,92118,1990-11-11,Skilled Manual,F,M,0,1,0,2,52953
18357,11459,,9627 Kendall Rd,New South Wales,Australia,2444,1992-10-13,Skilled Manual,F,S,0,2,0,0,60992
18358,12160,,4364 Viera Avenue,Oregon,United States,97005,1983-11-24,Skilled Manual,F,S,0,2,0,0,51859
18359,14353,I,3866 Mt. Everest Court,British Columbia,Canada,V7L 4J4,1995-06-15,Clerical,F,S,0,0,0,0,87177


### (c)

| Attribute             | Discrete / Continuous   | Measurement Scale | Notes for Preprocessing                                             |
| --------------------- | ----------------------- | ----------------- | ------------------------------------------------------------------- |
| **CustomerID**        | Discrete                | Nominal           | Identifier, drop for modeling                                       |
| **AddressLine1**      | Discrete                | Nominal           | High-cardinality text, drop or encode only if location-level needed |
| **StateProvinceName** | Discrete                | Nominal           | One-hot encode if location is relevant                              |
| **CountryRegionName** | Discrete                | Nominal           | One-hot encode                                                      |
| **PostalCode**        | Discrete                | Nominal           | Drop unless used for grouping into regions                          |
| **BirthDate**         | Continuous (date → age) | Ratio             | Convert to **Age**, then normalize/standardize                      |
| **Education**         | Discrete                | Ordinal           | Dropped in (a), so absent from `filtered_dataset`; if retained, map to ordered integers (e.g., High School < Bachelor’s < Master’s) |
| **Occupation**        | Discrete                | Nominal           | One-hot encode                                                      |
| **HomeOwnerFlag**     | Discrete                | Nominal (binary)  | Map to 0/1                                                          |
| **NumberCarsOwned**   | Discrete                | Ratio             | Keep as is (integer)                                                |
| **TotalChildren**     | Discrete                | Ratio             | Keep as is (integer)                                                |
| **YearlyIncome**      | Continuous              | Ratio             | Normalize/standardize                                               |


## Part - II

### (a)

In [7]:
# Per-column count of missing values before any null handling.
filtered_dataset.isnull().sum(axis=0)

CustomerID                 0
MiddleName              7789
AddressLine1               0
StateProvinceName          0
CountryRegionName          0
PostalCode                 0
BirthDate                  0
Occupation                 0
Gender                     0
MaritalStatus              0
HomeOwnerFlag              0
NumberCarsOwned            0
NumberChildrenAtHome       0
TotalChildren              0
YearlyIncome               0
dtype: int64

In [10]:
# Handle missing values by forward-filling: each null cell takes the value of
# the nearest preceding non-null cell in the same column.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='pad')`, which
# emits a FutureWarning on recent pandas; the result is the same.
# NOTE(review): the only column with nulls is MiddleName, so this copies a
# different customer's middle initial into every gap. If MiddleName is kept as
# a feature, a placeholder value or dropping the column may be more
# appropriate — confirm.
filtered_dataset = filtered_dataset.ffill()
filtered_dataset

  filtered_dataset = filtered_dataset.fillna(method='pad')


Unnamed: 0,CustomerID,MiddleName,AddressLine1,StateProvinceName,CountryRegionName,PostalCode,BirthDate,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome
0,21173,C,7090 C. Mount Hood,New South Wales,Australia,2500,1987-11-13,Clerical,M,M,1,3,0,1,81916
1,13249,C,3651 Willow Lake Rd,British Columbia,Canada,V9B 2C3,1972-07-21,Clerical,M,M,1,2,1,2,81076
2,29350,C,1774 Tice Valley Blvd.,California,United States,91791,1985-11-09,Clerical,F,S,0,3,0,0,86387
3,13503,C,2103 Baldwin Dr,England,United Kingdom,L4 4HB,1977-10-18,Skilled Manual,M,M,1,2,1,2,61481
4,22803,J,Am Gallberg 234,Nordrhein-Westfalen,Germany,59368,1975-02-05,Skilled Manual,M,S,1,1,0,0,51804
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18356,25414,C,6627 Camelback Ct.,California,United States,92118,1990-11-11,Skilled Manual,F,M,0,1,0,2,52953
18357,11459,C,9627 Kendall Rd,New South Wales,Australia,2444,1992-10-13,Skilled Manual,F,S,0,2,0,0,60992
18358,12160,C,4364 Viera Avenue,Oregon,United States,97005,1983-11-24,Skilled Manual,F,S,0,2,0,0,51859
18359,14353,I,3866 Mt. Everest Court,British Columbia,Canada,V7L 4J4,1995-06-15,Clerical,F,S,0,0,0,0,87177


In [11]:
# Re-check per-column missing-value counts after the fill step.
filtered_dataset.isnull().sum(axis=0)

CustomerID              0
MiddleName              0
AddressLine1            0
StateProvinceName       0
CountryRegionName       0
PostalCode              0
BirthDate               0
Occupation              0
Gender                  0
MaritalStatus           0
HomeOwnerFlag           0
NumberCarsOwned         0
NumberChildrenAtHome    0
TotalChildren           0
YearlyIncome            0
dtype: int64

In [17]:
from sklearn.preprocessing import MinMaxScaler

# Normalisation: min-max scale YearlyIncome with MinMaxScaler's default settings.
# The scaler expects a 2-D input, so the column is selected with double
# brackets (giving shape (n_rows, 1)) and the result is flattened back to 1-D.
# The original YearlyIncome column is overwritten with the scaled values.
scaler = MinMaxScaler()
filtered_dataset['YearlyIncome'] = scaler.fit_transform(
    filtered_dataset[['YearlyIncome']].to_numpy()
).flatten()

In [18]:
# Display the frame after YearlyIncome has been min-max scaled.
filtered_dataset

Unnamed: 0,CustomerID,MiddleName,AddressLine1,StateProvinceName,CountryRegionName,PostalCode,BirthDate,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome
0,21173,C,7090 C. Mount Hood,New South Wales,Australia,2500,1987-11-13,Clerical,M,M,1,3,0,1,0.496842
1,13249,C,3651 Willow Lake Rd,British Columbia,Canada,V9B 2C3,1972-07-21,Clerical,M,M,1,2,1,2,0.489453
2,29350,C,1774 Tice Valley Blvd.,California,United States,91791,1985-11-09,Clerical,F,S,0,3,0,0,0.536172
3,13503,C,2103 Baldwin Dr,England,United Kingdom,L4 4HB,1977-10-18,Skilled Manual,M,M,1,2,1,2,0.317083
4,22803,J,Am Gallberg 234,Nordrhein-Westfalen,Germany,59368,1975-02-05,Skilled Manual,M,S,1,1,0,0,0.231958
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18356,25414,C,6627 Camelback Ct.,California,United States,92118,1990-11-11,Skilled Manual,F,M,0,1,0,2,0.242065
18357,11459,C,9627 Kendall Rd,New South Wales,Australia,2444,1992-10-13,Skilled Manual,F,S,0,2,0,0,0.312781
18358,12160,C,4364 Viera Avenue,Oregon,United States,97005,1983-11-24,Skilled Manual,F,S,0,2,0,0,0.232442
18359,14353,I,3866 Mt. Everest Court,British Columbia,Canada,V7L 4J4,1995-06-15,Clerical,F,S,0,0,0,0,0.543121
