# Sanitizing data correctly

## Considering the current dataset

In [1]:
import pandas as pd

# Read the poll responses into a DataFrame and print the whole frame so the
# starting state is visible. Note that this printout includes the
# respondents' names and addresses.
sanitize_df = pd.read_csv("PollData.csv")
print(sanitize_df)

       Date FirstName   LastName                          PrimaryAddress  Age  \
0  01/15/23    George      Smith        123 Anywhere, Anywhere, WI 59999   44   
1  01/15/23     Sally      Jones      123 Somewhere, Somewhere, NV 89503   32   
2  01/15/23     Renee     Walker          123 Nowhere, Nowhere, CA 90011   49   
3  01/16/23   Saniago  Dominguez      123 Downthere, Downthere, MN 55144   50   
4  01/16/23  Abdullah      Brown          123 Upthere, Upthere, FL 33052    0   
5  01/16/23    Fenhua       Yang  123 Aroundthere, Aroundthere, NY 10008   89   

   Income Gender ProductColor  Price  Gadget1  Gadget2  
0   40000      M         Blue    199     True    False  
1   80000      F          Red    169    False    False  
2   60000      X       Purple    179     True     True  
3  100000      N        Green    209     True     True  
4   60000      M       Yellow    179    False     True  
5   80000      N          Red    199     True     True  


## Removing personally identifiable information

In [2]:
# Give each row a surrogate key (0, 1, 2, ... in row order). The key is what
# ties a row's answers to the identifying details once those are split off.
ids = range(len(sanitize_df))
sanitize_df["Id"] = ids

In [3]:
# Copy the identifying columns, together with the Id key, into their own
# table so they can be stored apart from the survey answers.
saved_df = sanitize_df[[
    "Id", "FirstName", 
    "LastName", "PrimaryAddress"]]
print(saved_df)

# index=False: the default row index merely repeats Id here, and writing it
# would add an extra unnamed column to the CSV file.
saved_df.to_csv("SavedData.csv", index=False)

   Id FirstName   LastName                          PrimaryAddress
0   0    George      Smith        123 Anywhere, Anywhere, WI 59999
1   1     Sally      Jones      123 Somewhere, Somewhere, NV 89503
2   2     Renee     Walker          123 Nowhere, Nowhere, CA 90011
3   3   Saniago  Dominguez      123 Downthere, Downthere, MN 55144
4   4  Abdullah      Brown          123 Upthere, Upthere, FL 33052
5   5    Fenhua       Yang  123 Aroundthere, Aroundthere, NY 10008


In [4]:
# The names and addresses have been copied into saved_df, so strip them from
# the working frame (in place); Id stays behind as the link between the two.
sanitize_df.drop(
    columns=["FirstName", "LastName", "PrimaryAddress"],
    inplace=True)
print(sanitize_df)

       Date  Age  Income Gender ProductColor  Price  Gadget1  Gadget2  Id
0  01/15/23   44   40000      M         Blue    199     True    False   0
1  01/15/23   32   80000      F          Red    169    False    False   1
2  01/15/23   49   60000      X       Purple    179     True     True   2
3  01/16/23   50  100000      N        Green    209     True     True   3
4  01/16/23    0   60000      M       Yellow    179    False     True   4
5  01/16/23   89   80000      N          Red    199     True     True   5


### Adding traits together to make them less identifiable

In [5]:
# An Age of 0 is treated as "not supplied" and replaced by the column mean.
# NOTE(review): the mean is computed over every row, including the 0
# placeholders that are about to be replaced, which pulls the fill value
# toward zero -- confirm this is intended.
averageAge = sanitize_df['Age'].mean()
sanitize_df['Age'] = sanitize_df['Age'].map(
    lambda age: averageAge if age == 0 else age)
print(sanitize_df)

       Date   Age  Income Gender ProductColor  Price  Gadget1  Gadget2  Id
0  01/15/23  44.0   40000      M         Blue    199     True    False   0
1  01/15/23  32.0   80000      F          Red    169    False    False   1
2  01/15/23  49.0   60000      X       Purple    179     True     True   2
3  01/16/23  50.0  100000      N        Green    209     True     True   3
4  01/16/23  44.0   60000      M       Yellow    179    False     True   4
5  01/16/23  89.0   80000      N          Red    199     True     True   5


In [6]:
def AgeLevel(Age):
    """Map an age to the numeric code of its age bracket.

    Brackets and codes: under 20 -> 1, 20s -> 8, 30s -> 15, 40s -> 22,
    50s -> 29, 60s -> 36, 70 and over -> 43.

    Bracket edges are half-open ([20, 30), [30, 40), ...), so fractional
    ages such as 19.5 or 29.9 -- which can appear once a missing age has
    been replaced by a mean -- still land in a bracket instead of falling
    into the gap between, say, 19 and 20.

    Returns None when Age is below 1 or is NaN, because no bracket applies.
    """
    # Written as "not >=" so that NaN, which fails every comparison, is
    # rejected along with values below 1.
    if not Age >= 1:
        return None

    # (exclusive upper bound, code) pairs in ascending order; the first
    # bound the age falls under decides the code.
    for upper, code in ((20, 1), (30, 8), (40, 15),
                        (50, 22), (60, 29), (70, 36)):
        if Age < upper:
            return code

    # Everything from 70 upward shares the top bracket.
    return 43

In [7]:
def IncomeLevel(Income):
    """Map an income to the code of its 20,000-wide bracket.

    0-19,999 -> 0, 20,000-39,999 -> 1, ..., 100,000-119,999 -> 5, and
    120,000 or more -> 6. Exact multiples of 20,000 up to 120,000 keep the
    codes 0 through 6; incomes that fall between those values are assigned
    to the bracket they fall in rather than producing None.

    Returns None when Income is negative or NaN, because no bracket applies.
    """
    # Written as "not >=" so that NaN, which fails every comparison, is
    # rejected along with negative values.
    if not Income >= 0:
        return None

    # Top-code first: keeps the result within 0-6 and avoids floor-dividing
    # an infinite value.
    if Income >= 120000:
        return 6

    # int() so the result is a plain Python int even for NumPy scalars.
    return int(Income // 20000)

In [8]:
def GroupValue(Age = 1, Income = 0):
    """Combine an age and an income into one group code.

    The code is the sum of AgeLevel(Age) and IncomeLevel(Income). If either
    helper returns None for its input, the addition raises TypeError.
    """
    age_code = AgeLevel(Age)
    income_code = IncomeLevel(Income)
    return age_code + income_code

In [9]:
# Score every respondent: pair each row's Age with its Income and convert
# the pair into a single group code, keeping the results in row order.
GroupList = list(map(
    GroupValue, sanitize_df['Age'], sanitize_df['Income']))
print(GroupList)

[24, 19, 25, 34, 25, 47]


In [10]:
# Attach the group codes as a new column. GroupList was built by walking the
# rows in order, so its values line up with the rows positionally.
sanitize_df['Group'] = GroupList
print(sanitize_df)

       Date   Age  Income Gender ProductColor  Price  Gadget1  Gadget2  Id  \
0  01/15/23  44.0   40000      M         Blue    199     True    False   0   
1  01/15/23  32.0   80000      F          Red    169    False    False   1   
2  01/15/23  49.0   60000      X       Purple    179     True     True   2   
3  01/16/23  50.0  100000      N        Green    209     True     True   3   
4  01/16/23  44.0   60000      M       Yellow    179    False     True   4   
5  01/16/23  89.0   80000      N          Red    199     True     True   5   

   Group  
0     24  
1     19  
2     25  
3     34  
4     25  
5     47  


In [11]:
# Age and Income have been folded into Group, so remove the raw values from
# the working frame (in place).
sanitize_df.drop(columns=["Age", "Income"], inplace=True)
print(sanitize_df)

       Date Gender ProductColor  Price  Gadget1  Gadget2  Id  Group
0  01/15/23      M         Blue    199     True    False   0     24
1  01/15/23      F          Red    169    False    False   1     19
2  01/15/23      X       Purple    179     True     True   2     25
3  01/16/23      N        Green    209     True     True   3     34
4  01/16/23      M       Yellow    179    False     True   4     25
5  01/16/23      N          Red    199     True     True   5     47


## Eliminating unnecessary features

In [12]:
# Remove the Gender column from the working frame (in place).
sanitize_df.drop(columns=["Gender"], inplace=True)
print(sanitize_df)

       Date ProductColor  Price  Gadget1  Gadget2  Id  Group
0  01/15/23         Blue    199     True    False   0     24
1  01/15/23          Red    169    False    False   1     19
2  01/15/23       Purple    179     True     True   2     25
3  01/16/23        Green    209     True     True   3     34
4  01/16/23       Yellow    179    False     True   4     25
5  01/16/23          Red    199     True     True   5     47
