In [1]:
import pandas as pd
import matplotlib.pylab as pl
import matplotlib.patches as patches

In [2]:
# dataframe 显示所有数据
pd.set_option('display.max_rows', 50)

In [3]:
# 属性列名
names = (
    'age',
    'workclass', 
    'fnlwgt', 
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
)
# 分类数据
categorical = set((
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'sex',
    'native-country',
    'race',
    'income',
))

In [4]:
df = pd.read_csv("adult.data.txt", sep=", ", header=None, names=names, index_col=False, engine='python')

In [5]:
df.head(80)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50k
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50k
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50k
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50k
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50k
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,27,Private,213921,HS-grad,9,Never-married,Other-service,Own-child,White,Male,0,0,40,Mexico,<=50k
76,40,Private,32214,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,<=50k
77,67,?,212759,10th,6,Married-civ-spouse,?,Husband,White,Male,0,0,2,United-States,<=50k
78,18,Private,309634,11th,7,Never-married,Other-service,Own-child,White,Female,0,0,22,United-States,<=50k


In [6]:
#astype()方法显式地把一种数据类型转换为另一种
for name in categorical:
    df[name] = df[name].astype('category')

In [7]:
# 计算分区中所有列的跨度，如果给定了规模，那么每列的跨度值会被该列的规模除，得到的就是相对跨度
def get_spans(df, partition, scale=None):
    spans = {}
    for column in df.columns:
        if column in categorical:
            span = len(df[column][partition].unique())
        else:
            span = df[column][partition].max()-df[column][partition].min()
        if scale is not None:
            span = span/scale[column]
        spans[column] = span
    return spans

In [8]:
full_spans = get_spans(df, df.index)

In [9]:
full_spans

{'age': 73,
 'workclass': 9,
 'fnlwgt': 1472420,
 'education': 16,
 'education-num': 15,
 'marital-status': 7,
 'occupation': 15,
 'relationship': 6,
 'race': 5,
 'sex': 2,
 'capital-gain': 99999,
 'capital-loss': 4356,
 'hours-per-week': 98,
 'native-country': 42,
 'income': 2}

In [10]:
def split(df, partition, column):
    
    dfp = df[column][partition]
    if column in categorical:
        values = dfp.unique()
        lv = set(values[:len(values)//2])
        rv = set(values[len(values)//2:])
        return dfp.index[dfp.isin(lv)], dfp.index[dfp.isin(rv)]
    else:        
        median = dfp.median()
        dfl = dfp.index[dfp < median]
        dfr = dfp.index[dfp >= median]
        return (dfl, dfr)

In [11]:
def is_k_anonymous(df, partition, sensitive_column, k=5):
   
    if len(partition) < k:
        return False
    return True

def partition_dataset(df, feature_columns, sensitive_column, scale, is_valid):
    
    finished_partitions = []
    partitions = [df.index]
    while partitions:
        partition = partitions.pop(0)
        spans = get_spans(df[feature_columns], partition, scale)
        for column, span in sorted(spans.items(), key=lambda x:-x[1]):
            lp, rp = split(df, partition, column)
            if not is_valid(df, lp, sensitive_column) or not is_valid(df, rp, sensitive_column):
                continue
            partitions.extend((lp, rp))
            break
        else:
            finished_partitions.append(partition)
    return finished_partitions

In [12]:
feature_columns = [
    'age',
    'workclass',
    'fnlwgt', 
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country'
]
sensitive_column = 'income'
finished_partitions = partition_dataset(df, feature_columns, sensitive_column, full_spans, is_k_anonymous)
print("OK")

OK


In [13]:
len(finished_partitions)

5074

In [14]:
def agg_categorical_column(series):
    return [','.join(set(series))]

def agg_numerical_column(series):
    return series.mean()

In [15]:
def build_anonymized_dataset(df, partitions, feature_columns, sensitive_column, max_partitions=None):
    aggregations = {}
    for column in feature_columns:
        if column in categorical:
            aggregations[column] = agg_categorical_column
        else:
            aggregations[column] = agg_numerical_column
    rows = []
    for i, partition in enumerate(partitions):
        if i % 100 == 1:
            print("Finished {} partitions...".format(i))
        if max_partitions is not None and i > max_partitions:
            break
        grouped_columns = df.loc[partition].agg(aggregations, squeeze=False)
        sensitive_counts = df.loc[partition].groupby(sensitive_column).agg({sensitive_column : 'count'})
        #xdu_qinian insert
        df2=grouped_columns.to_frame()
        grouped_columns=pd.DataFrame(df2.values.T,columns=df2.index)
        #insert_end
        values = grouped_columns.iloc[0].to_dict()
        for sensitive_value, count in sensitive_counts[sensitive_column].items():
            if count == 0:
                continue
            values.update({
                sensitive_column : sensitive_value,
                'count' : count,

            })
            rows.append(values.copy())
    return pd.DataFrame(rows)

In [16]:
dfn = build_anonymized_dataset(df, finished_partitions, feature_columns, sensitive_column)

Finished 1 partitions...
Finished 101 partitions...
Finished 201 partitions...
Finished 301 partitions...
Finished 401 partitions...
Finished 501 partitions...
Finished 601 partitions...
Finished 701 partitions...
Finished 801 partitions...
Finished 901 partitions...
Finished 1001 partitions...
Finished 1101 partitions...
Finished 1201 partitions...
Finished 1301 partitions...
Finished 1401 partitions...
Finished 1501 partitions...
Finished 1601 partitions...
Finished 1701 partitions...
Finished 1801 partitions...
Finished 1901 partitions...
Finished 2001 partitions...
Finished 2101 partitions...
Finished 2201 partitions...
Finished 2301 partitions...
Finished 2401 partitions...
Finished 2501 partitions...
Finished 2601 partitions...
Finished 2701 partitions...
Finished 2801 partitions...
Finished 2901 partitions...
Finished 3001 partitions...
Finished 3101 partitions...
Finished 3201 partitions...
Finished 3301 partitions...
Finished 3401 partitions...
Finished 3501 partitions...
Fini

In [17]:
dfn=dfn.apply(lambda x: x.explode().astype(str).groupby(level=0).agg(", ".join))

In [18]:
dfn

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,count
0,25.875,"Self-emp-inc,?,Never-worked",190725.625,"Some-college,7th-8th,10th,HS-grad",8.375,"Divorced,Separated","Prof-specialty,Adm-clerical,?","Own-child,Unmarried,Not-in-family,Other-relative","Amer-Indian-Eskimo,White",Male,0.0,0.0,29.125,United-States,<=50k,8
1,27.0,"Local-gov,?",219007.6,"5th-6th,9th",4.2,"Married-civ-spouse,Never-married","Craft-repair,?,Handlers-cleaners","Other-relative,Husband","Black,White",Male,0.0,343.8,36.0,"Trinadad&Tobago,El-Salvador,Mexico,United-Stat...",<=50k,3
2,27.0,"Local-gov,?",219007.6,"5th-6th,9th",4.2,"Married-civ-spouse,Never-married","Craft-repair,?,Handlers-cleaners","Other-relative,Husband","Black,White",Male,0.0,343.8,36.0,"Trinadad&Tobago,El-Salvador,Mexico,United-Stat...",>50k,2
3,29.166666666666668,"Local-gov,?",213259.16666666666,"5th-6th,9th",4.0,"Separated,Divorced,Never-married","?,Other-service",Unmarried,"Black,Other,White",Female,0.0,0.0,39.333333333333336,"United-States,El-Salvador,Mexico",<=50k,6
4,42.0,"Self-emp-not-inc,Local-gov,Private",153341.88888888888,"Some-college,Bachelors,9th,HS-grad,11th",9.777777777777779,"Married-spouse-absent,Married-AF-spouse,Separa...","Handlers-cleaners,Prof-specialty,Transport-mov...","Own-child,Husband","Asian-Pac-Islander,Black,Other,White",Male,810.8888888888889,0.0,41.77777777777778,"United-States,?,Mexico",<=50k,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7699,43.2,Private,113757.2,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,620.6,0.0,40.0,United-States,>50k,4
7700,42.0,Private,152689.2,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,1663.0,0.0,40.0,United-States,<=50k,1
7701,42.0,Private,152689.2,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,1663.0,0.0,40.0,United-States,>50k,4
7702,43.6,Private,146092.8,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,1459.6,0.0,40.0,United-States,<=50k,1


In [19]:
dfn.sort_values(feature_columns+[sensitive_column])

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,count
3371,17.0,?,126071.66666666667,10th,6.0,Never-married,?,Own-child,White,Male,0.0,0.0,40.0,United-States,<=50k,6
1755,17.0,Private,168794.4,10th,6.0,Never-married,Other-service,Own-child,White,Female,0.0,0.0,13.2,United-States,<=50k,5
1772,17.0,Private,199233.0,11th,7.0,Never-married,Sales,Own-child,White,Female,0.0,320.4,14.2,United-States,<=50k,5
1208,17.0,Private,236435.66666666666,10th,6.0,Never-married,Other-service,Own-child,White,Male,0.0,0.0,19.0,United-States,<=50k,6
1767,17.0,Private,69810.8,11th,7.0,Never-married,"Sales,Adm-clerical",Own-child,White,Female,422.0,0.0,19.8,"United-States,India",<=50k,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2624,79.83333333333333,"Self-emp-not-inc,Local-gov",128250.5,"HS-grad,Some-college,11th",8.833333333333334,"Divorced,Married-civ-spouse,Never-married","Craft-repair,Sales,Exec-managerial","Not-in-family,Husband",White,Male,885.0,0.0,10.5,United-States,<=50k,5
2625,79.83333333333333,"Self-emp-not-inc,Local-gov",128250.5,"HS-grad,Some-college,11th",8.833333333333334,"Divorced,Married-civ-spouse,Never-married","Craft-repair,Sales,Exec-managerial","Not-in-family,Husband",White,Male,885.0,0.0,10.5,United-States,>50k,1
5134,81.0,"Local-gov,?",166264.16666666666,HS-grad,9.0,Married-civ-spouse,"?,Other-service",Husband,White,Male,1127.8333333333333,0.0,40.0,"Canada,Cuba,United-States",<=50k,3
5135,81.0,"Local-gov,?",166264.16666666666,HS-grad,9.0,Married-civ-spouse,"?,Other-service",Husband,White,Male,1127.8333333333333,0.0,40.0,"Canada,Cuba,United-States",>50k,3
