In [5]:
import pandas as pd
from collections import OrderedDict        
from copy import deepcopy

In [52]:
# Demographic groups mapped to the bin labels each group is split into.
# These bin labels are the keys used in `priors` and in the per-site records.
# (Labels such as '18', '65' and '150k' presumably denote open-ended ranges --
# under 18, 65 and over, 150k and over -- TODO confirm.)
demogroups = {
	'age'      : ('18', '18_24', '25_34', '35_44', '45_54', '55_64', '65'),
	'education': ('College', 'Grad_School', 'No_College'),
	'gender'   : ('Female', 'Male'),
	'income'   : ('0-50k', '50-100k', '100-150k', '150k'),
	'kids'     : ('Has_Kids', 'No_Kids'),
	'race_US'  : ('Caucasian', 'African_American', 'Asian' , 'Hispanic')
}

# Baseline share of each bin within its group; calc_local_for_whole_page
# multiplies these by a site's index / 100 to get the site-local share.
# NOTE(review): the imputation step treats every group as summing to 1, but
# 'age' sums to 1.003 and 'income' to 0.995 -- confirm this is just rounding.
priors = {
    'age'       : {"25_34": 0.173, "18": 0.181, "55_64": 0.102, "65": 0.052, "18_24": 0.127, "45_54": 0.175, "35_44": 0.193},
    'education' : {'No_College': 0.447, 'College': 0.408, 'Grad_School': 0.145},
    'kids'      : {'Has_Kids': 0.507, 'No_Kids': 0.493},
    'income'    : {'0-50k': 0.512, '50-100k': 0.283, '100-150k': 0.118, '150k': 0.082},
    'race_US'   : {'Caucasian': 0.762, 'African_American': 0.095, 'Asian': 0.047 , 'Hispanic': 0.096},
    'gender'    : {'Male': 0.5,'Female': 0.5}
}    

In [6]:
# Collect every bin label of every demographic group as the keys of one
# ordered mapping; the values are None placeholders to be filled in later.
all_cats = OrderedDict()
# values() exists on both Python 2 and 3 (iteritems() is Python 2 only), and
# the group name itself was never used inside the loop.
for bins in demogroups.values():
    # Named bin_name rather than `bin` so the builtin bin() is not shadowed.
    for bin_name in bins:
        all_cats[bin_name] = None

In [7]:
# Flat list of every bin label, in all_cats order.
# list(...) keeps this a real list on Python 3, where keys() returns a view;
# on Python 2 it is simply a copy of the list keys() already returned.
cols = list(all_cats.keys())
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(cols)

['Has_Kids', 'No_Kids', 'Caucasian', 'African_American', 'Asian', 'Hispanic', 'Female', 'Male', '18', '18_24', '25_34', '35_44', '45_54', '55_64', '65', '0-50k', '50-100k', '100-150k', '150k', 'College', 'Grad_School', 'No_College']


In [69]:
# Sample per-site records used to exercise the calculations below.
# Each record holds the site 'name' plus one integer index per demographic bin;
# the indexes are fed to calc_local as index_value, where 100 leaves the prior
# unchanged. Every record also carries an 'Other' key that has no entry in
# `priors` and is therefore never read by calc_local_for_whole_page.
test_pages =[

    {"name": 'radaronline.com',"Male":45,"Female":153,"18":49,"18_24":52,"25_34":125,"35_44":143,"45_54":119,"55_64":113,"65":68,"No_Kids":82,"Has_Kids":118,"0-50k":89,"50-100k":111,"100-150k":110,"150k":112,"No_College":81,"College":120,"Grad_School":101,"Caucasian":92,"African_American":150,"Asian":62,"Hispanic":132,"Other":105},
    {"name": 'looper.com'    ,"Male":125,"Female":76,"18":82,"18_24":66,"25_34":96,"35_44":120,"45_54":114,"55_64":117,"65":103,"No_Kids":91,"Has_Kids":109,"0-50k":95,"50-100k":106,"100-150k":104,"150k":104,"No_College":84,"College":114,"Grad_School":111,"Caucasian":97,"African_American":119,"Asian":73,"Hispanic":116,"Other":104},
    {"name":'usmagazine.com' ,"Male":34,"Female":163,"18":51,"18_24":72,"25_34":141,"35_44":142,"45_54":105,"55_64":93,"65":53,"No_Kids":85,"Has_Kids":116,"0-50k":92,"50-100k":107,"100-150k":110,"150k":110,"No_College":80,"College":119,"Grad_School":107,"Caucasian":97,"African_American":99,"Asian":97,"Hispanic":125,"Other":98},
    {"name":'goodreads.com' ,"Male":60,"Female":138,"18":93,"18_24":128,"25_34":116,"35_44":99,"45_54":89,"55_64":86,"65":75,"No_Kids":104,"Has_Kids":95,"0-50k":101,"50-100k":102,"100-150k":97,"150k":94,"No_College":73,"College":112,"Grad_School":150,"Caucasian":104,"African_American":79,"Asian":103,"Hispanic":88,"Other":98},
    {"name": 'al.com'       ,"Male":133,"Female":68,"18":69,"18_24":64,"25_34":95,"35_44":112,"45_54":130,"55_64":125,"65":118,"No_Kids":95,"Has_Kids":105,"0-50k":89,"50-100k":108,"100-150k":122,"150k":109,"No_College":69,"College":120,"Grad_School":141,"Caucasian":104,"African_American":155,"Asian":43,"Hispanic":41,"Other":91}
]
    

In [40]:
# A single site record for quick experiments; it carries the same keys and
# values as the 'radaronline.com' entry of test_pages.
one_site = {"name": 'radaronline.com',"Male":45,"Female":153,"18":49,"18_24":52,"25_34":125,"35_44":143,"45_54":119,"55_64":113,"65":68,"No_Kids":82,"Has_Kids":118,"0-50k":89,"50-100k":111,"100-150k":110,"150k":112,"No_College":81,"College":120,"Grad_School":101,"Caucasian":92,"African_American":150,"Asian":62,"Hispanic":132,"Other":105}
 

In [46]:
# Scratch check of the rounding idiom: format a float with two decimals and
# parse the text back into a float.
float("{0:.2f}".format(5.66666))

5.67

In [120]:
def calc_local(pop_prob, index_value):
    """Scale a prior share by an index and round the result to two decimals.

    Args:
        pop_prob: prior share of a bin.
        index_value: index for that bin; 100 leaves the prior unchanged.

    Returns:
        float: pop_prob * (index_value / 100.0), rounded to two decimals by
        formatting the number and parsing it back.
    """
    return float("{0:.2f}".format(pop_prob * (index_value / 100.0)))


def calc_local_for_whole_page(indexes, priors_by_group=None):
    """Turn per-bin indexes for one page into per-bin local shares.

    Bins with a usable index get calc_local(prior, index). Within each group,
    bins without one share whatever is left of 1 after subtracting the group's
    known (already rounded) local shares, split in proportion to their priors.

    Args:
        indexes: mapping of bin label -> index. Keys that are not bin labels
            of the priors table are ignored. A falsy index (key absent, None
            or 0) is treated as missing and imputed.
        priors_by_group: optional mapping group -> {bin label: prior share}.
            Defaults to the module-level `priors`, which is what existing
            one-argument callers get.

    Returns:
        dict: bin label -> local share (float with two decimals) for every
        bin of every group in the priors table.
    """
    if priors_by_group is None:
        priors_by_group = priors

    local_values = {}
    missing = {}
    # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    for gr_name, group in priors_by_group.items():
        for category, percentage in group.items():
            index_value = indexes.get(category)
            if index_value:
                local_values[category] = calc_local(percentage, index_value)
            else:
                missing.setdefault(gr_name, []).append(category)

    for gr_name, cats in missing.items():
        # Mass still unassigned in this group. Subtract one value at a time,
        # in priors order, so the float result does not depend on grouping.
        local_sum_missing = 1.0
        for category in priors_by_group[gr_name]:
            known = local_values.get(category)
            if known:
                local_sum_missing -= known
        # Known shares can already add up to more than 1 (large indexes plus
        # rounding). Clamp so the imputed bins never get a negative share.
        local_sum_missing = max(0.0, local_sum_missing)

        prior_sum_missing = sum(priors_by_group[gr_name][category] for category in cats)

        # One common index for all missing bins of the group, chosen so their
        # scaled priors add up to the leftover mass.
        index_of_missing_mean = (local_sum_missing / prior_sum_missing) * 100

        for category in cats:
            local_values[category] = calc_local(priors_by_group[gr_name][category], index_of_missing_mean)

    return local_values
    

    
    
        
    

In [122]:
# Same site as one_site (radaronline.com), but passing only the indexes that
# are above 100; the bins left out are imputed from the leftover mass of
# their group.
calc_local_for_whole_page({"Female":153,"25_34":125,"35_44":143,"45_54":119,"55_64":113,"Has_Kids":118,"50-100k":111,"100-150k":110,"150k":112,"College":120,"Grad_School":101,"African_American":150,"Hispanic":132,"Other":105}
 )

{'0-50k': 0.47,
 '100-150k': 0.13,
 '150k': 0.09,
 '18': 0.09,
 '18_24': 0.06,
 '25_34': 0.22,
 '35_44': 0.28,
 '45_54': 0.21,
 '50-100k': 0.31,
 '55_64': 0.12,
 '65': 0.02,
 'African_American': 0.14,
 'Asian': 0.04,
 'Caucasian': 0.69,
 'College': 0.49,
 'Female': 0.77,
 'Grad_School': 0.15,
 'Has_Kids': 0.6,
 'Hispanic': 0.13,
 'Male': 0.23,
 'No_College': 0.36,
 'No_Kids': 0.4}

In [111]:
# Local shares for one_site with every index supplied (nothing is imputed).
calc_local_for_whole_page(one_site)

{'0-50k': 0.46,
 '100-150k': 0.13,
 '150k': 0.09,
 '18': 0.09,
 '18_24': 0.07,
 '25_34': 0.22,
 '35_44': 0.28,
 '45_54': 0.21,
 '50-100k': 0.31,
 '55_64': 0.12,
 '65': 0.04,
 'African_American': 0.14,
 'Asian': 0.03,
 'Caucasian': 0.7,
 'College': 0.49,
 'Female': 0.77,
 'Grad_School': 0.15,
 'Has_Kids': 0.6,
 'Hispanic': 0.13,
 'Male': 0.23,
 'No_College': 0.36,
 'No_Kids': 0.4}

In [109]:
def calc_for_many(pages):
    """Compute the local shares of several pages.

    Args:
        pages: iterable of page records, each with a 'name' key plus the
            indexes understood by calc_local_for_whole_page.

    Returns:
        list: one dict per page, in input order, holding the result of
        calc_local_for_whole_page with the page's 'name' added to it.
    """
    def _named_locals(page):
        # Read the name first so a record without 'name' fails before any
        # computation is done.
        site_name = page['name']
        local_values = calc_local_for_whole_page(page)
        local_values['name'] = site_name
        return local_values

    return [_named_locals(page) for page in pages]
    

In [126]:
# Bins of one_site whose index is at or below 100.
# 'name' is skipped explicitly: its value is a string, and comparing a str
# with an int raises TypeError on Python 3 (on Python 2 the comparison was
# simply False, so the entry was dropped either way). items() also exists on
# both versions, unlike iteritems().
{k: v for k, v in one_site.items() if k != 'name' and v <= 100}

{'0-50k': 89,
 '18': 49,
 '18_24': 52,
 '65': 68,
 'Asian': 62,
 'Caucasian': 92,
 'Male': 45,
 'No_College': 81,
 'No_Kids': 82}

In [139]:
def calc_for_many_missing(pages):
    """Compare local shares from all indexes with shares imputed from a subset.

    For every page the local shares are computed twice: once from the full
    record, and once from only the indexes that are >= 100, so every bin below
    100 has to be imputed.

    Args:
        pages: iterable of page records, each with a 'name' key plus per-bin
            indexes.

    Returns:
        list: one dict per page. Each bin label maps to
        [local, imputed, relative_difference, original_index], where
        relative_difference is (local - imputed) / local rounded to two
        decimals, or NaN when local is 0. original_index is None when the
        page does not list that bin. The page's 'name' is stored under 'name'.
    """
    results = []
    for page in pages:
        name = page['name']
        result = calc_local_for_whole_page(page)

        # Keep only the indexes >= 100. 'name' is skipped explicitly because
        # comparing its string value with an int raises TypeError on Python 3.
        page_no_negatives = {k: v for k, v in page.items() if k != 'name' and v >= 100}
        imputed = calc_local_for_whole_page(page_no_negatives)

        compared = {}
        for k, v in result.items():
            if v:
                relative_diff = float("{0:.2f}".format((v - imputed[k]) / v))
            else:
                # A local share that rounded to 0.0 would divide by zero.
                relative_diff = float('nan')
            # .get(): a bin missing from the page is still present in result
            # (it was imputed), so plain indexing would raise KeyError.
            compared[k] = [v, imputed[k], relative_diff, page.get(k)]
        compared['name'] = name
        results.append(compared)

    return results

In [140]:
# Per site and bin: [local share, imputed share, relative difference, original index].
calc_for_many_missing(test_pages)

[{'0-50k': [0.46, 0.47, -0.02, 89],
  '100-150k': [0.13, 0.13, 0.0, 110],
  '150k': [0.09, 0.09, 0.0, 112],
  '18': [0.09, 0.09, 0.0, 49],
  '18_24': [0.07, 0.06, 0.14, 52],
  '25_34': [0.22, 0.22, 0.0, 125],
  '35_44': [0.28, 0.28, 0.0, 143],
  '45_54': [0.21, 0.21, 0.0, 119],
  '50-100k': [0.31, 0.31, 0.0, 111],
  '55_64': [0.12, 0.12, 0.0, 113],
  '65': [0.04, 0.02, 0.5, 68],
  'African_American': [0.14, 0.14, 0.0, 150],
  'Asian': [0.03, 0.04, -0.33, 62],
  'Caucasian': [0.7, 0.69, 0.01, 92],
  'College': [0.49, 0.49, 0.0, 120],
  'Female': [0.77, 0.77, 0.0, 153],
  'Grad_School': [0.15, 0.15, 0.0, 101],
  'Has_Kids': [0.6, 0.6, 0.0, 118],
  'Hispanic': [0.13, 0.13, 0.0, 132],
  'Male': [0.23, 0.23, 0.0, 45],
  'No_College': [0.36, 0.36, 0.0, 81],
  'No_Kids': [0.4, 0.4, 0.0, 82],
  'name': 'radaronline.com'},
 {'0-50k': [0.49, 0.49, 0.0, 95],
  '100-150k': [0.12, 0.12, 0.0, 104],
  '150k': [0.09, 0.09, 0.0, 104],
  '18': [0.15, 0.15, 0.0, 82],
  '18_24': [0.08, 0.11, -0.38, 66],
 

In [5]:
from collections import OrderedDict
# Collect every bin label of every demographic group as the keys of one
# ordered mapping; the values are None placeholders to be filled in later.
all_cats = OrderedDict()
# values() exists on both Python 2 and 3 (iteritems() is Python 2 only), and
# the group name itself was never used inside the loop.
for bins in demogroups.values():
    # Named bin_name rather than `bin` so the builtin bin() is not shadowed.
    for bin_name in bins:
        all_cats[bin_name] = None
        
from copy import deepcopy


In [2]:
# Inspect the collected bin labels.
# NOTE: the recorded output is a NameError -- in that session this cell ran
# before any cell that defines all_cats, so it fails on a fresh kernel unless
# the defining cell is executed first.
all_cats.keys()

NameError: name 'all_cats' is not defined

In [12]:
import csv
def loadModel (model_file):
    """Read a tab-separated model file into a dict.

    Every line that splits into exactly two fields contributes one entry: the
    second field is the key and the first field, converted with int(), is the
    value. Lines with any other number of fields are ignored. When a key
    occurs more than once, the last occurrence wins.

    Args:
        model_file: path of the file to read.

    Returns:
        dict: second field -> int(first field).
    """
    with open(model_file,'r') as handle:
        rows = csv.reader(handle, delimiter = '\t')
        return {row[1]: int(row[0]) for row in rows if len(row) == 2}

In [10]:

# Directory holding one model file per bin (read by get_all_sites).
# NOTE(review): hardcoded absolute path into one user's home directory; the
# notebook will not run elsewhere without editing this line.
dir_path = '/Users/mpio/Dropbox/Apps/openPDS.Digital-Halo/model/'

# domain -> {bin label: value}; filled in place by get_all_sites().
all_sites = {}

def get_all_sites () :
    """Fill the module-level all_sites from one model file per bin.

    For every bin label in all_cats, the file
    dir_path + <bin label> + '.txt-goog.txt' is loaded with loadModel. Each
    domain found gets a copy of all_cats (all values None) the first time it
    is seen, and the loaded value is stored under the current bin label.

    Returns:
        None. all_sites is updated in place, so calling this again reloads the
        files and overwrites the stored values.
    """
    # Named bin_name rather than `bin` so the builtin bin() is not shadowed.
    for bin_name in all_cats:
        cf = loadModel(dir_path + bin_name + '.txt-goog.txt')

        # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
        for domain, cn in cf.items():
            if domain not in all_sites:
                # Start from the full set of bins so every domain has every key.
                all_sites[domain] = deepcopy(all_cats)

            all_sites[domain][bin_name] = cn

In [83]:
# NOTE(review): neither `df` nor `ass` is defined in any cell visible here, and
# `df` is reassigned from itself, so running this cell again would presumably
# append the same rows a second time.
# `df` is presumably a pandas DataFrame; DataFrame.append was removed in
# pandas 2.0 in favour of pd.concat -- confirm the type of `ass` before porting.
df = df.append(ass, ignore_index=True)

In [88]:
# Use the 'name' column as the row index.
# NOTE(review): `df` is reassigned from itself; a second run will presumably
# fail because 'name' is no longer a column by then -- confirm.
df = df.set_index('name')