In [40]:
import pandas as pd
%pylab inline

Populating the interactive namespace from numpy and matplotlib


# Q1: Read in Data

In [41]:
# Both inputs are tab-separated files; column names are supplied here and
# header=None states explicitly that the first line of each file is data.
gold = pd.read_csv(
    'gold.txt',
    sep='\t',
    header=None,
    names=['url', 'category'],
)
label = pd.read_csv(
    'labels.txt',
    sep='\t',
    header=None,
    names=['turk', 'url', 'category'],
)

In [42]:
# Show the first gold record to confirm the two columns parsed as expected.
gold.loc[0]

url         http://0800-horoscope.com
category                            G
Name: 0, dtype: object

In [43]:
# Show the first turk label record (turk id, url, assigned category).
label.loc[0]

turk        A1OT3A29R9N1DG
url          http://000.cc
category                 P
Name: 0, dtype: object

# Q2: Split into two DataFrames

In [44]:
# True for every label row whose URL also appears in the gold set.
url_in_gold = label['url'].isin(gold['url'])

# Labels on gold URLs versus labels on URLs with no gold category; each part
# gets a fresh 0..n-1 index.
labels_on_gold = label.loc[url_in_gold].reset_index(drop=True)
labels_unknown = label.loc[~url_in_gold].reset_index(drop=True)

In [45]:
# .loc slicing is label-based and includes the end label, so this shows rows 0 through 5.
labels_unknown.loc[:5]

Unnamed: 0,turk,url,category
0,A1OT3A29R9N1DG,http://000.cc,P
1,A1PXXEOGQ76RNJ,http://000.cc,G
2,A1PXXEOGQ76RNJ,http://000.cc,G
3,A21US576U8SCO4,http://000.cc,G
4,A2LGX47NN7C5D3,http://000.cc,G
5,A2OVKTB7VNY8EW,http://000.cc,G


In [46]:
# .loc slicing is label-based and includes the end label, so this shows rows 0 through 5.
labels_on_gold.loc[:5]

Unnamed: 0,turk,url,category
0,A1253FXHCZ9CWM,http://0800-horoscope.com,G
1,A153PKAL7OAY36,http://0800-horoscope.com,G
2,A1FV9SAPL5C6KY,http://0800-horoscope.com,G
3,A1JTOT0DWM6QGL,http://0800-horoscope.com,G
4,A1PXXEOGQ76RNJ,http://0800-horoscope.com,G
5,A21US576U8SCO4,http://0800-horoscope.com,G


# Q3:  Compute accuracies of turks

In [58]:
#create a df that is intersection of gold and lobels on gold df
# Pair every turk label on a gold URL with that URL's gold category. Both
# frames carry a 'category' column, so the suffixes tell the two apart.
# The flag column records whether the turk's label matches the gold one.
merged_df = labels_on_gold.merge(
    gold,
    on="url",
    suffixes=["_label", "_gold"],
).assign(
    is_label_correct=lambda pairs: pairs['category_gold'] == pairs['category_label']
)

# Per turk: the mean of the boolean flag is the fraction of correct labels,
# and the count is the number of labels that turk gave on gold URLs.
rater_goodness = merged_df.groupby('turk')['is_label_correct'].agg(['mean', 'count'])

rater_goodness.head()

Unnamed: 0_level_0,mean,count
turk,Unnamed: 1_level_1,Unnamed: 2_level_1
A112DVP1KG4QZU,1.0,1
A1253FXHCZ9CWM,0.517241,29
A12CY1Q7XKJJDE,1.0,1
A12RE8G66WTO8B,0.75,20
A12Y1GTGIQDGRA,0.333333,3


# Q4: Odds ratios

In [60]:
# Odds of a correct label: accuracy / (1.001 - accuracy). Using 1.001 rather
# than 1 keeps the denominator non-zero for turks with accuracy 1.0, whose
# odds therefore come out as 1000.
rater_goodness['odds'] = rater_goodness['mean'].div(1.001 - rater_goodness['mean'])

rater_goodness.head()

Unnamed: 0_level_0,mean,count,odds
turk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A112DVP1KG4QZU,1.0,1,1000.0
A1253FXHCZ9CWM,0.517241,29,1.069214
A12CY1Q7XKJJDE,1.0,1,1000.0
A12RE8G66WTO8B,0.75,20,2.988048
A12Y1GTGIQDGRA,0.333333,3,0.499251


# Q5:  Most accurate turks

In [62]:
# Ten turks with the highest odds (ties are left in whatever order the sort yields).
rater_goodness.sort_values(by="odds", ascending=False).head(10)

Unnamed: 0_level_0,mean,count,odds
turk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A112DVP1KG4QZU,1.0,1,1000.0
A24ULCUOSCOJXC,1.0,1,1000.0
A3BH9WUYDK3LTT,1.0,2,1000.0
A3AY01XG3S0N6G,1.0,3,1000.0
A3A3J87AG178Z,1.0,2,1000.0
A39VY8MOYLYENC,1.0,1,1000.0
A39MWGZ6R4VKZ2,1.0,1,1000.0
A397RDM7QLZEG1,1.0,2,1000.0
A37FYSR72FJ5HV,1.0,1,1000.0
A361QEE8R11VE1,1.0,4,1000.0


# Q6: Rating counts versus accuracy

In [77]:
# Bucket turks into three quantile-based bins of their odds; the labels run
# from the lowest-odds bin to the highest.
rater_goodness['level'] = pd.qcut(rater_goodness['odds'], q = 3, labels=['low', 'medium', 'high'])

# 'level' is categorical. Grouping on a categorical key without stating
# `observed` relies on a default that newer pandas versions deprecate and
# plan to flip, so pin the current behaviour (keep all three levels) explicitly.
rater_goodness.groupby('level', observed=False)['count'].mean()

level
low        7.833333
medium    26.230769
high       2.636364
Name: count, dtype: float64

# Q7: Overall predicted odds

In [140]:
# Attach each turk's gold-set statistics (the columns of rater_goodness) to
# every label on a non-gold URL. The left join keeps labels from turks that
# have no row in rater_goodness; their statistics come through as NaN.
label_unknown_rating = labels_unknown.merge(
    rater_goodness,
    how='left',
    on='turk',
    suffixes=['_label', '_rater'],
)



def mul_top_qualite(df, quantile):
    """Multiply the odds of the rows whose count lies above a quantile cut-off.

    Parameters
    ----------
    df : DataFrame with 'count' and 'odds' columns.
    quantile : float passed to ``Series.quantile`` to derive the cut-off
        from ``df['count']``.

    Returns
    -------
    Product of ``df['odds']`` over the rows whose 'count' is strictly
    greater than the cut-off. When no row qualifies the result is the
    empty product, 1.0.
    """
    # NOTE(review): the cut-off is derived from the rows of `df` itself (the
    # group being processed), and the comparison is strict, so a single-row
    # group or a group whose counts are all equal selects nothing and yields
    # 1.0 -- confirm this is the intended rule.
    cutoff = df['count'].quantile(quantile)
    above_cutoff = df['count'] > cutoff
    return df.loc[above_cutoff, 'odds'].product()

# For every (url, category) pair, multiply the odds of the turks whose gold
# label count lies strictly above the 0.75 quantile of the counts within
# that pair's own group.
url_category_conf = (
    label_unknown_rating
    .groupby(['url', 'category'])[['count', 'odds']]
    .apply(mul_top_qualite, quantile=0.75)
)
url_category_conf

url                          category
http://0-101.net             G           1.330229
http://000.cc                G           1.330229
                             P           1.000000
http://0000.jp               G           2.779429
http://000relationships.com  G           1.460583
                                           ...   
http://zz868.com             G           1.047852
                             P           1.000000
http://zzx.cc                G           1.000000
                             P           1.000000
                             X           1.000000
Length: 19116, dtype: float64

# Q8: Predicted categories

In [189]:
def best_category(df):
    """Pick the category with the highest odds.

    Parameters
    ----------
    df : DataFrame with 'category' and 'odds' columns.

    Returns
    -------
    Tuple ``(category, odds)`` for the row holding the maximum 'odds'.
    If several rows tie for the maximum, the first one in row order wins.
    """
    top_odds = df['odds'].max()
    winners = df.loc[df['odds'] == top_odds, 'category']
    return winners.iloc[0], top_odds

In [192]:
# Flatten the (url, category) -> odds Series into a plain three-column frame
# with columns url, category, odds.
url_category_conf_df = url_category_conf.reset_index(name='odds')

# For each URL keep the category with the largest odds product; best_category
# returns one (category, odds) tuple per URL.
best_per_url_75 = url_category_conf_df.groupby('url')[['category', 'odds']].apply(best_category)

# Unpack the tuples into two named columns, still indexed by URL.
result_75 = pd.DataFrame(
    best_per_url_75.tolist(),
    index=best_per_url_75.index,
    columns=['best_category', 'best_odds'],
)

In [193]:
# Per-URL best category and its odds product, from the 0.75-quantile run.
result_75

Unnamed: 0_level_0,best_category,best_odds
url,Unnamed: 1_level_1,Unnamed: 2_level_1
http://0-101.net,G,1.330229
http://000.cc,G,1.330229
http://0000.jp,G,2.779429
http://000relationships.com,G,1.460583
http://000vitamins.com,G,2.710047
...,...,...
http://zwinky.com,G,2.710047
http://zylom.com,G,1.735526
http://zynga.com,G,2.652501
http://zz868.com,G,1.047852


# Q9: Predicted categories using more turks

In [194]:
# Same aggregation as the 0.75 run, but with the count cut-off lowered to the
# 0.25 quantile so that more turks contribute to each (url, category) product.
url_category_conf_25 = (
    label_unknown_rating
    .groupby(['url', 'category'])[['count', 'odds']]
    .apply(mul_top_qualite, quantile=0.25)
)

In [195]:
# Flatten the (url, category) -> odds Series into a three-column frame.
# The name is rebound to its own flattened form, so the step is guarded:
# on a re-run the name already holds the frame, and calling reset_index()
# again would add an extra 'index' column and make the three-name column
# assignment fail with a length mismatch.
if isinstance(url_category_conf_25, pd.Series):
    url_category_conf_25 = url_category_conf_25.reset_index()
    url_category_conf_25.columns = ['url', 'category', 'odds']

# For each URL keep the category with the largest odds product; best_category
# returns one (category, odds) tuple per URL.
best_per_url_25 = url_category_conf_25.groupby('url')[['category', 'odds']].apply(best_category)

# Unpack the tuples into two named columns, still indexed by URL.
result_25 = pd.DataFrame(list(best_per_url_25.values),
                         index = best_per_url_25.index,
                         columns = ['best_category', 'best_odds'])

In [196]:
# Per-URL best category and its odds product, from the 0.25-quantile run.
result_25

Unnamed: 0_level_0,best_category,best_odds
url,Unnamed: 1_level_1,Unnamed: 2_level_1
http://0-101.net,G,1.330229e+00
http://000.cc,G,2.185412e+00
http://0000.jp,G,1.731138e+01
http://000relationships.com,G,2.739423e+01
http://000vitamins.com,G,5.024177e+00
...,...,...
http://zwinky.com,G,1.067895e+02
http://zylom.com,G,2.879092e+00
http://zynga.com,G,2.054419e+01
http://zz868.com,G,1.209566e+10


In [197]:
# Line up the two runs per URL; both frames share the same column names, so
# the suffixes mark which quantile run each column came from.
combined_result = result_25.merge(
    result_75,
    on='url',
    suffixes=['_25', '_75'],
)

In [198]:
# Side-by-side view of the 0.25-quantile and 0.75-quantile predictions per URL.
combined_result

Unnamed: 0_level_0,best_category_25,best_odds_25,best_category_75,best_odds_75
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
http://0-101.net,G,1.330229e+00,G,1.330229
http://000.cc,G,2.185412e+00,G,1.330229
http://0000.jp,G,1.731138e+01,G,2.779429
http://000relationships.com,G,2.739423e+01,G,1.460583
http://000vitamins.com,G,5.024177e+00,G,2.710047
...,...,...,...,...
http://zwinky.com,G,1.067895e+02,G,2.710047
http://zylom.com,G,2.879092e+00,G,1.735526
http://zynga.com,G,2.054419e+01,G,2.652501
http://zz868.com,G,1.209566e+10,G,1.047852


In [200]:
# Confusion-style table: rows are the categories predicted by the
# 0.25-quantile run, columns those predicted by the 0.75-quantile run.
pd.crosstab(
    combined_result['best_category_25'],
    combined_result['best_category_75'],
)

best_category_75,G,P,R,X
best_category_25,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
G,8478,363,55,70
P,107,611,18,7
R,22,20,197,27
X,32,5,18,677
