In [1]:
from pandas import Series, DataFrame
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices
%pylab inline

Populating the interactive namespace from numpy and matplotlib


### [Q1 10 points] Read in data

In [2]:
gold = pd.read_csv('gold.txt', sep='\t', names=['url', 'category']).drop_duplicates()
label = pd.read_csv('labels.txt', sep='\t', names=['turk', 'url', 'category']).drop_duplicates()

### [Q2 10 points] Split into two DataFrames

In [3]:
mask = label['url'].isin(gold['url'])
labels_on_gold = label[mask]
labels_unknown = label[-mask]

### [Q3 10 points] Compute accuracies of turks

In [4]:
all_rate = labels_on_gold.merge(gold, left_on = 'url', right_on = 'url', suffixes = ['_lab', '_gold'])
all_rate['correct'] = all_rate['category_lab'] == all_rate['category_gold']
rater_goodness = all_rate.groupby('turk')['correct'].agg(['mean', 'count'])
rater_goodness.iloc[:10]

Unnamed: 0_level_0,mean,count
turk,Unnamed: 1_level_1,Unnamed: 2_level_1
A112DVP1KG4QZU,1.0,1
A1253FXHCZ9CWM,0.517241,29
A12CY1Q7XKJJDE,1.0,1
A12RE8G66WTO8B,0.75,20
A12Y1GTGIQDGRA,0.333333,3
A13CEW9JGDWGX1,1.0,1
A13OE9GBRJ0S2U,0.75,4
A14IQ4GLNWNPOJ,1.0,1
A153PKAL7OAY36,0.722973,148
A1554ZM0CLKSG5,1.0,1


### [Q4 10 points] Odds ratios
If someone is correct p fraction of the time, the odds of success are defined
as:
Attach a column called odds to the rater goodness DataFrame, using the
average correctness of the turk as his or her p.

In [5]:
rater_goodness['odds'] = rater_goodness['mean'] / (1.001 - rater_goodness['mean'])
rater_goodness.iloc[:10]

Unnamed: 0_level_0,mean,count,odds
turk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A112DVP1KG4QZU,1.0,1,1000.0
A1253FXHCZ9CWM,0.517241,29,1.069214
A12CY1Q7XKJJDE,1.0,1,1000.0
A12RE8G66WTO8B,0.75,20,2.988048
A12Y1GTGIQDGRA,0.333333,3,0.499251
A13CEW9JGDWGX1,1.0,1,1000.0
A13OE9GBRJ0S2U,0.75,4,2.988048
A14IQ4GLNWNPOJ,1.0,1,1000.0
A153PKAL7OAY36,0.722973,148,2.600369
A1554ZM0CLKSG5,1.0,1,1000.0


### [Q5 10 points] Most accurate turks

In [6]:
mask = rater_goodness['count'] >= 20
rater_goodness[mask].sort_values(by = 'mean', ascending = False).iloc[:10]

Unnamed: 0_level_0,mean,count,odds
turk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A2U0R4X38GUKZE,0.95,20,18.627451
A22C0PJUBFJTI0,0.916667,36,10.869565
A23YQUBXZPKILZ,0.875,24,6.944444
ATVALOQVDCMZW,0.854369,103,5.826657
A1HIXWH4OXT8S4,0.825,40,4.6875
A3220HG1O83HQ4,0.818182,22,4.475385
A32W20KGQXS0LL,0.8,25,3.9801
A20PWAB7G3HDHU,0.8,20,3.9801
AJSJVK40F5HM6,0.785714,28,3.649635
A31OCN4MNHUQ6W,0.777174,184,3.472222


### [Q6 10 points] Rating counts versus accuracy

In [7]:
y, X = dmatrices('mean ~ count', data = rater_goodness, return_type = 'dataframe')
model = sm.OLS(y, X)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,mean,R-squared:,0.002
Model:,OLS,Adj. R-squared:,-0.002
Method:,Least Squares,F-statistic:,0.5414
Date:,"Mon, 02 Aug 2021",Prob (F-statistic):,0.463
Time:,13:24:07,Log-Likelihood:,-94.561
No. Observations:,269,AIC:,193.1
Df Residuals:,267,BIC:,200.3
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6438,0.024,27.115,0.000,0.597,0.691
count,0.0007,0.001,0.736,0.463,-0.001,0.002

0,1,2,3
Omnibus:,29.163,Durbin-Watson:,2.068
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28.684
Skew:,-0.738,Prob(JB):,5.91e-07
Kurtosis:,2.382,Cond. No.,30.1


The number of ratings does not have a significant effect the accuracy of the rating (p > .05).

### [Q7 13 points] Overall predicted odds
Consider each url u that is not in the gold set, and each category c. For the pair (u, c), calculate the product of odds of all turks who (a) rated url u as category c, and (b) have rated more gold set urls than 75% of all turks who rated at least one gold-set url.
For example, if you find that there are 269 turks who rated at least one gold-set url, you want to select only the turks who have rated more gold-set urls than 75% of these 269 turks. We can think of these as our “reliable” turks. Now, our belief that url u belongs to category c depends on how many “reliable” turks rated u as c; specifically, our belief is based on the product of their reliability scores (i.e., their odds).
We shall call such products of odds the overall odds henceforth

In [80]:
# get count of ratings from > 75%
greater = rater_goodness['count'].quantile(q = .75)

# mask if grater than 75%
mask = labels_unknown.groupby('turk')['url'].agg('count') > greater
counts = labels_unknown.groupby('turk')['url'].agg('count')[mask]
reliable = labels_unknown[labels_unknown['turk'].isin(counts.index)]

# merge with rater goodness
unknown = reliable.merge(rater_goodness, left_on = 'turk', right_on = 'turk')

# calculate product of odds
unknown = pd.pivot_table(unknown, index = "url", columns = "category", values = "odds", aggfunc = "prod")

unknown

category,G,P,R,X
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
http://0-101.net,2.297602e+00,,,
http://000.cc,2.218192e+03,0.998004,,
http://0000.jp,2.847963e+07,,,
http://000relationships.com,0.000000e+00,1.851852,0.332889,
http://000vitamins.com,5.343559e+03,,,
...,...,...,...,...
http://zwinky.com,2.264765e+02,1000.000000,,
http://zylom.com,3.062113e+06,,,
http://zynga.com,5.386158e+01,,,
http://zz868.com,1.209566e+16,1.851852,,


### [Q8 13 points] Predicted categories

In [81]:
result_75 = DataFrame(index = unknown.index)
result_75['top category'] = unknown.T.idxmax()
result_75['top odds'] = unknown.max(axis = 1)
result_75.iloc[:10]

Unnamed: 0_level_0,top category,top odds
url,Unnamed: 1_level_1,Unnamed: 2_level_1
http://0-101.net,G,2.297602
http://000.cc,G,2218.192
http://0000.jp,G,28479630.0
http://000relationships.com,P,1.851852
http://000vitamins.com,G,5343.559
http://000webhost.com,G,11868900000000.0
http://003my.com,G,4.861524
http://007absolutehardcore.com/early_cumshots/index.html,X,584.7515
http://007swz.cn,G,1.482491
http://01768.com,G,1.485456


### [Q9 14 points] Predicted categories using more turks
Questions 7 and 8 above only considered the ratings of turks who had rated enough gold set URLs, so we were relatively more confident about their accuracies. What happens if we loosen this restriction?
Repeat the code of Q7 and Q8, but replacing 75% by 25% in the descrip- tion of Q7 (i.e., we also consider turks who have far fewer gold set ratings). Call this result 25.
Now let’s see how these two results compare. Create a DataFrame where both the index and the columns are the various categories, and the cells
contain the number of urls with these as the top categories according to result 75 and result 25.
For example, the cell corresponding to the row category=R and the col- umn category=G would be the number of URLs that were predicted to be R by result 75 but predicted to be G by result 25.
Where are the most errors?

In [106]:
# get count of ratings from > 25%
greater = rater_goodness['count'].quantile(q = .25)

# mask if grater than 75%
mask = labels_unknown.groupby('turk')['url'].agg('count') > greater
counts = labels_unknown.groupby('turk')['url'].agg('count')[mask]
reliable = labels_unknown[labels_unknown['turk'].isin(counts.index)]

# merge with rater goodness
unknown = reliable.merge(rater_goodness, left_on = 'turk', right_on = 'turk')

# calculate product of odds
unknown = pd.pivot_table(unknown, index = "url", columns = "category", values = "odds", aggfunc = "prod")

result_25 = DataFrame(index = unknown.index)
result_25['top category'] = unknown.T.idxmax()
result_25['top odds'] = unknown.max(axis = 1)

all_cats = result_25.join(result_75, lsuffix = '_25', rsuffix = '_75')

pd.crosstab(index = all_cats['top category_25'], columns = all_cats['top category_75'])

top category_75,G,P,R,X
top category_25,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
G,8279,7,1,0
P,9,1162,0,0
R,3,3,499,2
X,2,0,0,734
