In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import nltk
import random
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression


In [None]:
# Review table read from tolabel.csv: keep only the columns used in this
# notebook and key the rows by manuscript number.
review_columns = ["Manuscript no.", "Reviewer ID", "CleanedComments", "Rec",
                  "Suitable", "ShouldBe", "HumanLabel"]
df = pd.read_csv("tolabel.csv", sep="|")
df = df[review_columns].set_index(["Manuscript no."])

# Model scores (tab-separated; the column names are supplied here).
scored_bert = pd.read_csv(
    "metascience/PeerRead/bert_output_20191104/eval_results_full_allelife.txt",
    sep="\t",
    names=["id", "score", "dummy", "text"],
)

# Peek at high-scoring texts. The value is discarded because it is not the
# cell's last expression; positions 1-9 skip the single highest-scoring row.
list(scored_bert.sort_values(by="score", ascending=False).iloc[1:10]["text"])

# Scores and texts are attached purely by position, so both files must list
# the reviews in the same order.
df["score"] = scored_bert["score"].tolist()
df["Text"] = scored_bert["text"].tolist()

In [None]:
# Reviewer metadata. Malformed rows are skipped with a warning instead of
# aborting the load. `error_bad_lines` was deprecated in pandas 1.3 and removed
# in 2.0; `on_bad_lines="warn"` is the equivalent replacement, so try it first
# and fall back to the legacy keyword on older installs.
try:
    reviewers = pd.read_csv("gender_reviewers.csv", on_bad_lines="warn")
except TypeError:  # pandas < 1.3 does not accept `on_bad_lines`
    reviewers = pd.read_csv("gender_reviewers.csv", error_bad_lines=False)

# Original author's note: "this is wrong".
# The table below counts non-null reviewer names per Reviewer ID.
# NOTE(review): it is not used by any later cell shown here - confirm before relying on it.
reviewers_data = pd.DataFrame(reviewers.groupby("Reviewer ID")["Reviewer name"].count())
reviewers_data.columns = ["reviewer_count"]

In [None]:
# Number of rows with a non-null gender for each Reviewer ID, broadcast back
# onto every row of that reviewer.
gender_by_reviewer = reviewers.groupby("Reviewer ID")["gender"]
reviewers["review_count"] = gender_by_reviewer.transform("count")

In [None]:
# Last dot-separated token of each reviewer e-mail; rows whose e-mail could
# not be split (e.g. missing values) get an empty string.
email_parts = reviewers["Reviewer email"].str.split(".")
domain = [parts[-1] if isinstance(parts, list) else "" for parts in email_parts]
reviewers["domain"] = domain

In [None]:
# Paper history: one row per manuscript, indexed like `df`; manuscripts
# without a full-submission decision are dropped.
e = pd.read_csv("/share/pi/dmcfarla/eLifeRawData/DM_Data_Reviews/DM_Data/eLife_Paper_history_2019_03_15.csv")
e["Manuscript no."] = e["ms"]
e = e.set_index(["Manuscript no."]).dropna(subset=["full_decision"])

# FinalDecision is the last non-missing entry among these columns, in the
# order listed. Papers rejected at the initial decision are excluded.
decision_columns = ["full_decision", "rev1_decision", "rev2_decision", "rev3_decision", "rev4_decision"]


def _last_recorded_decision(row):
    """Return the last non-missing value among `decision_columns` for one row."""
    return row[decision_columns].dropna().tolist()[-1]


e["FinalDecision"] = e.apply(_last_recorded_decision, axis=1)
# outcome: 1 when the final decision is an acceptance, 0 otherwise.
e["outcome"] = np.where(e["FinalDecision"] == "Accept Full Submission", 1, 0)


In [None]:
# Attach the paper history to every review. Both frames are indexed by
# "Manuscript no.", and DataFrame.join defaults to a left join on the index,
# so reviews without a matching history row keep NaN in the history columns.
df_e = df.join(e)
# Reviewer attributes are merged in later, only for the disagreement subset.

In [None]:
# Mean model score for rejected (0) vs accepted (1) papers. The column is
# selected before aggregating: averaging the whole frame is wasteful and
# raises on text columns in pandas >= 2.0.
df_e.groupby("outcome")["score"].mean()

In [None]:
# Per-country summary: number of non-null recommendations ("Rec"), mean model
# score and acceptance rate. Each column is selected before aggregating so
# that non-numeric columns are never averaged (a TypeError in pandas >= 2.0).
by_country = df_e.groupby("country")
countries = by_country["Rec"].count().to_frame()
countries["mean_score"] = by_country["score"].mean()
countries["mean_outcome"] = by_country["outcome"].mean()

In [None]:
# Mean score per country, ascending, restricted to countries with more than
# 300 counted recommendations.
countries[countries["Rec"] > 300]["mean_score"].sort_values()

In [None]:
# Acceptance rate per country, ascending, restricted to countries with more
# than 300 counted recommendations.
countries[countries["Rec"] > 300]["mean_outcome"].sort_values()

In [None]:
labeled = df.loc[~pd.isna(df.HumanLabel)]
labeled = labeled.loc[labeled.HumanLabel <= 5]
labeled[["score", "HumanLabel"]].corr()

%matplotlib inline
import matplotlib.pyplot as plt
plt.scatter(labeled.HumanLabel, labeled.score)
labeled.HumanLabel.value_counts()

In [None]:
# Histogram of model scores over the human-labelled reviews.
# The magic and the import below repeat what the previous cell already did.
%matplotlib inline
import matplotlib.pyplot as plt

plt.hist(labeled.score)

In [None]:
# Histogram of the human labels (values above 5 were filtered out earlier).
plt.hist(labeled["HumanLabel"])

In [None]:
# Default category; rows whose z-score lies within [-1, 1] keep this value
# after the labelling cell below.
df_e["review_outcome"] = "none"

In [None]:
# Standardise the model score using the population standard deviation (ddof=0).
score_mean = df_e["score"].mean()
score_std = df_e["score"].std(ddof=0)
df_e["zscore"] = (df_e["score"] - score_mean) / score_std

In [None]:
# Label each review "<review sign>_<paper outcome>": the review is "pos" when
# its z-score is above 1 and "neg" when below -1; the paper is "pos" when
# accepted (outcome == 1) and "neg" otherwise.
high_score = df_e.zscore > 1
low_score = df_e.zscore < -1
accepted = df_e.outcome == 1
rejected = df_e.outcome == 0

df_e.loc[high_score & accepted, "review_outcome"] = "pos_pos"
df_e.loc[high_score & rejected, "review_outcome"] = "pos_neg"
df_e.loc[low_score & rejected, "review_outcome"] = "neg_neg"
df_e.loc[low_score & accepted, "review_outcome"] = "neg_pos"


In [None]:
# Number of reviews in each review_outcome category (including "none").
df_e.review_outcome.value_counts()

In [None]:
# Reviews whose sign contradicts the paper's final outcome.
disagreement = df_e.loc[df_e.review_outcome.isin(["pos_neg", "neg_pos"])]

In [None]:
# All reviews of every manuscript that has at least one disagreeing review.
# A boolean mask replaces the original set-based .loc lookup: pandas deprecated
# set indexers in 1.5 and rejects them from 2.0. The mask also keeps df_e's
# row order, and .copy() lets later cells add columns without touching df_e.
disagreement_papers = df_e.loc[df_e.index.isin(disagreement.index)].copy()

In [None]:
def get_example_disagreement():
    """Print every review of one randomly chosen paper in `disagreement_papers`.

    One row label of `disagreement_papers` is drawn uniformly at random, so a
    manuscript with more reviews is more likely to be picked. The paper's
    outcome is printed once, followed by the score and cleaned comments of
    each of its reviews.
    """
    # Draw from a plain list: some Python versions truth-test the argument of
    # random.choice, which a pandas Index does not support.
    manuscript = random.choice(disagreement_papers.index.tolist())
    # A one-element list selector always yields a DataFrame. A bare label
    # returns a Series when the manuscript has a single review, which broke
    # the row-wise printing below.
    ex = disagreement_papers.loc[[manuscript], ["CleanedComments", "score", "outcome"]]
    print("outcome:", ex["outcome"].iloc[0])
    for score, comments in zip(ex["score"], ex["CleanedComments"]):
        print(score, str(comments) + "\n")
    
# Show one random example; re-run the cell to draw another.
get_example_disagreement()

In [None]:
# "winner": the review's sign matched the paper outcome; "loser": it
# contradicted the outcome; "none": the review was within one standard
# deviation of the mean score.
matches_outcome = disagreement_papers.review_outcome.isin(["neg_neg", "pos_pos"])
contradicts_outcome = disagreement_papers.review_outcome.isin(["neg_pos", "pos_neg"])
disagreement_papers["winner"] = np.where(matches_outcome.tolist(), "winner", "none")
disagreement_papers["winner"] = np.where(contradicts_outcome.tolist(), "loser", disagreement_papers["winner"])

In [None]:
# Number of reviews per review_outcome category within the disagreement papers.
disagreement_papers.review_outcome.value_counts()

In [None]:
# Character length of each cleaned review (NaN where the comment is missing).
comment_text = disagreement_papers["CleanedComments"]
disagreement_papers["length_text"] = comment_text.str.len()

In [None]:
# Mean review length per winner / loser / none group. The column is selected
# before aggregating: averaging the whole frame raises on its text columns in
# pandas >= 2.0.
disagreement_papers.groupby("winner")["length_text"].mean()

In [None]:
# Keep only the reviews that took a clear side; drop the "none" group.
winners_losers = disagreement_papers.loc[disagreement_papers.winner.isin(["winner", "loser"])]

In [None]:
# Classifier that predicts, from the review text alone, whether a review was a
# "winner" (its sign matched the paper's final outcome).

# Work on a copy. `winners_losers` is itself a filtered frame, and writing the
# parsed dates back through an alias triggered SettingWithCopyWarning and
# changed the upstream frame as a side effect.
x = winners_losers.copy()
x["initial_qc_dt"] = pd.to_datetime(x["initial_qc_dt"])

# Temporal split: train up to and including 2017-06-30, test on the rest of
# 2017. Rows with a missing date, or dated 2018 onwards, fall in neither set.
train_cutoff = pd.to_datetime("2017-06-30")
test_end = pd.to_datetime("2018-01-01")
train = x[x["initial_qc_dt"] <= train_cutoff]
test = x[(x["initial_qc_dt"] > train_cutoff) & (x["initial_qc_dt"] < test_end)]

# Not used below; kept in case other cells rely on the name.
word_vectorizer = CountVectorizer(analyzer='word')
tags = [i == "winner" for i in train["winner"]]
test_tags = [i == "winner" for i in test["winner"]]

# TF-IDF over unigrams and bigrams: terms must occur in at least 20 training
# reviews and in at most 80% of them; English stop words are removed.
vectorizer = TfidfVectorizer(min_df=20, max_df=0.8, ngram_range=(1, 2),
                             stop_words=stopwords.words('english'))
processed_features = vectorizer.fit_transform(list(train["CleanedComments"]))
test_processed_features = vectorizer.transform(list(test["CleanedComments"]))

text_classifier = LogisticRegression()
text_classifier.fit(processed_features, tags)
predictions = text_classifier.predict(processed_features)

# In-sample accuracy only. NOTE(review): `test_processed_features` and
# `test_tags` are prepared above but never scored in this cell.
np.mean(predictions == tags)



In [None]:
# print top10 features, bottom10
def print_top10(vectorizer, clf):
    """Print the ten highest- and ten lowest-weighted features of a linear model.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing ``get_feature_names_out()`` or the
        legacy ``get_feature_names()``; entry ``j`` names feature column ``j``.
    clf : fitted model whose ``coef_[0]`` holds one weight per feature.

    With fewer than twenty features the two printed lists overlap.
    """
    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); accept whichever the installed version provides.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Feature indices in ascending weight order; sorted once and reused.
    order = np.argsort(clf.coef_[0])
    print("top 10")
    for j in order[-10:][::-1]:
        print(feature_names[j])
    print("\nbottom 10")
    for j in order[:10]:
        print(feature_names[j])

# Terms with the largest positive and negative weights in the fitted
# winner/loser classifier.
print_top10(vectorizer, text_classifier)

In [None]:
# List the available columns before merging in the reviewer attributes.
disagreement_papers.columns

In [None]:
# Attach reviewer attributes to each review of a disagreement paper. This is
# an inner join, so reviews with no matching reviewer row are dropped.
disagreement_reviews = disagreement_papers.reset_index()
review_dis = pd.merge(disagreement_reviews, reviewers, on=["Manuscript no.", "Reviewer ID"])

In [None]:
# Mean review_count per winner / loser / none group. The column is selected
# before aggregating: averaging the whole frame raises on its text columns in
# pandas >= 2.0.
review_dis.groupby("winner")["review_count"].mean()

In [None]:
# True where the gender string contains "female", False for other strings;
# rows with a missing gender stay missing rather than becoming False.
review_dis["gender_binary"] = review_dis["gender"].str.contains("female")

In [None]:
# Share of female reviewers per winner / loser / none group. Missing genders
# are ignored. The flag is cast to float first, since it may be an
# object-dtype column (True / False / NaN), and only this column is
# aggregated: averaging the whole frame raises in pandas >= 2.0.
review_dis["gender_binary"].astype(float).groupby(review_dis["winner"]).mean()

In [None]:
# Overall share of female reviewers, as a baseline for the per-group shares.
review_dis.gender_binary.mean()

In [None]:
# Share of winner / loser / none reviews within each e-mail domain.
# Review counts per (domain, winner) pair.
c = review_dis.groupby(["domain", "winner"]).count()["Manuscript no."]
# Total reviews per domain (level 0 of the index), as a two-column frame.
sums = c.groupby(level=0).sum().reset_index()
# Keep only domains with more than 100 reviews in total.
keepers = sums.loc[sums["Manuscript no."] > 100]["domain"]
c = c.loc[keepers]
# Recompute the per-domain totals for the kept domains, this time leaving
# "domain" as the index so it lines up with `c` in the division.
sums = c.groupby(level=0).sum()
review_domain_winners = c/sums
review_domain_winners = review_domain_winners.reset_index()
# After reset_index the "Manuscript no." column holds the share, not a count.
# Show the winner share per domain in ascending order.
review_domain_winners.loc[review_domain_winners.winner == "winner"].sort_values("Manuscript no.")