# Count journals

- What are the patterns for each representative conference?  
- How many entries should I manually label for some representative conferences?

I submitted [an issue](https://github.com/allenai/s2orc/issues/26) to ask whether the S2ORC authors plan to unify the conference names in future releases.

In [8]:
import json
import numpy as np
import pandas as pd
import os, sys, time
import re
from pprint import pprint

from utils import timed_func

# Load the per-field venue counts. The JSON maps each field of study to a
# {venue name: paper count} mapping (see how `journals[...]` is used below).
# The encoding is given explicitly so that non-ASCII venue names decode the
# same way on every platform instead of depending on the locale default.
with open(
    os.path.join("../scripts/20200718_preprocess", "journals_count.json"),
    "r",
    encoding="utf-8",
) as f:
    journals = json.load(f)

# Directory that receives the CSV files written by this notebook.
# exist_ok=True keeps the cell idempotent on re-runs without a separate
# existence check.
export_data_dir = "0826_data"
os.makedirs(export_data_dir, exist_ok=True)

journals.keys()

dict_keys(['Economics', 'Biology', 'Geography', 'Philosophy', 'Materials Science', 'Business', 'Mathematics', 'Chemistry', 'Medicine', 'Psychology', 'Geology', 'Political Science', 'Engineering', 'Art', 'Physics', 'Computer Science', 'Environmental Science', 'Sociology', 'History'])

In [2]:
# Venue -> paper-count mapping for the "Computer Science" field; the cells
# below use it as the working dataset.
cs = journals["Computer Science"]
# Number of distinct venue keys in that mapping.
len(cs)

187903

In [3]:
# Rank the venues of one field by their number of publications
def most_popular_journals(catname, num=10, min_count=10):
    """Print the most frequent venues of one field of study.

    Looks up ``journals[catname]`` (a venue -> paper-count mapping), keeps the
    venues with strictly more than ``min_count`` papers, reports how many
    venues passed that filter, and pretty-prints the ``num`` largest ones as
    ``(venue, count)`` tuples. Nothing is returned.
    """
    venue_counts = journals[catname]
    frequent = sorted(
        (
            (venue, n_papers)
            for venue, n_papers in venue_counts.items()
            if n_papers > min_count
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print("{} venues contain more than {} items".format(len(frequent), min_count))
    pprint(frequent[:num])
    
# Show the ten largest Computer Science venues (default min_count of 10).
most_popular_journals("Computer Science", num=10)

34534 venues contain more than 10 items
[('None', 385185),
 ('ArXiv', 103996),
 ('IEEE Access', 29722),
 ('Acta Crystallographica Section E: Structure Reports Online', 15444),
 ('International Journal of Computer Applications', 12825),
 ('INTERSPEECH', 10952),
 ('CACM', 10238),
 ('Applied Mechanics and Materials', 9476),
 ('Bioinformatics', 9261),
 ('Multimedia Tools and Applications', 8943)]


In [67]:
def data_to_df(category, export, min_count):
    """Turn a venue -> paper-count mapping into a count-sorted DataFrame.

    Parameters
    ----------
    category : dict
        Mapping from venue name to number of papers.
    export : bool
        When true, write the filtered table to ``journal_counts_all.csv``
        inside the module-level ``export_data_dir``.
    min_count : int
        Venues with fewer than ``min_count`` papers are dropped (the
        threshold itself is kept).

    Returns
    -------
    pandas.DataFrame
        Columns ``venue``, ``count`` and ``label`` (an empty-string
        placeholder, presumably filled in by hand later), sorted by ``count``
        in descending order. The row index is not reset.
    """
    # Build the table from the mapping that was passed in. The earlier version
    # iterated over the global `cs`, which only worked when `category` was
    # that same object.
    venues = list(category)
    df = pd.DataFrame({
        "venue": venues,
        "count": [category[name] for name in venues],
        "label": [""] * len(venues),
    }).sort_values(by="count", ascending=False)
    total_articles = df["count"].sum()
    print ("Total articles: {} (from {} venues)".format(
        total_articles, len(df)
    ))
    # .copy() hands back an independent frame, so callers can add columns to
    # it without pandas' chained-assignment warning.
    df = df[df["count"] >= min_count].copy()
    considered_articles = df["count"].sum()
    # Guard the percentage against an empty mapping (0 / 0).
    share = considered_articles / total_articles * 100 if total_articles else 0.0
    # The filter is inclusive, so the message says "at least".
    print ("{} articles in venues with at least {} papers: ({:.2f}%, from {} venues)".format(
        considered_articles, min_count,
        share, len(df)
    ))
    if export:
        df.to_csv(os.path.join(export_data_dir, "journal_counts_all.csv"), index=False)
    return df

# Build the Computer Science venue table, keeping venues with a count of at
# least 5, and write it to CSV (export=True).
df = data_to_df(cs, export=True, min_count=5)
# Preview the largest venues.
df.head()

Total articles: 4305658 (from 187903 venues)
4131634 articles with more than 5 papers: (95.96%, from 46817 venues)


Unnamed: 0,venue,count,label
21,,385185,
14,ArXiv,103996,
164,IEEE Access,29722,
99,Acta Crystallographica Section E: Structure Re...,15444,
502,International Journal of Computer Applications,12825,


In [70]:
def _contains(token):
    """Return a rule that is true when `token` occurs verbatim (case-sensitive) in a venue name."""
    return lambda venue: token in venue


def _contains_ci(token):
    """Return a rule that is true when `token` occurs in the lowercased venue name."""
    return lambda venue: token in venue.lower()


# Each list holds predicates over a venue name; a venue belongs to the
# category when at least one predicate of the list is true.
# NOTE(review): all rules are plain substring checks, so short tokens also hit
# unrelated names (e.g. "ling" is contained in "modelling", "AI" in any
# upper-case "...AI..." run) -- confirm this breadth is intended.

# Computational linguistics / NLP
rules_cl = [
    lambda venue: "EMNLP" in venue or "empirical methods" in venue.lower(),
    _contains("ACL"),
    _contains_ci("conll"),
    _contains_ci("ling"),
    _contains_ci("language"),
    _contains("LREC"),
    _contains("HLT"),
    _contains("IJCNLP"),
    _contains("SIGDIAL"),
]

# Speech processing
rules_speech = [
    _contains("ICASSP"),
    _contains_ci("speech"),
    _contains("SLT"),
]

# Machine learning
rules_ml = [
    _contains("NIPS"),
    _contains_ci("neural"),
    # Already implied by the "neural" rule above; kept so the list is unchanged.
    _contains_ci("neural network"),
    _contains("ICLR"),
    _contains("ICML"),
    _contains_ci("learn"),
    _contains("COLT"),
]

# General artificial intelligence
rules_ai = [
    _contains("AI"),
    _contains_ci("artificial"),
    _contains_ci("intelligence"),
    _contains_ci("fuzzy"),
    _contains_ci("knowledge"),
    _contains_ci("soft comp"),
    _contains_ci("neurocomp"),
]

# Computer vision
rules_cv = [
    _contains("CVPR"),
    _contains_ci("vision"),
    _contains_ci("pattern"),
    _contains_ci("recognition"),
    _contains_ci("image"),
    _contains("ICIP"),
    _contains("ECCV"),
    _contains("ICCV"),
    _contains("BMVC"),
]

# Robotics
rules_robo = [
    _contains_ci("robotic"),
    _contains("ICRA"),
    _contains("RSS"),
    _contains_ci("automat"),
]
        
def find_AI_venues(df, all_rules=None):
    """Flag venues by category and return those matching at least one category.

    For every category a boolean column named after it is added to ``df``
    IN PLACE (this side effect is kept from the original implementation), and
    the number of venues and papers it covers is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``venue`` column (names handed to the rules) and a ``count``
        column (papers per venue).
    all_rules : dict, optional
        Mapping from category name to a list of predicates over a venue name;
        a venue is in the category when any predicate is true. Defaults to the
        six module-level rule lists (NLP, Speech, ML, AI, CV, Robo).

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (including the new boolean columns) for which at
        least one category matched.
    """
    if all_rules is None:
        all_rules = {
            "NLP": rules_cl, "Speech": rules_speech, "ML": rules_ml,
            "AI": rules_ai, "CV": rules_cv, "Robo": rules_robo,
        }
    venue_names = list(df["venue"])
    # The total does not change between categories, so compute it once.
    total_paper = df["count"].sum()
    # Running union of all category masks. Deriving it inside the loop keeps
    # the result in sync with `all_rules` instead of repeating the category
    # names in a hand-written OR expression.
    in_any_category = np.array([False] * len(df), dtype=bool)
    for rulename, rules in all_rules.items():
        matched = np.array(
            [any(rule(name) for rule in rules) for name in venue_names],
            dtype=bool,
        )
        df[rulename] = matched
        in_any_category = np.logical_or(in_any_category, matched)
        print (f"Category: {rulename}")
        df_cat = df[matched]
        print ("    Venues included: {}".format(len(df_cat)))
        paper_covered = df_cat["count"].sum()
        # Guard the percentage against an empty table (0 / 0).
        coverage = paper_covered / total_paper * 100 if total_paper else 0.0
        print("    Paper covered: {} of {} ({:.2f}%)".format(
            paper_covered, total_paper, coverage))

    df_ai = df[in_any_category]
    return df_ai

# Flag the venues in `df` and keep those matching at least one category.
df_ai = find_AI_venues(df)
print(df_ai.shape)
# Write the selected venues next to the other exported CSV files.
df_ai_csv_path = os.path.join(export_data_dir, "df_ai.csv")
df_ai.to_csv(df_ai_csv_path, index=False)

Category: NLP
    Venues included: 1566
    Paper covered: 77934 of 4131634 (1.89%)
Category: Speech
    Venues included: 198
    Paper covered: 54862 of 4131634 (1.33%)
Category: ML
    Venues included: 765
    Paper covered: 72083 of 4131634 (1.74%)
Category: AI
    Venues included: 2437
    Paper covered: 183116 of 4131634 (4.43%)
Category: CV
    Venues included: 996
    Paper covered: 132429 of 4131634 (3.21%)
Category: Robo
    Venues included: 1149
    Paper covered: 125986 of 4131634 (3.05%)
(6681, 9)


In [71]:
# Each selected venue is one row of df_ai, so a venue that matches several
# categories contributes its papers only once to this total.
covered_paper_total = df_ai["count"].sum()
print("Total papers covered: {}".format(covered_paper_total))

Total papers covered: 614874
