# Undis_List_Generation_for_FastText

This notebook builds a list of undismissed researchers extracted from the **author_df_nodup_aff.csv** dataset. The resulting list is later used to generate features with FastText.

In [1]:
# Math packages
import math
import numpy as np
import pandas as pd

# Other packages
import re
import time
import string
import pickle
import collections
import gensim
import warnings; warnings.simplefilter('ignore')

In [2]:
# Load the dismissed-researcher publication table, repair the misspelled
# 'subheadind' column name, then discard exact duplicate rows and rows that
# have no keywords.
data_dismiss = (
    pd.read_csv('./data/dismissed_complete.csv')
    .rename(columns={'subheadind': 'subheading'})
    .drop_duplicates()
    .dropna(subset=['keywords'])
)

In [3]:
# Parse the dismissed-academics list. Each non-empty line is read as
# "<name tokens>,<affiliation>[,...]":
#   * the affiliation is the second comma-separated field, kept verbatim;
#   * the name is re-ordered so that its last whitespace-separated token
#     comes first, followed by the remaining tokens.
searchnames_dismiss_univ = []
searchnames_dismiss_fullname = []
# The context manager guarantees the file handle is closed after parsing.
with open('./data/Dismis_Acad_List_CLEAN.txt','r') as dismissed_file:
    for line in dismissed_file:
        QueryString = line.strip()
        if not QueryString:
            # Blank lines carry no record; indexing their fields would fail.
            continue
        fields = QueryString.split(',')
        name_tokens = fields[0].split()
        QueryString_aff = fields[1]
        QueryString_fullname = name_tokens[-1]+' '+' '.join(name_tokens[:-1])
        searchnames_dismiss_fullname.append(QueryString_fullname)
        searchnames_dismiss_univ.append(QueryString_aff)

In [4]:
# Pair every dismissed author's re-ordered name with the affiliation parsed
# from the same line of the dismissed-academics list.
auth_aff_df = pd.DataFrame(
    dict(Author=searchnames_dismiss_fullname,
         Affiliation=searchnames_dismiss_univ)
)

# Author table: keep only the columns used by the rest of the notebook,
# in this fixed order.
auth_names = pd.read_csv('./data/author_df_nodup_aff.csv').loc[
    :, ['Author', 'AbbrNames', 'title', 'year', 'journal', 'keyword', 'uid', 'Affiliation']
]

In [5]:
def FindAff(paper_df,Abbrname,idx):
    """Return the upper-cased organization of one author on one paper.

    Parameters
    ----------
    paper_df : pandas.DataFrame
        Frame with a ``name_info`` column. Each cell is a string that
        evaluates to a dict keyed by ``'<Abbrname>*'``.
    Abbrname : str
        Abbreviated author name; ``'*'`` is appended to build the lookup key.
    idx : int
        Positional row index into ``paper_df``.

    Returns
    -------
    str or None
        The ``'organization'`` value in upper case, taken from the author's
        ``'address'`` entry, or from ``'addr_no_1'`` when that is the only
        numbered address (no ``'addr_no_2'``). ``None`` when neither case
        applies or the stored organization is ``None``.

    Raises
    ------
    KeyError
        If the author key or the ``'organization'`` field is missing.
    """
    # NOTE(review): eval() runs arbitrary expressions found in the data file;
    # only use this on trusted inputs.
    author_info = eval(paper_df.iloc[idx].name_info)[Abbrname+'*']

    if 'address' in author_info:
        address = author_info['address']
    elif 'addr_no_1' in author_info and 'addr_no_2' not in author_info:
        # With several numbered addresses the affiliation is not unique,
        # so only the single-address case is accepted.
        address = author_info['addr_no_1']
    else:
        return None

    organization = address['organization']
    return None if organization is None else organization.upper()


def FindDispName(paper_df,Abbrname,idx):
    """Extract display name, sequence number and daisng id for one author.

    The ``name_info`` cell of row ``idx`` is evaluated to a dict and the entry
    stored under ``Abbrname + '*'`` is read.

    Returns
    -------
    tuple
        ``(disp_name, seq_no, daisng_id)`` where ``disp_name`` is the display
        name split on non-word characters, re-joined with single spaces and
        upper-cased (``None`` if there is no ``'display_name'`` key), and the
        two ids are converted with ``int`` or ``None`` when the stored value
        is falsy.
    """
    record = eval(paper_df.iloc[idx].name_info)[Abbrname+'*']

    disp_name = None
    if 'display_name' in record:
        # Drop the empty strings re.split leaves around punctuation.
        tokens = [tok for tok in re.split(r"\W+", record['display_name']) if tok]
        disp_name = ' '.join(tokens).upper()

    raw_seq = record['seq_no']
    seq_no = int(raw_seq) if raw_seq else None

    raw_id = record['daisng_id']
    daisng_id = int(raw_id) if raw_id else None

    return disp_name, seq_no, daisng_id


def FindHead (paper_df,idx):
    try:
        heading_tmp = eval(paper_df.iloc[idx].subheading)
        heading = []
        for head in heading_tmp:
            heading+=(list(filter(None,re.split(r"\W+",head.lower()))))
    except:
        heading = None
    return heading

def FindSub (paper_df,idx):
    try:
        subject_tmp = eval(paper_df.iloc[idx].traditional_sub)
        subject = []
        for sub in subject_tmp:
            subject+=(list(filter(None,re.split(r"\W+",sub.lower()))))
    except:
        subject = None
    return subject


def AvgSubVec(sublist,model):
    """Average the ``model`` entries of the lower-cased items in ``sublist``.

    ``model`` only needs to support ``model[key]`` lookups whose results can
    be added together and divided by an integer. An empty ``sublist`` raises
    ``ZeroDivisionError``.
    """
    running_total = 0
    for token in sublist:
        key = token.lower()
        running_total += model[key]
    mean_vec = running_total/len(sublist)
    return mean_vec


def paper_clean(paper_df,Abbrname,NumCluster):
    """Build a per-paper summary frame for the author ``Abbrname``.

    Parameters
    ----------
    paper_df : pandas.DataFrame
        One row per paper. The columns ``title``, ``publish_date``,
        ``keywords``, ``traditional_sub`` and ``name_info`` are read.
    Abbrname : str
        Abbreviated author name, forwarded to ``FindDispName``/``FindAff``.
    NumCluster : int
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title, Disp_name, Aff, Sub, seq_no, daisng_id,
        publish_date, keywords`` with a fresh 0..n-1 index and one row per
        row of ``paper_df``, in the same order. ``Sub`` holds the token list
        from ``FindSub`` (``None`` when ``traditional_sub`` is null).

    Notes
    -----
    Rows are collected in a list and the frame is built once. This avoids
    ``DataFrame.append`` (removed in pandas 2.0) and the quadratic cost of
    growing a frame row by row. Heading tokens are not part of the output,
    so they are not computed here.
    """
    columns = ['Title','Disp_name','Aff','Sub','seq_no','daisng_id','publish_date','keywords']
    records = []
    for i in range(len(paper_df)):
        row = paper_df.iloc[i]
        disp_name, seq_no, daisng_id = FindDispName(paper_df, Abbrname, i)
        Aff = FindAff(paper_df, Abbrname, i)
        # Only tokenize the subject when the cell is actually populated.
        Sub = None if pd.isnull(row.traditional_sub) else FindSub(paper_df, i)
        records.append({
            'Title': row.title,
            'Disp_name': disp_name,
            'Aff': Aff,
            'Sub': Sub,
            'seq_no': seq_no,
            'daisng_id': daisng_id,
            'publish_date': row.publish_date,
            'keywords': row.keywords,
        })
    # Passing `columns` keeps the schema stable even when there are no rows.
    return pd.DataFrame(records, columns=columns)

In [6]:
# Build `total_no_dis_pub`: for every abbreviated name in `auth_names`, collect
# the publications of authors who share that abbreviated name but are not the
# dismissed researchers themselves.

# Affiliation keys do not depend on the author, so they are computed once.
# The first key is the literal 'ISTANBUL UNIV'; every other key is the list
# entry with its last six characters removed.
# NOTE(review): presumably the first list entry is Istanbul University and the
# six-character suffix is a fixed tag — confirm against the input file.
affiliation_keys = ['ISTANBUL UNIV' if pos == 0 else uni[:-6]
                    for pos, uni in enumerate(auth_aff_df['Affiliation'].unique())]

# One frame per abbreviated name that produced at least one selected author;
# concatenated once after the loop instead of growing a frame per iteration.
selected_pubs = []

for Abbrname in auth_names.AbbrNames.dropna().unique():
    # Regex matching the literal text  'ABBR*  (quote, name, star).
    MatchName = "'" + Abbrname + r"\*"

    # Dismissed authors sharing this abbreviated name.
    # NOTE(review): Abbrname is used as an unescaped regex / substring here,
    # so e.g. 'LI J' also matches 'ALI J' — confirm this is intended.
    fullname_search = auth_names.loc[
        auth_names.AbbrNames.str.contains(Abbrname, na=False),
        ['Author','Affiliation','title','uid','keyword','year']].copy()
    auth2cluster = fullname_search.Author.unique()
    NumCluster = auth2cluster.shape[0]

    # Candidate papers: one row per uid, keywords required.
    data_to_cluster = (
        data_dismiss.loc[data_dismiss.AbbrNames.str.contains(MatchName, na=False)]
        .merge(fullname_search.drop(columns=['title','keyword']), on='uid', how='outer')
        .dropna(subset=['uid'])
        .drop_duplicates(subset=['uid'])
        .dropna(subset=['keywords'])
        .reset_index(drop=True)
    )
    paper_vec = paper_clean(data_to_cluster, Abbrname, NumCluster)

    # Keep only publications whose affiliation equals one of the keys
    # (publications from any other institute are removed).
    aff_frames = [paper_vec.loc[paper_vec['Aff'] == key] for key in affiliation_keys]
    paper_vec_tr = pd.concat(aff_frames) if aff_frames else paper_vec.iloc[0:0]
    paper_vec = paper_vec_tr

    # Publications after 2016-01-01 (plain string comparison).
    # NOTE(review): assumes publish_date is an ISO 'YYYY-MM-DD' string — confirm.
    paper_vec_new_tr = paper_vec.loc[paper_vec['publish_date']>'2016-01-01'].copy()

    # Remove ambiguous authors: display name contains "<Abbrname> " or is
    # exactly the abbreviated name. na=False keeps rows without a display
    # name from producing a NaN mask, which .loc rejects.
    paper_rm = paper_vec_new_tr.loc[
        paper_vec_new_tr['Disp_name'].str.contains(Abbrname+' ', na=False)]
    paper_vec_new_tr = paper_vec_new_tr.drop(index=paper_rm.index, errors='ignore')
    paper_rm = paper_vec_new_tr.loc[paper_vec_new_tr['Disp_name'] == Abbrname]
    paper_vec_new_tr = paper_vec_new_tr.drop(index=paper_rm.index, errors='ignore')

    # Remove publications whose display name contains a dismissed author's name.
    for cluster_auth in auth2cluster:
        paper_rm = paper_vec_new_tr.loc[
            paper_vec_new_tr['Disp_name'].str.contains(cluster_auth, na=False)]
        paper_vec_new_tr = paper_vec_new_tr.drop(index=paper_rm.index, errors='ignore')

    # y_new: recent publications of authors with more than 5 such records.
    x = (paper_vec_new_tr.groupby('Disp_name').count()[['Aff']]
         .rename(columns={'Aff': 'count'})
         .reset_index())
    y = paper_vec_new_tr.merge(x, on='Disp_name', how='outer')
    y_new = y.sort_values(by='count')
    y_new = y_new.loc[y_new['count']>5]

    # no_dis_pub: every affiliation-filtered publication (not only the recent
    # ones) of the selected authors.
    writer_frames = [paper_vec.loc[paper_vec['Disp_name'] == writer].copy()
                     for writer in y_new['Disp_name'].unique()]
    if writer_frames:
        no_dis_pub = pd.concat(writer_frames)
        selected_pubs.append(no_dis_pub)
    else:
        # No author qualified for this name. Reset explicitly so the previous
        # iteration's publications are not appended a second time.
        no_dis_pub = paper_vec.iloc[0:0].copy()

if selected_pubs:
    total_no_dis_pub = pd.concat(selected_pubs)
else:
    # Nothing was selected for any name.
    total_no_dis_pub = pd.DataFrame()

In [7]:
# Load the publication table from disk.
# NOTE(review): this rebinds `total_no_dis_pub`, discarding the frame built by
# the loop in the previous cell, and no visible cell writes this CSV —
# presumably it was saved in an earlier session; confirm that the file matches
# the current computation or add an explicit `to_csv` step.
total_no_dis_pub = pd.read_csv('./data/total_no_dis_pub_2.csv')