In [32]:
import re
import os 
import glob
import pandas as pd 

# Make the Documents folder the process working directory.
os.chdir('/home/timothyelder/Documents')

# Directory whose *.csv files are globbed and merged in the next cell.
path = r'/home/timothyelder/Documents/data/matches'

In [33]:
# Collect every CSV in the matches directory. glob returns paths in arbitrary,
# filesystem-dependent order, so sort them to keep the merged row order (and the
# fresh integer index assigned below) reproducible between runs.
all_files = sorted(glob.glob(path + "/*.csv"))

# pd.concat fails with an opaque "No objects to concatenate" on empty input;
# stop early with a message that names the directory instead.
if not all_files:
    raise FileNotFoundError("No CSV files found in " + path)

# Lazily read each file into its own DataFrame (a generator of frames, not a list of files).
df_from_each_file = (pd.read_csv(f, sep=',') for f in all_files)

# Stack the per-file frames into a single frame with a fresh 0..n-1 index.
df_merged = pd.concat(df_from_each_file, ignore_index=True)

# Saving merged dataframe. to_csv writes the index by default, so the saved file
# carries it as an extra, unnamed first column.
df_merged.to_csv("/home/timothyelder/Documents/data/fuzzy_matches.csv")

## Normalize Faculty Names 

We didn't remove the periods ('.') from the faculty names in the network data, and many exact matches appear once we do.

In [34]:
# Remove every literal period from network_name so the names can be compared
# directly with NormalizedName in the cells below.
# The vectorized string method replaces the per-row re.sub loop and leaves missing
# values as NaN, whereas re.sub raises TypeError on a non-string entry.
# regex=False makes '.' a literal period rather than the "any character" pattern.
df_merged['network_name'] = df_merged['network_name'].str.replace('.', '', regex=False)

# Keep the plain-list view of the normalized names that this cell has always exposed.
faculty_names = df_merged['network_name'].to_list()

### Filter and Concatenate to new Dataframe 

1. We will take the exact matches and assign them to the new dataframe, dropping them from the original. 
2. Then we will reorder the dataframe based on the best match score
3. Then decide some threshold to drop matches. 

*Note*: There are 8052 unique names in the network data, so the publication data must contain at least 8052 matching names or IDs for 100% coverage. There will likely be more, because multiple IDs can be assigned to the same name.

In [35]:
# Exact matches: rows whose normalized network name is identical to NormalizedName.
df = df_merged.loc[lambda frame: frame['network_name'].eq(frame['NormalizedName'])]
df

Unnamed: 0.1,Unnamed: 0,best_match_score,__id_left,__id_right,faculty_name,network_name,AuthorId,NormalizedName
65,2990,0.659706,74_left,1571423_right,"Blankenship, Kim M.",kim m blankenship,3.048999e+09,kim m blankenship
295,13260,0.712993,325_left,821791_right,"Pfohl, Stephen J.",stephen j pfohl,3.045411e+09,stephen j pfohl
302,13550,0.605695,332_left,310220_right,"Smith, David H.",david h smith,3.043000e+09,david h smith
308,13798,0.712993,338_left,821791_right,"Pfohl, Stephen J.",stephen j pfohl,3.045411e+09,stephen j pfohl
343,15578,0.712993,378_left,821791_right,"Pfohl, Stephen J.",stephen j pfohl,3.045411e+09,stephen j pfohl
...,...,...,...,...,...,...,...,...
927493,40596,0.696242,1108_left,529484_right,"Hoffmann, John P.",john p hoffmann,3.066553e+09,john p hoffmann
927534,42066,0.696242,1151_left,529484_right,"Hoffmann, John P.",john p hoffmann,3.066553e+09,john p hoffmann
927560,42944,0.696242,1178_left,529484_right,"Hoffmann, John P.",john p hoffmann,3.066553e+09,john p hoffmann
928920,95593,0.824004,2574_left,1468785_right,"Gans, Herbert J.",herbert j gans,3.067650e+09,herbert j gans


In [36]:
# Add every row scoring at least .4 to the exact matches already held in df.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent.
df = pd.concat([df, df_merged[df_merged['best_match_score'] >= .4]])

# Exact matches that also score >= .4 would otherwise be present twice (once from
# the exact-match selection, once from the threshold selection). Both selections
# come from df_merged, whose index labels are unique, so a repeated label means a
# repeated row; keep the first occurrence. This also makes the cell safe to re-run.
df = df[~df.index.duplicated(keep='first')]
df

Unnamed: 0.1,Unnamed: 0,best_match_score,__id_left,__id_right,faculty_name,network_name,AuthorId,NormalizedName
65,2990,0.659706,74_left,1571423_right,"Blankenship, Kim M.",kim m blankenship,3.048999e+09,kim m blankenship
295,13260,0.712993,325_left,821791_right,"Pfohl, Stephen J.",stephen j pfohl,3.045411e+09,stephen j pfohl
302,13550,0.605695,332_left,310220_right,"Smith, David H.",david h smith,3.043000e+09,david h smith
308,13798,0.712993,338_left,821791_right,"Pfohl, Stephen J.",stephen j pfohl,3.045411e+09,stephen j pfohl
343,15578,0.712993,378_left,821791_right,"Pfohl, Stephen J.",stephen j pfohl,3.045411e+09,stephen j pfohl
...,...,...,...,...,...,...,...,...
950889,910798,0.463892,25229_left,349390_right,"Gerber, Theodore P.",theodore p gerber,3.066343e+09,p gerber
950915,911705,0.463892,25255_left,349390_right,"Gerber, Theodore P.",theodore p gerber,3.066343e+09,p gerber
950926,912139,0.451456,25266_left,215925_right,"Schwartz, Christine R.",christine r schwartz,3.066187e+09,r schwartz
950938,912679,0.463892,25278_left,349390_right,"Gerber, Theodore P.",theodore p gerber,3.066343e+09,p gerber


In [37]:
# Number of distinct network names among the rows currently held in df.
len(set(df['network_name']))

5498

In [39]:
# Show the current contents of df (pandas truncates long frames in the rendered output).
df

Unnamed: 0.1,Unnamed: 0,best_match_score,__id_left,__id_right,faculty_name,network_name,AuthorId,NormalizedName
65,2990,0.659706,74_left,1571423_right,"Blankenship, Kim M.",kim m blankenship,3.048999e+09,kim m blankenship
295,13260,0.712993,325_left,821791_right,"Pfohl, Stephen J.",stephen j pfohl,3.045411e+09,stephen j pfohl
302,13550,0.605695,332_left,310220_right,"Smith, David H.",david h smith,3.043000e+09,david h smith
308,13798,0.712993,338_left,821791_right,"Pfohl, Stephen J.",stephen j pfohl,3.045411e+09,stephen j pfohl
343,15578,0.712993,378_left,821791_right,"Pfohl, Stephen J.",stephen j pfohl,3.045411e+09,stephen j pfohl
...,...,...,...,...,...,...,...,...
950889,910798,0.463892,25229_left,349390_right,"Gerber, Theodore P.",theodore p gerber,3.066343e+09,p gerber
950915,911705,0.463892,25255_left,349390_right,"Gerber, Theodore P.",theodore p gerber,3.066343e+09,p gerber
950926,912139,0.451456,25266_left,215925_right,"Schwartz, Christine R.",christine r schwartz,3.066187e+09,r schwartz
950938,912679,0.463892,25278_left,349390_right,"Gerber, Theodore P.",theodore p gerber,3.066343e+09,p gerber


In [39]:

# This cell previously read `df_filtered` before any cell in notebook order had
# defined it, so it raised NameError after Restart & Run All. Rebuild the starting
# frame from df_merged here.
# NOTE(review): the starting frame is assumed to be the non-exact matches with a
# non-missing NormalizedName, mirroring the first two steps of the later filtering
# cell — confirm this is what was intended.
df_filtered = df_merged[df_merged['network_name'] != df_merged['NormalizedName']]
df_filtered = df_filtered.dropna(subset=['NormalizedName'])

# Highest-scoring candidates first.
df_filtered = df_filtered.sort_values("best_match_score", ascending=False)

# Keep scores in the closed interval [.5, 1]; between() is inclusive on both ends.
df_filtered = df_filtered[df_filtered['best_match_score'].between(.5, 1)]
df_filtered

Unnamed: 0.1,Unnamed: 0,best_match_score,__id_left,__id_right,faculty_name,network_name,AuthorId,NormalizedName
193760,538928,0.999079,11916_left,1352093_right,"Wynveen, Brooklynn A.",brooklynn a wynveen,2.769996e+09,brooklynn j wynveen
574323,92581,0.997364,2196_left,1732908_right,"Cohen, Joseph Nathan",joseph nathan cohen,3.180070e+09,nathan joseph cohen
574303,91520,0.997364,2174_left,1732908_right,"Cohen, Joseph Nathan",joseph nathan cohen,3.180070e+09,nathan joseph cohen
574130,84171,0.997364,1977_left,1732908_right,"Cohen, Joseph Nathan",joseph nathan cohen,3.180070e+09,nathan joseph cohen
574267,89836,0.997364,2133_left,1732908_right,"Cohen, Joseph Nathan",joseph nathan cohen,3.180070e+09,nathan joseph cohen
...,...,...,...,...,...,...,...,...
502993,103089,0.500036,2452_left,1810972_right,"Robinson, Kenneth",kenneth robinson,2.570665e+09,kenneth j robinson
502959,101436,0.500036,2409_left,1810972_right,"Robinson, Kenneth",kenneth robinson,2.570665e+09,kenneth j robinson
502921,99550,0.500036,2365_left,1810972_right,"Robinson, Kenneth",kenneth robinson,2.570665e+09,kenneth j robinson
322343,132216,0.500017,3025_left,1674391_right,"Sipple, John",john sipple,2.418990e+09,john l sipple


In [None]:
# Best non-exact candidate per network name, restricted to scores in [.8, .9).
df_filtered = (
    df_merged
    # drop exact matches
    .loc[lambda frame: frame['network_name'].ne(frame['NormalizedName'])]
    # drop rows with no name on the right-hand side of the match
    .dropna(subset=['NormalizedName'])
    # highest scores first, so "first" below means "best"
    .sort_values("best_match_score", ascending=False)
    # one row per network name: the top-scoring one
    .drop_duplicates(subset='network_name', keep="first")
    # lower bound inclusive, upper bound exclusive
    .loc[lambda frame: frame['best_match_score'].ge(.8) & frame['best_match_score'].lt(.9)]
)
df_filtered

In [50]:
import dask.dataframe as dd
import csv

os.chdir('/home/timothyelder/Documents')

# Snapshot directory holding Journals.txt. Note that this rebinds `path`, which an
# earlier cell pointed at the matches directory.
path = '/project/jevans/MAG_0802_2021_snap_shot/'

# Journals.txt is read as tab-separated text with no header row, so columns are
# addressed by position until they are renamed below; column 5 is forced to object dtype.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` keeps the same behavior: malformed lines are skipped and
# a warning is emitted for each one.
# QUOTE_NONE treats quote characters as ordinary data.
journals_df = dd.read_csv(path + 'Journals.txt', sep="\t", header=None, dtype={5: 'object'},
                          on_bad_lines='warn', quoting=csv.QUOTE_NONE,
                          encoding='utf-8')

# Names assigned, in order, to the positional columns of the file.
new_columns = ['JournalId', 'Rank', 'NormalizedName',
               'DisplayName', 'Issn', 'Publisher',
               'Webpage', 'PaperCount', 'PaperFamilyCount',
               'CitationCount', 'CreatedDate']

journals_df = journals_df.rename(columns=dict(zip(journals_df.columns, new_columns)))

# Drop everything except JournalId and NormalizedName.
journals_df = journals_df.drop(columns=['Rank', 'DisplayName', 'Issn', 'Publisher',
                                        'Webpage', 'PaperCount', 'PaperFamilyCount',
                                        'CitationCount', 'CreatedDate'])

journals_df.head()




  **kwargs,


  path_info,


Unnamed: 0,JournalId,NormalizedName
0,465895,eureka
1,1137746,the artist and journal of home culture
2,2978343,cumberland law review
3,3010151,comparative medicine east and west
4,3164724,physiological measurement


In [1]:
import pandas as pd 
# Load filtered_papers.csv into a DataFrame.
# NOTE(review): the output recorded under this cell looks like the tail of a pandas
# DtypeWarning (mixed-type columns) — if so, consider passing explicit dtypes. Confirm.
papers_df = pd.read_csv('/home/timothyelder/Documents/data/disambig/filtered_papers.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [9]:
# Keep only the papers whose DocType is exactly 'Journal'.
df = papers_df.loc[lambda frame: frame['DocType'].eq('Journal')]

Unnamed: 0.1,Unnamed: 0,best_match_score,__id_left,__id_right,faculty_name,network_name,AuthorId,NormalizedName
65,2990,0.659706,74_left,1571423_right,"Blankenship, Kim M.",kim m blankenship,3.048999e+09,kim m blankenship
295,13260,0.712993,325_left,821791_right,"Pfohl, Stephen J.",stephen j pfohl,3.045411e+09,stephen j pfohl
302,13550,0.605695,332_left,310220_right,"Smith, David H.",david h smith,3.043000e+09,david h smith
308,13798,0.712993,338_left,821791_right,"Pfohl, Stephen J.",stephen j pfohl,3.045411e+09,stephen j pfohl
343,15578,0.712993,378_left,821791_right,"Pfohl, Stephen J.",stephen j pfohl,3.045411e+09,stephen j pfohl
...,...,...,...,...,...,...,...,...
950889,910798,0.463892,25229_left,349390_right,"Gerber, Theodore P.",theodore p gerber,3.066343e+09,p gerber
950915,911705,0.463892,25255_left,349390_right,"Gerber, Theodore P.",theodore p gerber,3.066343e+09,p gerber
950926,912139,0.451456,25266_left,215925_right,"Schwartz, Christine R.",christine r schwartz,3.066187e+09,r schwartz
950938,912679,0.463892,25278_left,349390_right,"Gerber, Theodore P.",theodore p gerber,3.066343e+09,p gerber


In [11]:
# Remove the listed paper-metadata columns; every column not named here is kept.
df = df.drop(
    [
        'Unnamed: 0', 'Rank', 'Doi', 'DocType', 'PaperTitle',
        'OriginalTitle', 'BookTitle', 'Year', 'Date', 'OnlineDate',
        'Publisher', 'ConferenceSeriesId', 'ConferenceInstanceId',
        'Volume', 'Issue', 'FirstPage', 'LastPage',
        'ReferenceCount', 'CitationCount', 'EstimatedCitation',
        'OriginalVenue', 'FamilyId', 'FamilyRank',
        'DocSubTypes', 'CreatedDate',
    ],
    axis='columns',
)

In [12]:
# Write the trimmed journal-paper frame to edge_list.csv. to_csv writes the index by
# default, so the file carries it as an extra, unnamed first column.
df.to_csv('/home/timothyelder/Documents/data/disambig/edge_list.csv')