# PCAD Notebook 2

This notebook processes the clustered and cleaned print and electronic data produced by PCAD notebook 1, adds p/e tracking data, and melts group IDs for OCLC numbers and ISSNs. It produces a single merged dataframe with an ISSN matchpoint for further operations.

Required files/inputs:
- `cleaned_print_{date}.pkl` file produced by PCAD Notebook 1 from print data
- `cleaned_e_{date}.pkl` file produced by PCAD Notebook 1 from electronic data

Outputs:
- `p_and_e_{date}.pkl` file of merged and enhanced data (this is the main dataframe)

In [8]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import date
today = str(date.today()).replace('-','')

In [9]:
#change filenames
pdf = pd.read_pickle('cleaned_print_20201106.pkl')
edf = pd.read_pickle('cleaned_e_20201106.pkl')
print(pdf.shape)
print(edf.shape)

(103137, 10)
(25961, 10)


#### Add a column called "p_or_e" where we can track if the record came from the physical serials batch or the matched electronic records we pulled from Alma sets

In [10]:
pdf['p_or_e'] = 'p'
pdf

Unnamed: 0,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e
0,9920341180001701,Tutkimuksia Suomen maatalouden kannattavuudesta =,[9104278],[0438-9808],[],[],[],[],[9104278],[0438-9808],p
0,9920358800001701,Pakistan,[25380502],[1061-6101],[],[],[],[],[25380502],[1061-6101],p
0,9920370170001701,Abhandlungen der Königlich Preussischen Akade...,[10333878],[0233-2728],[],[],[],[],[10333878],[0233-2728],p
0,9920380570001701,Zhongguo dian ying nian jian /,[9179105],[],[],[],[],[],[9179105],[],p
0,9920446280001701,Sports 'n spokes,[1114948794],[0161-6706],[],[],[],[],[1114948794],[0161-6706],p
...,...,...,...,...,...,...,...,...,...,...,...
0,9959871440001701,"Outlook (New York, N.Y. : 1893)",[5361126],[],[],[],[],[],[5361126],[],p
0,9959879870001701,Profile of health plans and utilization review...,[38314157],[],[],[],[],[],[38314157],[],p
0,9959887230001701,"Economic survey, Finland",[38364450],[1239-209X],[],"[1455-7606, 0532-9280]",[],[],[38364450],"[0532-9280, 1455-7606, 1239-209X]",p
0,9959892490001701,Aakrosh :,[41503026],[0971-7862],[],[],[],[],[41503026],[0971-7862],p


In [11]:
edf['p_or_e'] = 'e'
edf

Unnamed: 0,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e
0,9912490700001701,Whittington and his cat.,[642758868],[],[],[],[],[],[642758868],[],e
0,9914024810001701,Thank you for meeting with me to prepare your ...,[56313931],[],[],[],[4339.],[T 22.44/2:4339],[56313931],[],e
0,9915256360001701,Battleground Chicago :,[57560036],[],"[905749810, 55600826, 991936001, 149214895, 10...",[],[],[],"[1020529306, 149214895, 57560036, 55600826, 99...",[],e
0,9915894870001701,Strengthening forest law enforcement and gover...,[156899011],[],[],[],[],[],[156899011],[],e
0,9916471080001701,"An astronomical diary, or an almanack, for the...",[62820167],[],[],[],[no. 6930.],[],[62820167],[],e
...,...,...,...,...,...,...,...,...,...,...,...
0,9977136268101701,F©œldtani k©œzl©œny,[1569510],[0015-542X],[],[],[],[],[1569510],[0015-542X],e
0,9977149036101701,Corporate report Wisconsin,[12795345],[],[],[0890-4278],[],[],[12795345],[0890-4278],e
0,9977149197201701,Surveying and land information systems,[21396434],[1052-2905],[],[],[],[],[21396434],[1052-2905],e
0,9977149202001701,Engineering,[1567895],[0013-7782],[],[],[],[],[1567895],[0013-7782],e


#### Combine p df and e df into one df

In [12]:
#Next two lines to help verify that dataframes have the same columns, if necessary
edf.columns
pdf.columns
epdf = pd.concat([pdf,edf],ignore_index=True)
epdf

Unnamed: 0,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e
0,9920341180001701,Tutkimuksia Suomen maatalouden kannattavuudesta =,[9104278],[0438-9808],[],[],[],[],[9104278],[0438-9808],p
1,9920358800001701,Pakistan,[25380502],[1061-6101],[],[],[],[],[25380502],[1061-6101],p
2,9920370170001701,Abhandlungen der Königlich Preussischen Akade...,[10333878],[0233-2728],[],[],[],[],[10333878],[0233-2728],p
3,9920380570001701,Zhongguo dian ying nian jian /,[9179105],[],[],[],[],[],[9179105],[],p
4,9920446280001701,Sports 'n spokes,[1114948794],[0161-6706],[],[],[],[],[1114948794],[0161-6706],p
...,...,...,...,...,...,...,...,...,...,...,...
129093,9977136268101701,F©œldtani k©œzl©œny,[1569510],[0015-542X],[],[],[],[],[1569510],[0015-542X],e
129094,9977149036101701,Corporate report Wisconsin,[12795345],[],[],[0890-4278],[],[],[12795345],[0890-4278],e
129095,9977149197201701,Surveying and land information systems,[21396434],[1052-2905],[],[],[],[],[21396434],[1052-2905],e
129096,9977149202001701,Engineering,[1567895],[0013-7782],[],[],[],[],[1567895],[0013-7782],e


#### Define tidy data function (melt_group_ids()) for issns and ocns

In [13]:
def melt_group_ids ( df, identifier_column, identifier_name, group_name ):
    
    # This function takes a column of multi-valued cells in a dataframe, explodes them out into single-valued cells still 
    # linked to their row ID.  This data is then merged with a database-style join with the input dataframe

    melted_ids = pd.concat([pd.DataFrame(v, index=np.repeat(k,len(v)))
                              for k,v in df[identifier_column].to_dict().items()])
    melted_ids = melted_ids.rename(columns={0:identifier_name})
    print('melted')
    print(melted_ids.columns)

    df2 = pd.DataFrame()
    df2 = pd.merge(df,melted_ids,how="inner",left_index=True,right_index=True)
    df2 = df2.reset_index()
    df2 = df2.rename(columns={"index":"record_index"})
    print('merged')
    print(df2.columns)

    df3 = pd.DataFrame()
    df3 = df2.groupby([identifier_name]).ngroup()
    print('grouped')

    groups = df3.to_frame()
    groups.rename(columns={0: group_name + '_group_id' },inplace=True)
    print(groups.columns)
    
    eg = pd.merge(df2[['record_index']],groups,left_index=True,right_index=True, how="inner")
    
    print(eg.columns)
    
    df6 = pd.DataFrame()
    df6 = eg.groupby(['record_index']).agg(lambda x: list(set(x))).reset_index()
    print('re-grouped')
    print(df6.columns)
    
    while df6[group_name + '_group_id'].str.len().max() > 1:
        for each in tqdm(df6[group_name + '_group_id'],desc="reducing progress", unit="records"):
            if len(each) > 1:
                #print(each)
                val = each[0]
                for x in each[1:]:
                    #print(x)
                    eg[group_name + '_group_id'][eg[group_name + '_group_id'] == x] = val
                    eg.drop_duplicates(inplace=True)
        df6 = pd.DataFrame()
        df6 = eg.groupby(['record_index']).agg(lambda x: list(set(x))).reset_index()
    eg = eg.sort_values([group_name + '_group_id'])
    
    print('merging back into df')
    eg1 = pd.merge(df,eg,how='left',left_index=True,right_on='record_index')
    eg1.set_index('record_index',inplace=True)
    
    
    matches = eg1[eg1[group_name + '_group_id'].notnull()]
    matches[group_name + '_group_id'] = matches[group_name + '_group_id'].astype(int)
    group_ids = sorted(list(set(matches[group_name + '_group_id'])))
    group_ids = list(map(int, group_ids))
    group_list = list(range(0,len(group_ids)))
    group_list = list(map(int, group_list))
    group_dict = dict(zip(group_ids,group_list))
    
    matches[group_name + '_group_id'].replace(group_dict, inplace=True)
    
    no_matches = eg1[eg1[group_name + '_group_id'].isnull()]
    no_matches[group_name + '_group_id'] = list(range(len(group_ids),len(group_ids)+len(no_matches)))
    output = pd.concat([matches,no_matches])
    output.sort_values(group_name + '_group_id',inplace=True)
    
    return output

#### Run melt_group_ids() function for ISSNs

In [14]:
df_issns = melt_group_ids ( epdf, 'ISSN_cluster', 'ISSN_y', 'ISSN' )
df_issns

melted
Index(['ISSN_y'], dtype='object')
merged
Index(['record_index', 'MMS_ID', 'Title', 'OCN', 'ISSN', 'Related_OCNs',
       'Related_ISSNs', 'Vol_nos', 'Gov_doc_nos', 'OCN_cluster',
       'ISSN_cluster', 'p_or_e', 'ISSN_y'],
      dtype='object')
grouped
Index(['ISSN_group_id'], dtype='object')
Index(['record_index', 'ISSN_group_id'], dtype='object')
re-grouped
Index(['record_index', 'ISSN_group_id'], dtype='object')


reducing progress: 100%|████████████████████████████████████████████████| 72826/72826 [03:11<00:00, 379.66records/s]
reducing progress: 100%|███████████████████████████████████████████████| 72826/72826 [00:11<00:00, 6289.67records/s]
reducing progress: 100%|██████████████████████████████████████████████| 72826/72826 [00:00<00:00, 80566.93records/s]
reducing progress: 100%|█████████████████████████████████████████████| 72826/72826 [00:00<00:00, 282274.04records/s]
reducing progress: 100%|█████████████████████████████████████████████| 72826/72826 [00:00<00:00, 570852.08records/s]
reducing progress: 100%|█████████████████████████████████████████████| 72826/72826 [00:00<00:00, 983601.14records/s]


merging back into df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e,ISSN_group_id
record_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
87349,9935224330001701,Publishers' world.,[988619],[0555-6384],"[2489456, 567791231, 1695359]",[0000-0019],[],[],"[567791231, 1695359, 988619, 2489456]","[0000-0019, 0555-6384]",p,0
60626,9934112930001701,Publishers weekly,[2489456],[0000-0019],"[37309426, 9604938]","[0000-0019, 000--0019, 0000-0469, 2150-4008]",[],[],"[37309426, 9604938, 2489456]","[2150-4008, 0000-0469, 0000-0019]",p,0
52478,9937257820001701,The Book publishing annual,[1114932096],[0000-0787],[],[0000-0019],[],[],[1114932096],"[0000-0787, 0000-0019]",p,0
105258,9967008940001701,Publishers weekly,[37309426],[2150-4008],[],[0000-0019],[],[],[37309426],"[2150-4008, 0000-0019]",e,0
58136,9913446020001701,Publishers weekly yearbook,[9604938],[0000-0469],[],[0000-0019],[],[],[9604938],"[0000-0469, 0000-0019]",p,0
...,...,...,...,...,...,...,...,...,...,...,...,...
129083,9977106226701701,Research programs of the U.S. Bureau of Mines,[58596669],[],[],[],[],[],[58596669],[],e,103450
129084,9977106288001701,Towards a sustainable recovery,[785779466],[],[],[],[],[],[785779466],[],e,103451
129085,9977106545201701,Unhooking from Whiteness :,[865106949],[],[],[],[6.],[],[865106949],[],e,103452
129086,9977118904301701,Geoenvironmental engineering,[299108498],[],[],[],[],[],[299108498],[],e,103453


In [15]:
df_issns.sort_index()

Unnamed: 0_level_0,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e,ISSN_group_id
record_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,9920341180001701,Tutkimuksia Suomen maatalouden kannattavuudesta =,[9104278],[0438-9808],[],[],[],[],[9104278],[0438-9808],p,24110
1,9920358800001701,Pakistan,[25380502],[1061-6101],[],[],[],[],[25380502],[1061-6101],p,36045
2,9920370170001701,Abhandlungen der Königlich Preussischen Akade...,[10333878],[0233-2728],[],[],[],[],[10333878],[0233-2728],p,17151
3,9920380570001701,Zhongguo dian ying nian jian /,[9179105],[],[],[],[],[],[9179105],[],p,47183
4,9920446280001701,Sports 'n spokes,[1114948794],[0161-6706],[],[],[],[],[1114948794],[0161-6706],p,14948
...,...,...,...,...,...,...,...,...,...,...,...,...
129093,9977136268101701,F©œldtani k©œzl©œny,[1569510],[0015-542X],[],[],[],[],[1569510],[0015-542X],e,2563
129094,9977149036101701,Corporate report Wisconsin,[12795345],[],[],[0890-4278],[],[],[12795345],[0890-4278],e,31641
129095,9977149197201701,Surveying and land information systems,[21396434],[1052-2905],[],[],[],[],[21396434],[1052-2905],e,35481
129096,9977149202001701,Engineering,[1567895],[0013-7782],[],[],[],[],[1567895],[0013-7782],e,2261


In [16]:
#Change filenames
df_issns.to_pickle(f'epdf_issns_grouped_{today}.pkl')
df_issns.to_csv(f'epdf_issns_grouped_{today}.txt',sep='\t')

#### Run melt_group_ids() function for OCNs

In [17]:
df_ocns = melt_group_ids ( df_issns, 'OCN_cluster', 'OCN_y', 'OCN' )
df_ocns

melted
Index(['OCN_y'], dtype='object')
merged
Index(['record_index', 'MMS_ID', 'Title', 'OCN', 'ISSN', 'Related_OCNs',
       'Related_ISSNs', 'Vol_nos', 'Gov_doc_nos', 'OCN_cluster',
       'ISSN_cluster', 'p_or_e', 'ISSN_group_id', 'OCN_y'],
      dtype='object')
grouped
Index(['OCN_group_id'], dtype='object')
Index(['record_index', 'OCN_group_id'], dtype='object')
re-grouped
Index(['record_index', 'OCN_group_id'], dtype='object')


reducing progress: 100%|██████████████████████████████████████████████| 127237/127237 [09:18<00:00, 227.74records/s]
reducing progress: 100%|████████████████████████████████████████████| 127237/127237 [00:10<00:00, 12514.22records/s]
reducing progress: 100%|████████████████████████████████████████████| 127237/127237 [00:01<00:00, 65912.46records/s]
reducing progress: 100%|███████████████████████████████████████████| 127237/127237 [00:01<00:00, 112396.75records/s]
reducing progress: 100%|███████████████████████████████████████████| 127237/127237 [00:00<00:00, 151461.29records/s]
reducing progress: 100%|███████████████████████████████████████████| 127237/127237 [00:00<00:00, 183537.17records/s]
reducing progress: 100%|███████████████████████████████████████████| 127237/127237 [00:00<00:00, 241050.41records/s]
reducing progress: 100%|███████████████████████████████████████████| 127237/127237 [00:00<00:00, 301288.41records/s]
reducing progress: 100%|████████████████████████████████████████

merging back into df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e,ISSN_group_id,OCN_group_id
record_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
105867,9977176693001701,Paraphrasis in omnes epistolas Apostolicas.,[944177912],[],[(OCoLC)1002244040],[],[],[],"[944177912, (OCoLC)1002244040]",[],e,97748,0
108037,9977166332001701,The Grammar of Attic Inscriptions.,[979747922],[],[(OCoLC)1011439387],[],[],[],"[979747922, (OCoLC)1011439387]",[],e,98302,1
119392,9977176707601701,When Television was Young :,[944177246],[],[(OCoLC)1013946339],[],[],[],"[(OCoLC)1013946339, 944177246]",[],e,101278,2
127588,9975720543401701,Moskaus spuren in Ostdeutschland 1945 bis 1949 :,[920780121],[],[(OCoLC)1013950907],[],[Band 22.],[],"[920780121, (OCoLC)1013950907]",[],e,103225,3
120765,9976895243601701,Scientists at War :,[905969803],[],[(OCoLC)1013954240],[],[],[],"[905969803, (OCoLC)1013954240]",[],e,101705,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
117940,9975297262301701,SEC today (CCH).,[],[],[],[0745-2667 (print)],[],[],[],[],e,100923,118674
118942,9975297322401701,Search & seizure bulletin.,[],[],[],[0037-0193 (print)],[],[],[],[],e,101048,118675
120184,9975297304901701,Federal postconviction remedies handbook.,[],[],[],[1932-2151 (print)],[],[],[],[],e,101519,118676
122673,9977114779201701,Annales de la Société entomologique de France ...,[],[],[],[],[],[],[],[],e,102205,118677


In [18]:
df = df_ocns

#### Normalize the two group ids into one group id

In [19]:
df['ISSN_group_id']= df['ISSN_group_id'].apply(lambda x: [str(int(x)) + 'i'])
df

Unnamed: 0_level_0,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e,ISSN_group_id,OCN_group_id
record_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
105867,9977176693001701,Paraphrasis in omnes epistolas Apostolicas.,[944177912],[],[(OCoLC)1002244040],[],[],[],"[944177912, (OCoLC)1002244040]",[],e,[97748i],0
108037,9977166332001701,The Grammar of Attic Inscriptions.,[979747922],[],[(OCoLC)1011439387],[],[],[],"[979747922, (OCoLC)1011439387]",[],e,[98302i],1
119392,9977176707601701,When Television was Young :,[944177246],[],[(OCoLC)1013946339],[],[],[],"[(OCoLC)1013946339, 944177246]",[],e,[101278i],2
127588,9975720543401701,Moskaus spuren in Ostdeutschland 1945 bis 1949 :,[920780121],[],[(OCoLC)1013950907],[],[Band 22.],[],"[920780121, (OCoLC)1013950907]",[],e,[103225i],3
120765,9976895243601701,Scientists at War :,[905969803],[],[(OCoLC)1013954240],[],[],[],"[905969803, (OCoLC)1013954240]",[],e,[101705i],4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
117940,9975297262301701,SEC today (CCH).,[],[],[],[0745-2667 (print)],[],[],[],[],e,[100923i],118674
118942,9975297322401701,Search & seizure bulletin.,[],[],[],[0037-0193 (print)],[],[],[],[],e,[101048i],118675
120184,9975297304901701,Federal postconviction remedies handbook.,[],[],[],[1932-2151 (print)],[],[],[],[],e,[101519i],118676
122673,9977114779201701,Annales de la Société entomologique de France ...,[],[],[],[],[],[],[],[],e,[102205i],118677


In [20]:
df['OCN_group_id']= df['OCN_group_id'].apply(lambda x: [str(int(x)) + 'o'])
df

Unnamed: 0_level_0,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e,ISSN_group_id,OCN_group_id
record_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
105867,9977176693001701,Paraphrasis in omnes epistolas Apostolicas.,[944177912],[],[(OCoLC)1002244040],[],[],[],"[944177912, (OCoLC)1002244040]",[],e,[97748i],[0o]
108037,9977166332001701,The Grammar of Attic Inscriptions.,[979747922],[],[(OCoLC)1011439387],[],[],[],"[979747922, (OCoLC)1011439387]",[],e,[98302i],[1o]
119392,9977176707601701,When Television was Young :,[944177246],[],[(OCoLC)1013946339],[],[],[],"[(OCoLC)1013946339, 944177246]",[],e,[101278i],[2o]
127588,9975720543401701,Moskaus spuren in Ostdeutschland 1945 bis 1949 :,[920780121],[],[(OCoLC)1013950907],[],[Band 22.],[],"[920780121, (OCoLC)1013950907]",[],e,[103225i],[3o]
120765,9976895243601701,Scientists at War :,[905969803],[],[(OCoLC)1013954240],[],[],[],"[905969803, (OCoLC)1013954240]",[],e,[101705i],[4o]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
117940,9975297262301701,SEC today (CCH).,[],[],[],[0745-2667 (print)],[],[],[],[],e,[100923i],[118674o]
118942,9975297322401701,Search & seizure bulletin.,[],[],[],[0037-0193 (print)],[],[],[],[],e,[101048i],[118675o]
120184,9975297304901701,Federal postconviction remedies handbook.,[],[],[],[1932-2151 (print)],[],[],[],[],e,[101519i],[118676o]
122673,9977114779201701,Annales de la Société entomologique de France ...,[],[],[],[],[],[],[],[],e,[102205i],[118677o]


In [21]:
df['both_groups'] = df['ISSN_group_id'] + df['OCN_group_id']
df

Unnamed: 0_level_0,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e,ISSN_group_id,OCN_group_id,both_groups
record_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
105867,9977176693001701,Paraphrasis in omnes epistolas Apostolicas.,[944177912],[],[(OCoLC)1002244040],[],[],[],"[944177912, (OCoLC)1002244040]",[],e,[97748i],[0o],"[97748i, 0o]"
108037,9977166332001701,The Grammar of Attic Inscriptions.,[979747922],[],[(OCoLC)1011439387],[],[],[],"[979747922, (OCoLC)1011439387]",[],e,[98302i],[1o],"[98302i, 1o]"
119392,9977176707601701,When Television was Young :,[944177246],[],[(OCoLC)1013946339],[],[],[],"[(OCoLC)1013946339, 944177246]",[],e,[101278i],[2o],"[101278i, 2o]"
127588,9975720543401701,Moskaus spuren in Ostdeutschland 1945 bis 1949 :,[920780121],[],[(OCoLC)1013950907],[],[Band 22.],[],"[920780121, (OCoLC)1013950907]",[],e,[103225i],[3o],"[103225i, 3o]"
120765,9976895243601701,Scientists at War :,[905969803],[],[(OCoLC)1013954240],[],[],[],"[905969803, (OCoLC)1013954240]",[],e,[101705i],[4o],"[101705i, 4o]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117940,9975297262301701,SEC today (CCH).,[],[],[],[0745-2667 (print)],[],[],[],[],e,[100923i],[118674o],"[100923i, 118674o]"
118942,9975297322401701,Search & seizure bulletin.,[],[],[],[0037-0193 (print)],[],[],[],[],e,[101048i],[118675o],"[101048i, 118675o]"
120184,9975297304901701,Federal postconviction remedies handbook.,[],[],[],[1932-2151 (print)],[],[],[],[],e,[101519i],[118676o],"[101519i, 118676o]"
122673,9977114779201701,Annales de la Société entomologique de France ...,[],[],[],[],[],[],[],[],e,[102205i],[118677o],"[102205i, 118677o]"


In [22]:
def both_group_ids ( df, identifier_column, identifier_name, group_name ):

    melted_ids = pd.concat([pd.DataFrame(v, index=np.repeat(k,len(v)))
                              for k,v in df[identifier_column].to_dict().items()])
    melted_ids = melted_ids.rename(columns={0:identifier_name})
    print('melted')
    print(melted_ids.columns)

    df2 = pd.DataFrame()
    df2 = pd.merge(df,melted_ids,how="inner",left_index=True,right_index=True)
    df2 = df2.reset_index()
    df2 = df2.rename(columns={"index":"record_index"})
    print('merged')
    print(df2.columns)

    df3 = pd.DataFrame()
    df3 = df2.groupby([identifier_name]).ngroup()
    print('grouped')

    groups = df3.to_frame()
    groups.rename(columns={0: group_name + '_group_id' },inplace=True)
    print(groups.columns)
    
    eg = pd.merge(df2[['record_index']],groups,left_index=True,right_index=True, how="inner")
    
    print(eg.columns)
    
    df6 = pd.DataFrame()
    df6 = eg.groupby(['record_index']).agg(lambda x: list(set(x))).reset_index()
    print('re-grouped')
    print(df6.columns)
    
    while df6[group_name + '_group_id'].str.len().max() > 1:
        for each in tqdm(df6[group_name + '_group_id'],desc="reducing progress", unit="records"):
            if len(each) > 1:
                #print(each)
                val = each[0]
                for x in each[1:]:
                    #print(x)
                    eg[group_name + '_group_id'][eg[group_name + '_group_id'] == x] = val
                    eg.drop_duplicates(inplace=True)
        df6 = pd.DataFrame()
        df6 = eg.groupby(['record_index']).agg(lambda x: list(set(x))).reset_index()
    eg = eg.sort_values([group_name + '_group_id'])
    
    print('merging back into df')
    eg1 = pd.merge(df,eg,how='left',left_index=True,right_on='record_index')
    eg1.set_index('record_index',inplace=True)
    
    return eg1

In [23]:
pe_df = both_group_ids ( df, 'both_groups', 'matches', 'matches' )
pe_df

melted
Index(['matches'], dtype='object')
merged
Index(['record_index', 'MMS_ID', 'Title', 'OCN', 'ISSN', 'Related_OCNs',
       'Related_ISSNs', 'Vol_nos', 'Gov_doc_nos', 'OCN_cluster',
       'ISSN_cluster', 'p_or_e', 'ISSN_group_id', 'OCN_group_id',
       'both_groups', 'matches'],
      dtype='object')
grouped
Index(['matches_group_id'], dtype='object')
Index(['record_index', 'matches_group_id'], dtype='object')
re-grouped
Index(['record_index', 'matches_group_id'], dtype='object')


reducing progress: 100%|█████████████████████████████████████████████| 129098/129098 [1:06:41<00:00, 32.26records/s]
reducing progress: 100%|██████████████████████████████████████████████| 129098/129098 [03:18<00:00, 648.98records/s]
reducing progress: 100%|█████████████████████████████████████████████| 129098/129098 [00:27<00:00, 4674.03records/s]
reducing progress: 100%|████████████████████████████████████████████| 129098/129098 [00:11<00:00, 11152.84records/s]
reducing progress: 100%|████████████████████████████████████████████| 129098/129098 [00:09<00:00, 13422.88records/s]
reducing progress: 100%|████████████████████████████████████████████| 129098/129098 [00:09<00:00, 13738.05records/s]
reducing progress: 100%|████████████████████████████████████████████| 129098/129098 [00:09<00:00, 13659.58records/s]
reducing progress: 100%|████████████████████████████████████████████| 129098/129098 [00:09<00:00, 14209.30records/s]
reducing progress: 100%|████████████████████████████████████████

reducing progress: 100%|███████████████████████████████████████████| 129098/129098 [00:00<00:00, 318854.37records/s]
reducing progress: 100%|███████████████████████████████████████████| 129098/129098 [00:00<00:00, 449755.02records/s]
reducing progress: 100%|███████████████████████████████████████████| 129098/129098 [00:00<00:00, 504293.68records/s]
reducing progress: 100%|███████████████████████████████████████████| 129098/129098 [00:00<00:00, 635676.53records/s]


merging back into df


Unnamed: 0_level_0,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e,ISSN_group_id,OCN_group_id,both_groups,matches_group_id
record_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
105867,9977176693001701,Paraphrasis in omnes epistolas Apostolicas.,[944177912],[],[(OCoLC)1002244040],[],[],[],"[944177912, (OCoLC)1002244040]",[],e,[97748i],[0o],"[97748i, 0o]",1
108037,9977166332001701,The Grammar of Attic Inscriptions.,[979747922],[],[(OCoLC)1011439387],[],[],[],"[979747922, (OCoLC)1011439387]",[],e,[98302i],[1o],"[98302i, 1o]",44357
119392,9977176707601701,When Television was Young :,[944177246],[],[(OCoLC)1013946339],[],[],[],"[(OCoLC)1013946339, 944177246]",[],e,[101278i],[2o],"[101278i, 2o]",66579
127588,9975720543401701,Moskaus spuren in Ostdeutschland 1945 bis 1949 :,[920780121],[],[(OCoLC)1013950907],[],[Band 22.],[],"[920780121, (OCoLC)1013950907]",[],e,[103225i],[3o],"[103225i, 3o]",88801
120765,9976895243601701,Scientists at War :,[905969803],[],[(OCoLC)1013954240],[],[],[],"[905969803, (OCoLC)1013954240]",[],e,[101705i],[4o],"[101705i, 4o]",3788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117940,9975297262301701,SEC today (CCH).,[],[],[],[0745-2667 (print)],[],[],[],[],e,[100923i],[118674o],"[100923i, 118674o]",2050
118942,9975297322401701,Search & seizure bulletin.,[],[],[],[0037-0193 (print)],[],[],[],[],e,[101048i],[118675o],"[101048i, 118675o]",2328
120184,9975297304901701,Federal postconviction remedies handbook.,[],[],[],[1932-2151 (print)],[],[],[],[],e,[101519i],[118676o],"[101519i, 118676o]",26277
122673,9977114779201701,Annales de la Société entomologique de France ...,[],[],[],[],[],[],[],[],e,[102205i],[118677o],"[102205i, 118677o]",4900


In [24]:
pe_df.to_pickle(f'pe_df_with_groups_{today}.pkl')

In [25]:
pe_df.sort_values('matches_group_id')

Unnamed: 0_level_0,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e,ISSN_group_id,OCN_group_id,both_groups,matches_group_id
record_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
87349,9935224330001701,Publishers' world.,[988619],[0555-6384],"[2489456, 567791231, 1695359]",[0000-0019],[],[],"[567791231, 1695359, 988619, 2489456]","[0000-0019, 0555-6384]",p,[0i],[35959o],"[0i, 35959o]",0
105258,9967008940001701,Publishers weekly,[37309426],[2150-4008],[],[0000-0019],[],[],[37309426],"[2150-4008, 0000-0019]",e,[0i],[35959o],"[0i, 35959o]",0
58136,9913446020001701,Publishers weekly yearbook,[9604938],[0000-0469],[],[0000-0019],[],[],[9604938],"[0000-0469, 0000-0019]",p,[0i],[35959o],"[0i, 35959o]",0
60626,9934112930001701,Publishers weekly,[2489456],[0000-0019],"[37309426, 9604938]","[0000-0019, 000--0019, 0000-0469, 2150-4008]",[],[],"[37309426, 9604938, 2489456]","[2150-4008, 0000-0469, 0000-0019]",p,[0i],[35959o],"[0i, 35959o]",0
52478,9937257820001701,The Book publishing annual,[1114932096],[0000-0787],[],[0000-0019],[],[],[1114932096],"[0000-0787, 0000-0019]",p,[0i],[9122o],"[0i, 9122o]",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30531,9947018110001701,Bank of London & South America review,[317183571],[0005-5298],[],[],[],[],[317183571],[0005-5298],p,[999i],[63489o],"[999i, 63489o]",222128
32742,9927107280001701,Technical report (Washington State Institute o...,[1013255009],[0511-2699],[],[],[],[],[1013255009],[0511-2699],p,[25077i],[999o],"[25077i, 999o]",222129
33650,9914101670001701,Abstracts of health care management studies,[1114928547],[0194-4908],[],[0001-3595],[],[],[1114928547],"[0001-3595, 0194-4908]",p,[99i],[7966o],"[99i, 7966o]",222130
22003,9927245370001701,Abstracts of hospital management studies,[1460587],[0001-3595],[],[],[],[],[1460587],[0001-3595],p,[99i],[25055o],"[99i, 25055o]",222130


In [26]:
group_ids = sorted(list(set(pe_df['matches_group_id'])))
group_list = list(range(0,len(group_ids)))
group_dict = dict(zip(group_ids,group_list))
len(group_dict)

101718

In [27]:
pe_df['matches_group_id'].replace(group_dict, inplace=True)
pe_df

Unnamed: 0_level_0,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e,ISSN_group_id,OCN_group_id,both_groups,matches_group_id
record_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
105867,9977176693001701,Paraphrasis in omnes epistolas Apostolicas.,[944177912],[],[(OCoLC)1002244040],[],[],[],"[944177912, (OCoLC)1002244040]",[],e,[97748i],[0o],"[97748i, 0o]",1
108037,9977166332001701,The Grammar of Attic Inscriptions.,[979747922],[],[(OCoLC)1011439387],[],[],[],"[979747922, (OCoLC)1011439387]",[],e,[98302i],[1o],"[98302i, 1o]",19245
119392,9977176707601701,When Television was Young :,[944177246],[],[(OCoLC)1013946339],[],[],[],"[(OCoLC)1013946339, 944177246]",[],e,[101278i],[2o],"[101278i, 2o]",29440
127588,9975720543401701,Moskaus spuren in Ostdeutschland 1945 bis 1949 :,[920780121],[],[(OCoLC)1013950907],[],[Band 22.],[],"[920780121, (OCoLC)1013950907]",[],e,[103225i],[3o],"[103225i, 3o]",39595
120765,9976895243601701,Scientists at War :,[905969803],[],[(OCoLC)1013954240],[],[],[],"[905969803, (OCoLC)1013954240]",[],e,[101705i],[4o],"[101705i, 4o]",1707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117940,9975297262301701,SEC today (CCH).,[],[],[],[0745-2667 (print)],[],[],[],[],e,[100923i],[118674o],"[100923i, 118674o]",924
118942,9975297322401701,Search & seizure bulletin.,[],[],[],[0037-0193 (print)],[],[],[],[],e,[101048i],[118675o],"[101048i, 118675o]",1048
120184,9975297304901701,Federal postconviction remedies handbook.,[],[],[],[1932-2151 (print)],[],[],[],[],e,[101519i],[118676o],"[101519i, 118676o]",10866
122673,9977114779201701,Annales de la Société entomologique de France ...,[],[],[],[],[],[],[],[],e,[102205i],[118677o],"[102205i, 118677o]",2200


In [28]:
pe_df.sort_index()

Unnamed: 0_level_0,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e,ISSN_group_id,OCN_group_id,both_groups,matches_group_id
record_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,9920341180001701,Tutkimuksia Suomen maatalouden kannattavuudesta =,[9104278],[0438-9808],[],[],[],[],[9104278],[0438-9808],p,[24110i],[112687o],"[24110i, 112687o]",23378
1,9920358800001701,Pakistan,[25380502],[1061-6101],[],[],[],[],[25380502],[1061-6101],p,[36045i],[57957o],"[36045i, 57957o]",35688
2,9920370170001701,Abhandlungen der Königlich Preussischen Akade...,[10333878],[0233-2728],[],[],[],[],[10333878],[0233-2728],p,[17151i],[2068o],"[17151i, 2068o]",16296
3,9920380570001701,Zhongguo dian ying nian jian /,[9179105],[],[],[],[],[],[9179105],[],p,[47183i],[112965o],"[47183i, 112965o]",46641
4,9920446280001701,Sports 'n spokes,[1114948794],[0161-6706],[],[],[],[],[1114948794],[0161-6706],p,[14948i],[15060o],"[14948i, 15060o]",14160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129093,9977136268101701,F©œldtani k©œzl©œny,[1569510],[0015-542X],[],[],[],[],[1569510],[0015-542X],e,[2563i],[30451o],"[2563i, 30451o]",29887
129094,9977149036101701,Corporate report Wisconsin,[12795345],[],[],[0890-4278],[],[],[12795345],[0890-4278],e,[31641i],[22542o],"[31641i, 22542o]",31115
129095,9977149197201701,Surveying and land information systems,[21396434],[1052-2905],[],[],[],[],[21396434],[1052-2905],e,[35481i],[51436o],"[35481i, 51436o]",9351
129096,9977149202001701,Engineering,[1567895],[0013-7782],[],[],[],[],[1567895],[0013-7782],e,[2261i],[29970o],"[2261i, 29970o]",21891


In [29]:
pe_df.sort_values('matches_group_id', inplace=True)
pe_df

Unnamed: 0_level_0,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e,ISSN_group_id,OCN_group_id,both_groups,matches_group_id
record_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
87349,9935224330001701,Publishers' world.,[988619],[0555-6384],"[2489456, 567791231, 1695359]",[0000-0019],[],[],"[567791231, 1695359, 988619, 2489456]","[0000-0019, 0555-6384]",p,[0i],[35959o],"[0i, 35959o]",0
105258,9967008940001701,Publishers weekly,[37309426],[2150-4008],[],[0000-0019],[],[],[37309426],"[2150-4008, 0000-0019]",e,[0i],[35959o],"[0i, 35959o]",0
58136,9913446020001701,Publishers weekly yearbook,[9604938],[0000-0469],[],[0000-0019],[],[],[9604938],"[0000-0469, 0000-0019]",p,[0i],[35959o],"[0i, 35959o]",0
60626,9934112930001701,Publishers weekly,[2489456],[0000-0019],"[37309426, 9604938]","[0000-0019, 000--0019, 0000-0469, 2150-4008]",[],[],"[37309426, 9604938, 2489456]","[2150-4008, 0000-0469, 0000-0019]",p,[0i],[35959o],"[0i, 35959o]",0
52478,9937257820001701,The Book publishing annual,[1114932096],[0000-0787],[],[0000-0019],[],[],[1114932096],"[0000-0787, 0000-0019]",p,[0i],[9122o],"[0i, 9122o]",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30531,9947018110001701,Bank of London & South America review,[317183571],[0005-5298],[],[],[],[],[317183571],[0005-5298],p,[999i],[63489o],"[999i, 63489o]",101714
32742,9927107280001701,Technical report (Washington State Institute o...,[1013255009],[0511-2699],[],[],[],[],[1013255009],[0511-2699],p,[25077i],[999o],"[25077i, 999o]",101715
33650,9914101670001701,Abstracts of health care management studies,[1114928547],[0194-4908],[],[0001-3595],[],[],[1114928547],"[0001-3595, 0194-4908]",p,[99i],[7966o],"[99i, 7966o]",101716
22003,9927245370001701,Abstracts of hospital management studies,[1460587],[0001-3595],[],[],[],[],[1460587],[0001-3595],p,[99i],[25055o],"[99i, 25055o]",101716


In [30]:
pe_df.to_pickle(f'pe_df_groupids_matched_{today}.pkl')
#pe_df.to_csv('pe_df_' + today + '.txt', sep='\t',index=False)

In [31]:
pe_df.index.names = ['record_index']
pe_df.reset_index(inplace=True)
pe_df

Unnamed: 0,record_index,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e,ISSN_group_id,OCN_group_id,both_groups,matches_group_id
0,87349,9935224330001701,Publishers' world.,[988619],[0555-6384],"[2489456, 567791231, 1695359]",[0000-0019],[],[],"[567791231, 1695359, 988619, 2489456]","[0000-0019, 0555-6384]",p,[0i],[35959o],"[0i, 35959o]",0
1,105258,9967008940001701,Publishers weekly,[37309426],[2150-4008],[],[0000-0019],[],[],[37309426],"[2150-4008, 0000-0019]",e,[0i],[35959o],"[0i, 35959o]",0
2,58136,9913446020001701,Publishers weekly yearbook,[9604938],[0000-0469],[],[0000-0019],[],[],[9604938],"[0000-0469, 0000-0019]",p,[0i],[35959o],"[0i, 35959o]",0
3,60626,9934112930001701,Publishers weekly,[2489456],[0000-0019],"[37309426, 9604938]","[0000-0019, 000--0019, 0000-0469, 2150-4008]",[],[],"[37309426, 9604938, 2489456]","[2150-4008, 0000-0469, 0000-0019]",p,[0i],[35959o],"[0i, 35959o]",0
4,52478,9937257820001701,The Book publishing annual,[1114932096],[0000-0787],[],[0000-0019],[],[],[1114932096],"[0000-0787, 0000-0019]",p,[0i],[9122o],"[0i, 9122o]",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129093,30531,9947018110001701,Bank of London & South America review,[317183571],[0005-5298],[],[],[],[],[317183571],[0005-5298],p,[999i],[63489o],"[999i, 63489o]",101714
129094,32742,9927107280001701,Technical report (Washington State Institute o...,[1013255009],[0511-2699],[],[],[],[],[1013255009],[0511-2699],p,[25077i],[999o],"[25077i, 999o]",101715
129095,33650,9914101670001701,Abstracts of health care management studies,[1114928547],[0194-4908],[],[0001-3595],[],[],[1114928547],"[0001-3595, 0194-4908]",p,[99i],[7966o],"[99i, 7966o]",101716
129096,22003,9927245370001701,Abstracts of hospital management studies,[1460587],[0001-3595],[],[],[],[],[1460587],[0001-3595],p,[99i],[25055o],"[99i, 25055o]",101716


In [32]:
pe_df['ISSN_to_match'] = pe_df['ISSN'].apply(lambda x: str(x[0]))
pe_df

Unnamed: 0,record_index,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e,ISSN_group_id,OCN_group_id,both_groups,matches_group_id,ISSN_to_match
0,87349,9935224330001701,Publishers' world.,[988619],[0555-6384],"[2489456, 567791231, 1695359]",[0000-0019],[],[],"[567791231, 1695359, 988619, 2489456]","[0000-0019, 0555-6384]",p,[0i],[35959o],"[0i, 35959o]",0,0555-6384
1,105258,9967008940001701,Publishers weekly,[37309426],[2150-4008],[],[0000-0019],[],[],[37309426],"[2150-4008, 0000-0019]",e,[0i],[35959o],"[0i, 35959o]",0,2150-4008
2,58136,9913446020001701,Publishers weekly yearbook,[9604938],[0000-0469],[],[0000-0019],[],[],[9604938],"[0000-0469, 0000-0019]",p,[0i],[35959o],"[0i, 35959o]",0,0000-0469
3,60626,9934112930001701,Publishers weekly,[2489456],[0000-0019],"[37309426, 9604938]","[0000-0019, 000--0019, 0000-0469, 2150-4008]",[],[],"[37309426, 9604938, 2489456]","[2150-4008, 0000-0469, 0000-0019]",p,[0i],[35959o],"[0i, 35959o]",0,0000-0019
4,52478,9937257820001701,The Book publishing annual,[1114932096],[0000-0787],[],[0000-0019],[],[],[1114932096],"[0000-0787, 0000-0019]",p,[0i],[9122o],"[0i, 9122o]",0,0000-0787
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129093,30531,9947018110001701,Bank of London & South America review,[317183571],[0005-5298],[],[],[],[],[317183571],[0005-5298],p,[999i],[63489o],"[999i, 63489o]",101714,0005-5298
129094,32742,9927107280001701,Technical report (Washington State Institute o...,[1013255009],[0511-2699],[],[],[],[],[1013255009],[0511-2699],p,[25077i],[999o],"[25077i, 999o]",101715,0511-2699
129095,33650,9914101670001701,Abstracts of health care management studies,[1114928547],[0194-4908],[],[0001-3595],[],[],[1114928547],"[0001-3595, 0194-4908]",p,[99i],[7966o],"[99i, 7966o]",101716,0194-4908
129096,22003,9927245370001701,Abstracts of hospital management studies,[1460587],[0001-3595],[],[],[],[],[1460587],[0001-3595],p,[99i],[25055o],"[99i, 25055o]",101716,0001-3595


#### Find groups with p and e, p only, and e only

In [33]:
find_only_p_or_e = pe_df[['record_index','matches_group_id','p_or_e','MMS_ID','Title']]
find_only_p_or_e

Unnamed: 0,record_index,matches_group_id,p_or_e,MMS_ID,Title
0,87349,0,p,9935224330001701,Publishers' world.
1,105258,0,e,9967008940001701,Publishers weekly
2,58136,0,p,9913446020001701,Publishers weekly yearbook
3,60626,0,p,9934112930001701,Publishers weekly
4,52478,0,p,9937257820001701,The Book publishing annual
...,...,...,...,...,...
129093,30531,101714,p,9947018110001701,Bank of London & South America review
129094,32742,101715,p,9927107280001701,Technical report (Washington State Institute o...
129095,33650,101716,p,9914101670001701,Abstracts of health care management studies
129096,22003,101716,p,9927245370001701,Abstracts of hospital management studies


In [34]:
find_only_p_or_e = find_only_p_or_e.groupby(['matches_group_id']).agg(lambda x: sorted(list(set(x)))).reset_index()
find_only_p_or_e

Unnamed: 0,matches_group_id,record_index,p_or_e,MMS_ID
0,0,"[52478, 58136, 60626, 87349, 105258]","[e, p]","[9913446020001701, 9934112930001701, 993522433..."
1,1,[105867],[e],[9977176693001701]
2,2,"[69875, 113344]","[e, p]","[9939153590001701, 9968879310001701]"
3,3,[115629],[e],[9977077183601701]
4,4,[63006],[p],[9929611400001701]
...,...,...,...,...
101713,101713,[88528],[p],[9962739150001701]
101714,101714,[30531],[p],[9947018110001701]
101715,101715,[32742],[p],[9927107280001701]
101716,101716,"[22003, 33650]",[p],"[9914101670001701, 9927245370001701]"


In [35]:
find_only_p_or_e['pore'] = find_only_p_or_e['p_or_e'].apply(lambda x: ' '.join(x))
find_only_p_or_e

Unnamed: 0,matches_group_id,record_index,p_or_e,MMS_ID,pore
0,0,"[52478, 58136, 60626, 87349, 105258]","[e, p]","[9913446020001701, 9934112930001701, 993522433...",e p
1,1,[105867],[e],[9977176693001701],e
2,2,"[69875, 113344]","[e, p]","[9939153590001701, 9968879310001701]",e p
3,3,[115629],[e],[9977077183601701],e
4,4,[63006],[p],[9929611400001701],p
...,...,...,...,...,...
101713,101713,[88528],[p],[9962739150001701],p
101714,101714,[30531],[p],[9947018110001701],p
101715,101715,[32742],[p],[9927107280001701],p
101716,101716,"[22003, 33650]",[p],"[9914101670001701, 9927245370001701]",p


In [36]:
find_only_p_or_e['pore'].value_counts()

p      78993
e p    16937
e       5788
Name: pore, dtype: int64

In [37]:
only_p = find_only_p_or_e[find_only_p_or_e['pore'] == 'p']
only_p

Unnamed: 0,matches_group_id,record_index,p_or_e,MMS_ID,pore
4,4,[63006],[p],[9929611400001701],p
14,14,[91817],[p],[9945141400001701],p
18,18,[87852],[p],[9957807000001701],p
23,23,[64330],[p],[9953380130001701],p
26,26,"[13162, 50457]",[p],"[9925000870001701, 9925628280001701]",p
...,...,...,...,...,...
101713,101713,[88528],[p],[9962739150001701],p
101714,101714,[30531],[p],[9947018110001701],p
101715,101715,[32742],[p],[9927107280001701],p
101716,101716,"[22003, 33650]",[p],"[9914101670001701, 9927245370001701]",p


In [38]:
only_p_groups = list(only_p['matches_group_id'])
only_p_groups

[4,
 14,
 18,
 23,
 26,
 29,
 30,
 35,
 37,
 39,
 41,
 43,
 48,
 51,
 52,
 55,
 58,
 60,
 65,
 67,
 71,
 72,
 75,
 84,
 85,
 89,
 95,
 100,
 101,
 102,
 106,
 109,
 110,
 117,
 119,
 122,
 123,
 126,
 130,
 132,
 133,
 135,
 138,
 140,
 144,
 146,
 148,
 150,
 152,
 155,
 156,
 157,
 159,
 163,
 165,
 166,
 169,
 170,
 171,
 173,
 177,
 178,
 179,
 180,
 181,
 186,
 190,
 191,
 195,
 198,
 199,
 201,
 205,
 209,
 210,
 211,
 214,
 220,
 221,
 229,
 230,
 231,
 232,
 234,
 239,
 244,
 250,
 253,
 260,
 261,
 271,
 286,
 290,
 297,
 298,
 307,
 308,
 313,
 317,
 319,
 321,
 324,
 329,
 331,
 333,
 335,
 337,
 341,
 343,
 351,
 376,
 377,
 387,
 388,
 397,
 399,
 423,
 427,
 435,
 438,
 442,
 443,
 444,
 445,
 447,
 449,
 453,
 455,
 458,
 467,
 468,
 472,
 475,
 480,
 481,
 483,
 486,
 489,
 491,
 493,
 498,
 503,
 505,
 508,
 510,
 514,
 527,
 530,
 535,
 536,
 539,
 542,
 546,
 548,
 556,
 557,
 560,
 561,
 563,
 565,
 566,
 567,
 569,
 571,
 573,
 575,
 577,
 579,
 580,
 592,
 594,
 5

In [39]:
p_only_data = pe_df[pe_df['matches_group_id'].isin(only_p_groups)]
p_only_data

Unnamed: 0,record_index,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e,ISSN_group_id,OCN_group_id,both_groups,matches_group_id,ISSN_to_match
9,63006,9929611400001701,Bottom line (American Nursery & Landscape Asso...,[66903536],[],[],[],[],[],[66903536],[],p,[77630i],[100003o],"[77630i, 100003o]",4,
26,91817,9945141400001701,French-language psychology,[6692160],[0167-1839],[],[],[],[],[6692160],[0167-1839],p,[15403i],[100015o],"[15403i, 100015o]",14,0167-1839
30,87852,9957807000001701,"History, technology, and art monograph.",[6693870],[0316-1269],[],[],[],[],[6693870],[0316-1269],p,[20279i],[100021o],"[20279i, 100021o]",18,0316-1269
36,64330,9953380130001701,Annual report of notifiable diseases,[669591472],[0575-7894],[],[],[],[CS82-201],[669591472],[0575-7894],p,[27104i],[100025o],"[27104i, 100025o]",23,0575-7894
39,13162,9925628280001701,American government,[6696647],[0891-3390],[],[],[],[],[6696647],[0891-3390],p,[31716i],[100027o],"[31716i, 100027o]",26,0891-3390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129093,30531,9947018110001701,Bank of London & South America review,[317183571],[0005-5298],[],[],[],[],[317183571],[0005-5298],p,[999i],[63489o],"[999i, 63489o]",101714,0005-5298
129094,32742,9927107280001701,Technical report (Washington State Institute o...,[1013255009],[0511-2699],[],[],[],[],[1013255009],[0511-2699],p,[25077i],[999o],"[25077i, 999o]",101715,0511-2699
129095,33650,9914101670001701,Abstracts of health care management studies,[1114928547],[0194-4908],[],[0001-3595],[],[],[1114928547],"[0001-3595, 0194-4908]",p,[99i],[7966o],"[99i, 7966o]",101716,0194-4908
129096,22003,9927245370001701,Abstracts of hospital management studies,[1460587],[0001-3595],[],[],[],[],[1460587],[0001-3595],p,[99i],[25055o],"[99i, 25055o]",101716,0001-3595


In [40]:
p_only_data.to_pickle('p_only_' + today + '.pkl')
#p_only_data.to_csv('p_only_' + today + '.txt',sep='\t',index=False)

In [41]:
only_e = find_only_p_or_e[find_only_p_or_e['pore'] == 'e']
only_e

Unnamed: 0,matches_group_id,record_index,p_or_e,MMS_ID,pore
1,1,[105867],[e],[9977176693001701],e
3,3,[115629],[e],[9977077183601701],e
6,6,[115633],[e],[9977077232801701],e
10,10,[115643],[e],[9977173932501701],e
11,11,[115645],[e],[9977174066201701],e
...,...,...,...,...,...
101705,101705,[115617],[e],[9977076678201701],e
101707,101707,[115619],[e],[9977076887201701],e
101708,101708,[115620],[e],[9977077010801701],e
101710,101710,[115621],[e],[9977077013001701],e


In [42]:
only_e_groups = list(only_e['matches_group_id'])
only_e_groups

[1,
 3,
 6,
 10,
 11,
 13,
 15,
 16,
 17,
 19,
 21,
 22,
 24,
 25,
 27,
 28,
 31,
 32,
 34,
 36,
 38,
 40,
 42,
 44,
 45,
 47,
 49,
 53,
 54,
 56,
 57,
 59,
 62,
 64,
 66,
 68,
 69,
 70,
 73,
 74,
 76,
 78,
 79,
 80,
 81,
 82,
 86,
 87,
 88,
 90,
 91,
 93,
 94,
 96,
 97,
 98,
 103,
 104,
 105,
 107,
 108,
 111,
 112,
 113,
 114,
 115,
 116,
 118,
 120,
 121,
 124,
 125,
 127,
 128,
 129,
 131,
 134,
 136,
 137,
 139,
 141,
 142,
 143,
 145,
 147,
 149,
 151,
 153,
 154,
 158,
 160,
 161,
 162,
 164,
 167,
 168,
 172,
 174,
 175,
 176,
 182,
 184,
 185,
 187,
 189,
 192,
 194,
 196,
 197,
 200,
 203,
 204,
 206,
 207,
 208,
 212,
 215,
 216,
 218,
 219,
 222,
 223,
 224,
 225,
 226,
 228,
 235,
 237,
 238,
 240,
 241,
 242,
 243,
 245,
 246,
 247,
 248,
 249,
 251,
 255,
 256,
 258,
 262,
 263,
 265,
 268,
 269,
 272,
 274,
 275,
 276,
 278,
 279,
 280,
 282,
 283,
 285,
 287,
 288,
 289,
 293,
 294,
 295,
 296,
 299,
 300,
 301,
 302,
 303,
 304,
 305,
 306,
 309,
 310,
 311,
 312,
 31

In [43]:
e_only_data = pe_df[pe_df['matches_group_id'].isin(only_e_groups)]
e_only_data

Unnamed: 0,record_index,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e,ISSN_group_id,OCN_group_id,both_groups,matches_group_id,ISSN_to_match
5,105867,9977176693001701,Paraphrasis in omnes epistolas Apostolicas.,[944177912],[],[(OCoLC)1002244040],[],[],[],"[944177912, (OCoLC)1002244040]",[],e,[97748i],[0o],"[97748i, 0o]",1,
8,115629,9977077183601701,Oil Heat Commercial.,[715153010],[],[],[],[],[],[715153010],[],e,[100003i],[102418o],"[100003i, 102418o]",3,
12,115633,9977077232801701,The Other Side of the Fence /,[747798553],[],[],[],[],[],[747798553],[],e,[100007i],[103775o],"[100007i, 103775o]",6,
21,115643,9977173932501701,Climate policy and nonrenewable resources :,[(OCoLC-P)890414912],[],"[1081186361, 1055322052, 962662197, 961567878,...",[],[],[],"[1066448484, 1081186361, 889930911, 961567878,...",[],e,[100012i],[3212o],"[100012i, 3212o]",10,
22,115645,9977174066201701,Meaning and the Dynamics of Interpretation :,[862050200],[],[],[],[29.],[],[862050200],[],e,[100014i],[109995o],"[100014i, 109995o]",11,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129084,115617,9977076678201701,Boys Alone /,[747796406],[],[],[],[],[],[747796406],[],e,[99991i],[103731o],"[99991i, 103731o]",101705,
129086,115619,9977076887201701,"World fashion tour. Volume 6, Episode 2 /",[1112144940],[],[],[],[],[],[1112144940],[],e,[99993i],[5986o],"[99993i, 5986o]",101707,
129087,115620,9977077010801701,The Power of Vision /,[856905906],[],[],[],[],[],[856905906],[],e,[99994i],[109613o],"[99994i, 109613o]",101708,
129089,115621,9977077013001701,The quarry /,[985056105],[],[],[],[],[],[985056105],[],e,[99995i],[116206o],"[99995i, 116206o]",101710,


In [44]:
e_only_data.to_pickle('e_only_' + today + '.pkl')
e_only_data.to_csv('e_only_' + today + '.txt',sep='\t',index=False)

In [45]:
p_and_e = find_only_p_or_e[find_only_p_or_e['pore'] == 'e p']
p_and_e

Unnamed: 0,matches_group_id,record_index,p_or_e,MMS_ID,pore
0,0,"[52478, 58136, 60626, 87349, 105258]","[e, p]","[9913446020001701, 9934112930001701, 993522433...",e p
2,2,"[69875, 113344]","[e, p]","[9939153590001701, 9968879310001701]",e p
5,5,"[57684, 128733]","[e, p]","[9963550760001701, 9968441380001701]",e p
7,7,"[78492, 107439, 112352]","[e, p]","[9933370000001701, 9967145520001701, 996775956...",e p
8,8,"[22912, 76329, 115640]","[e, p]","[9957818690001701, 9963623700001701]",e p
...,...,...,...,...,...
101632,101632,"[77028, 115389]","[e, p]","[9959389050001701, 9968529900001701]",e p
101648,101648,"[9571, 107676]","[e, p]","[9946120900001701, 9974758564401701]",e p
101650,101650,"[2519, 112774]","[e, p]","[9939323230001701, 9968144710001701]",e p
101667,101667,"[21367, 105065]","[e, p]","[9935136780001701, 9969162920001701]",e p


In [46]:
pe_groups = list(p_and_e['matches_group_id'])
print(len(pe_groups))
pe_groups

16937


[0,
 2,
 5,
 7,
 8,
 9,
 12,
 20,
 33,
 46,
 50,
 61,
 63,
 77,
 83,
 92,
 99,
 183,
 188,
 193,
 202,
 213,
 217,
 227,
 233,
 236,
 252,
 254,
 257,
 259,
 264,
 266,
 267,
 270,
 273,
 277,
 281,
 284,
 291,
 292,
 315,
 326,
 327,
 340,
 373,
 375,
 398,
 422,
 431,
 433,
 434,
 440,
 461,
 462,
 466,
 477,
 496,
 501,
 511,
 513,
 526,
 533,
 537,
 544,
 555,
 584,
 588,
 627,
 629,
 635,
 638,
 639,
 660,
 674,
 686,
 702,
 707,
 711,
 720,
 739,
 744,
 747,
 764,
 784,
 798,
 803,
 804,
 819,
 822,
 826,
 838,
 875,
 877,
 879,
 889,
 895,
 899,
 902,
 914,
 919,
 936,
 956,
 967,
 969,
 1001,
 1003,
 1005,
 1016,
 1026,
 1038,
 1039,
 1042,
 1056,
 1066,
 1074,
 1086,
 1087,
 1094,
 1098,
 1103,
 1104,
 1132,
 1140,
 1141,
 1149,
 1154,
 1163,
 1177,
 1184,
 1193,
 1204,
 1208,
 1227,
 1240,
 1245,
 1266,
 1273,
 1286,
 1289,
 1307,
 1310,
 1312,
 1316,
 1334,
 1338,
 1339,
 1340,
 1345,
 1350,
 1351,
 1356,
 1364,
 1370,
 1377,
 1383,
 1413,
 1449,
 1468,
 1475,
 1491,
 1497,


In [47]:
p_and_e_data = pe_df[pe_df['matches_group_id'].isin(pe_groups)]
p_and_e_data

Unnamed: 0,record_index,MMS_ID,Title,OCN,ISSN,Related_OCNs,Related_ISSNs,Vol_nos,Gov_doc_nos,OCN_cluster,ISSN_cluster,p_or_e,ISSN_group_id,OCN_group_id,both_groups,matches_group_id,ISSN_to_match
0,87349,9935224330001701,Publishers' world.,[988619],[0555-6384],"[2489456, 567791231, 1695359]",[0000-0019],[],[],"[567791231, 1695359, 988619, 2489456]","[0000-0019, 0555-6384]",p,[0i],[35959o],"[0i, 35959o]",0,0555-6384
1,105258,9967008940001701,Publishers weekly,[37309426],[2150-4008],[],[0000-0019],[],[],[37309426],"[2150-4008, 0000-0019]",e,[0i],[35959o],"[0i, 35959o]",0,2150-4008
2,58136,9913446020001701,Publishers weekly yearbook,[9604938],[0000-0469],[],[0000-0019],[],[],[9604938],"[0000-0469, 0000-0019]",p,[0i],[35959o],"[0i, 35959o]",0,0000-0469
3,60626,9934112930001701,Publishers weekly,[2489456],[0000-0019],"[37309426, 9604938]","[0000-0019, 000--0019, 0000-0469, 2150-4008]",[],[],"[37309426, 9604938, 2489456]","[2150-4008, 0000-0469, 0000-0019]",p,[0i],[35959o],"[0i, 35959o]",0,0000-0019
4,52478,9937257820001701,The Book publishing annual,[1114932096],[0000-0787],[],[0000-0019],[],[],[1114932096],"[0000-0787, 0000-0019]",p,[0i],[9122o],"[0i, 9122o]",0,0000-0787
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129003,2519,9939323230001701,UNLV gaming law journal.,[666937502],[],[],[],[],[],[666937502],[],p,[48442i],[99928o],"[48442i, 99928o]",101650,
129020,21367,9935136780001701,Northern history,[1760664],[0078-172X],[],[],[],[],[1760664],[0078-172X],p,[9994i],[40621o],"[9994i, 40621o]",101667,0078-172X
129021,105065,9969162920001701,Northern history.,[679734406],[1745-8706],[],[0078-172X],[],[],[679734406],"[0078-172X, 1745-8706]",e,[9994i],[100478o],"[9994i, 100478o]",101667,1745-8706
129074,114862,9977101571201701,Oriental insects,[668436876],[2157-8745],[],[0030-5316],[],[],[668436876],"[0030-5316, 2157-8745]",e,[5104i],[99982o],"[5104i, 99982o]",101697,2157-8745


In [48]:
p_and_e_data.to_pickle('p_and_e_' + today + '.pkl')
#p_and_e_data.to_csv('p_and_e_' + today + '.txt',sep='\t',index=False)