This notebook cleans, transforms, and formats metadata for multipart monographs for the HathiTrust full catalog. Before running this notebook, the metadata must be extracted from an Alma-published MARC file and saved as a .pkl file. The .txt file produced at the end of this notebook is ready for posting to HathiTrust.

In [92]:
import pandas as pd
import numpy as np
import re

In [93]:
#change filename if necessary
# Load the item-level metadata that was saved as a pickle beforehand.
# NOTE(review): unpickling can execute arbitrary code, so only load a .pkl file
# that was produced by a trusted run of the extraction step.
mpm = pd.read_pickle('mpm_df.pkl')
mpm

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag
0,9947053940001701,(OCoLC)179375,31951T004790751,BOOK,v.1:pt.1 (New England Middle Atlantic East Nor...,TWILS,GOVX,GOVX,,,1
0,9947053940001701,(OCoLC)179375,31951T00479077X,BOOK,v.1:pt.6 (Mountain and Pacific Statistics for ...,TWILS,GOVX,GOVX,,,1
0,9947053940001701,(OCoLC)179375,31951D034881172,BOOK,v.2:pt.3 (Western States Value of Farm Products),TWILS,GOVX,GOVX,,,1
0,9947053940001701,(OCoLC)179375,31951D00949719I,BOOK,v.3 (General Report Statistics by Subject),TWILS,GOVX,GOVX,,,1
0,9947053360001701,(OCoLC)180935,31953000025419H,BOOK,T.2,DUMD,UMDBK,UMDBK,,,0
...,...,...,...,...,...,...,...,...,...,...,...
0,9912926190001701,(OCoLC)3871681,319510010159927,BOOK,v. 4,TWILS,GEN,GEN,,,0
0,9912926190001701,(OCoLC)3871681,319510010159943,BOOK,v. 6,TWILS,GEN,GEN,,,0
0,9912930790001701,(OCoLC)21155156,31953000123974S,BOOK,v.1,DUMD,UMDBK,UMDBK,,,0
0,9912930450001701,(OCoLC)1067046,31951000887687K,BOOK,v.6,ZMLAC,GEN,GEN,,wils,0


In [94]:
# Preview the items whose permanent library is on the exclusion list; note the row
# count so that the drop performed afterwards can be checked against it.
_excluded_lib = mpm['perm_lib'].str.contains('lmich|DTWED|DAILR|DNRRI|DCED', case=False)
lmich = mpm[_excluded_lib]
lmich

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag
0,9961701910001701,(OCoLC)16115461,94618-1001,BOOK,no. 87(2),DNRRI,UMDNR,WDN,,,1
0,9961683090001701,(OCoLC)1086922,23876-1001,BOOK,no.22,DNRRI,UMDNR,WDN,,,0
0,9947254770001701,(OCoLC)9963703,75686-1001,BOOK,no. 82-25,DNRRI,UMDNR,WDN,,,0
0,9947256650001701,(OCoLC)10380587,77382-1001,BOOK,no.6634,DNRRI,UMDNR,WDN,,,1
0,9947248680001701,(OCoLC)10359985,77302-1001,BOOK,no.6394,DNRRI,UMDNR,WDN,,,1
...,...,...,...,...,...,...,...,...,...,...,...
0,9935335280001701,(OCoLC)4055201,49843-1001,BOOK,no.1223,DNRRI,UMDNR,WDN,,,1
0,9935337690001701,(OCoLC)4043958,49805-1001,BOOK,no.312,DNRRI,UMDNR,WDN,,,0
0,9935321560001701,(OCoLC)23603585,31953000852627N,BOOK,no.1342,DCED,UMDCB,WDN,,,0
0,9943714830001701,(OCoLC)4421761,51444-1001,BOOK,no.51,DNRRI,UMDNR,WDN,,,0


In [95]:
# Keep only the items whose permanent library is NOT on the exclusion list
# (this filtering was not done within Alma for this extract).
_excluded_lib = mpm['perm_lib'].str.contains('lmich|DTWED|DAILR|DNRRI|DCED', case=False)
dropped_lmich = mpm[~_excluded_lib]
dropped_lmich

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag
0,9947053940001701,(OCoLC)179375,31951T004790751,BOOK,v.1:pt.1 (New England Middle Atlantic East Nor...,TWILS,GOVX,GOVX,,,1
0,9947053940001701,(OCoLC)179375,31951T00479077X,BOOK,v.1:pt.6 (Mountain and Pacific Statistics for ...,TWILS,GOVX,GOVX,,,1
0,9947053940001701,(OCoLC)179375,31951D034881172,BOOK,v.2:pt.3 (Western States Value of Farm Products),TWILS,GOVX,GOVX,,,1
0,9947053940001701,(OCoLC)179375,31951D00949719I,BOOK,v.3 (General Report Statistics by Subject),TWILS,GOVX,GOVX,,,1
0,9947053360001701,(OCoLC)180935,31953000025419H,BOOK,T.2,DUMD,UMDBK,UMDBK,,,0
...,...,...,...,...,...,...,...,...,...,...,...
0,9912926190001701,(OCoLC)3871681,319510010159927,BOOK,v. 4,TWILS,GEN,GEN,,,0
0,9912926190001701,(OCoLC)3871681,319510010159943,BOOK,v. 6,TWILS,GEN,GEN,,,0
0,9912930790001701,(OCoLC)21155156,31953000123974S,BOOK,v.1,DUMD,UMDBK,UMDBK,,,0
0,9912930450001701,(OCoLC)1067046,31951000887687K,BOOK,v.6,ZMLAC,GEN,GEN,,wils,0


In [96]:
#if the row count is correct, adopt the filtered frame (dropped_lmich) as mpm
mpm = dropped_lmich

In [97]:
# Preview the items whose permanent location is on the exclusion list; note the row
# count so that the drop performed afterwards can be checked against it.
_excluded_loc = mpm['perm_loc'].str.contains('CARN|MANN|MPL|MPLN|NONX|SLLN|SPP|SPPN|DCED|DNRRI|DAILR|DTWED', case=False)
badzmlac = mpm[_excluded_loc]
badzmlac

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag
0,9946961320001701,(OCoLC)1577866,31951M01066583L,BOOK,v. 2,ZMLAC,MPL,MPL,,"and,mpl",0
0,9946980730001701,(OCoLC)1597437,31951M01134237P,BOOK,v. 2,ZMLAC,MPL,MPL,,"and,mpl",0
0,9946963870001701,(OCoLC)15058614,31951M011328923,BOOK,v. 2,ZMLAC,MPL,MPL,,"and,mpl",0
0,9946964700001701,(OCoLC)1675917,31951M01125188D,BOOK,v. 2,ZMLAC,MPL,MPL,,"and,mpl",0
0,9946969870001701,(OCoLC)1596017,31951M01061262U,BOOK,v. 2,ZMLAC,MPL,MPL,,"and,mpl",0
...,...,...,...,...,...,...,...,...,...,...,...
0,9946537370001701,(OCoLC)123327,31951M01205507S,BOOK,v.1,ZMLAC,MPL,MPL,,,0
0,9918629100001701,(OCoLC)369721,31951M01195185D,BOOK,v. 1,ZMLAC,MPL,MPL,,,0
0,9918625970001701,(OCoLC)6248323,31951M011710051,BOOK,v. 3,ZMLAC,MPL,MPL,,,0
0,9958427850001701,(OCoLC)551234,31951M01240840J,BOOK,v.1,ZMLAC,MPL,MPL,,,0


In [98]:
# Keep only the items whose permanent location is NOT one of the excluded locations.
_excluded_loc = mpm['perm_loc'].str.contains('CARN|MANN|MPL|MPLN|NONX|SLLN|SPP|SPPN|DCED|DNRRI|DAILR|DTWED', case=False)
dropped_badzmlac = mpm[~_excluded_loc]
dropped_badzmlac

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag
0,9947053940001701,(OCoLC)179375,31951T004790751,BOOK,v.1:pt.1 (New England Middle Atlantic East Nor...,TWILS,GOVX,GOVX,,,1
0,9947053940001701,(OCoLC)179375,31951T00479077X,BOOK,v.1:pt.6 (Mountain and Pacific Statistics for ...,TWILS,GOVX,GOVX,,,1
0,9947053940001701,(OCoLC)179375,31951D034881172,BOOK,v.2:pt.3 (Western States Value of Farm Products),TWILS,GOVX,GOVX,,,1
0,9947053940001701,(OCoLC)179375,31951D00949719I,BOOK,v.3 (General Report Statistics by Subject),TWILS,GOVX,GOVX,,,1
0,9947053360001701,(OCoLC)180935,31953000025419H,BOOK,T.2,DUMD,UMDBK,UMDBK,,,0
...,...,...,...,...,...,...,...,...,...,...,...
0,9912926190001701,(OCoLC)3871681,319510010159927,BOOK,v. 4,TWILS,GEN,GEN,,,0
0,9912926190001701,(OCoLC)3871681,319510010159943,BOOK,v. 6,TWILS,GEN,GEN,,,0
0,9912930790001701,(OCoLC)21155156,31953000123974S,BOOK,v.1,DUMD,UMDBK,UMDBK,,,0
0,9912930450001701,(OCoLC)1067046,31951000887687K,BOOK,v.6,ZMLAC,GEN,GEN,,wils,0


In [99]:
#if the row count is correct, adopt the filtered frame (dropped_badzmlac) as mpm
mpm = dropped_badzmlac

In [100]:
# Trim surrounding whitespace in every column, then convert the empty strings that
# remain into NaN so that missing values can be tested with isnull()/notnull().
_stripped = mpm.apply(lambda column: column.str.strip())
mpm2 = _stripped.replace('', np.nan)
mpm2

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag
0,9947053940001701,(OCoLC)179375,31951T004790751,BOOK,v.1:pt.1 (New England Middle Atlantic East Nor...,TWILS,GOVX,GOVX,,,1
0,9947053940001701,(OCoLC)179375,31951T00479077X,BOOK,v.1:pt.6 (Mountain and Pacific Statistics for ...,TWILS,GOVX,GOVX,,,1
0,9947053940001701,(OCoLC)179375,31951D034881172,BOOK,v.2:pt.3 (Western States Value of Farm Products),TWILS,GOVX,GOVX,,,1
0,9947053940001701,(OCoLC)179375,31951D00949719I,BOOK,v.3 (General Report Statistics by Subject),TWILS,GOVX,GOVX,,,1
0,9947053360001701,(OCoLC)180935,31953000025419H,BOOK,T.2,DUMD,UMDBK,UMDBK,,,0
...,...,...,...,...,...,...,...,...,...,...,...
0,9912926190001701,(OCoLC)3871681,319510010159927,BOOK,v. 4,TWILS,GEN,GEN,,,0
0,9912926190001701,(OCoLC)3871681,319510010159943,BOOK,v. 6,TWILS,GEN,GEN,,,0
0,9912930790001701,(OCoLC)21155156,31953000123974S,BOOK,v.1,DUMD,UMDBK,UMDBK,,,0
0,9912930450001701,(OCoLC)1067046,31951000887687K,BOOK,v.6,ZMLAC,GEN,GEN,,wils,0


In [101]:
# Restrict to the records that actually carry text in internal_note1.
inotes = mpm2.dropna(subset=['internal_note1'])
inotes

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag
0,9947042610001701,(OCoLC)3976387,31951001182818P,BOOK,v.2,TWILS,CLS,CLS,,too brittle to bind - 10-89,0
0,9941795980001701,(OCoLC)185374,31953000030537U,BOOK,v.5-6,ZMLAC,UMD,UMD,,auto,0
0,9941795980001701,(OCoLC)185374,31953000030539Q,BOOK,v.9-10,ZMLAC,UMD,UMD,,auto,0
0,9941795980001701,(OCoLC)185374,31953000030545V,BOOK,v.16,ZMLAC,UMD,UMD,,auto,0
0,9947035710001701,(OCoLC)52016436,31951000156891L,BOOK,pt.2,ZMLAC,GEN,GEN,,stp,0
...,...,...,...,...,...,...,...,...,...,...,...
0,9912940300001701,(OCoLC)3844319,31951000059755L,BOOK,pt.A,ZMLAC,GEN,GEN,,auto,0
0,9912927890001701,(OCoLC)915014,31951002113466N,BOOK,v.2,ZMLAC,GEN,GEN,,auto,0
0,9912926740001701,(OCoLC)13097134,31951000888030W,BOOK,v.2,ZMLAC,GEN,GEN,,wils,0
0,9912926560001701,(OCoLC)1825999,31951000887587O,BOOK,v.2,ZMLAC,GEN,GEN,,walt,0


In [102]:
# Pick out the noted records whose note mentions damage (case-insensitive match on
# "brittle", "damage" or "deteriorat").
_mentions_damage = inotes['internal_note1'].str.contains('brittle|damage|deteriorat', case=False)
brittle = inotes[_mentions_damage]
brittle

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag
0,9947042610001701,(OCoLC)3976387,31951001182818P,BOOK,v.2,TWILS,CLS,CLS,,too brittle to bind - 10-89,0
0,9961719240001701,(OCoLC)3367026,31951001602303Z,BOOK,v.2,TWILS,GEN,WDN,TECHNICAL,"nos 120919, wd or replace c.1, c1 is in fair c...",0
0,9957974460001701,(OCoLC)2866547,31951P00279245K,BOOK,v.2,TSCI,GEN,GEN,,Microclimate Box -- Brittle Item / MORE BCODES...,0
0,9946346170001701,(OCoLC)1735102,31951T00359521N,BOOK,pt.1,TWILS,GOVU,WDN,,... Operation Babylift & Humanitarian Needs: H...,1
0,9942094520001701,(OCoLC)30689582,31951T00050578O,BOOK,v.1:pt.2-4,TLAW,GEN,WDN,,"more barcodes:31951D005991720; damaged, withdr...",0
...,...,...,...,...,...,...,...,...,...,...,...
0,9934245670001701,(OCoLC)6028518,31951P00279302Y,BOOK,v.3:pt.1,TSCI,GEN,GEN,,Microclimate box--brittle item / MORE BCODES:3...,0
0,9916087230001701,(OCoLC)5068456,31951001024235A,BOOK,text 1,TWILS,GEN,WDN,TECHNICAL,water damaged 2/07,0
0,9943915870001701,(OCoLC)30767997,31951P00315725J,BOOK,v.2,TSCI,GEN,GEN,,Walter Serials Processing--Brittle Item / MORE...,0
0,9973346005801701,(OCoLC)62381048,31951D02049403K,BOOK,v.2,TLAW,GENW,WDN,,"damaged/incomplete, withdrawn;",0


In [103]:
# Work on a copy and tag every damaged item with the condition code 'BRT'.
brittle2 = brittle.copy()
brittle2['condition'] = 'BRT'
brittle2

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition
0,9947042610001701,(OCoLC)3976387,31951001182818P,BOOK,v.2,TWILS,CLS,CLS,,too brittle to bind - 10-89,0,BRT
0,9961719240001701,(OCoLC)3367026,31951001602303Z,BOOK,v.2,TWILS,GEN,WDN,TECHNICAL,"nos 120919, wd or replace c.1, c1 is in fair c...",0,BRT
0,9957974460001701,(OCoLC)2866547,31951P00279245K,BOOK,v.2,TSCI,GEN,GEN,,Microclimate Box -- Brittle Item / MORE BCODES...,0,BRT
0,9946346170001701,(OCoLC)1735102,31951T00359521N,BOOK,pt.1,TWILS,GOVU,WDN,,... Operation Babylift & Humanitarian Needs: H...,1,BRT
0,9942094520001701,(OCoLC)30689582,31951T00050578O,BOOK,v.1:pt.2-4,TLAW,GEN,WDN,,"more barcodes:31951D005991720; damaged, withdr...",0,BRT
...,...,...,...,...,...,...,...,...,...,...,...,...
0,9934245670001701,(OCoLC)6028518,31951P00279302Y,BOOK,v.3:pt.1,TSCI,GEN,GEN,,Microclimate box--brittle item / MORE BCODES:3...,0,BRT
0,9916087230001701,(OCoLC)5068456,31951001024235A,BOOK,text 1,TWILS,GEN,WDN,TECHNICAL,water damaged 2/07,0,BRT
0,9943915870001701,(OCoLC)30767997,31951P00315725J,BOOK,v.2,TSCI,GEN,GEN,,Walter Serials Processing--Brittle Item / MORE...,0,BRT
0,9973346005801701,(OCoLC)62381048,31951D02049403K,BOOK,v.2,TLAW,GENW,WDN,,"damaged/incomplete, withdrawn;",0,BRT


In [104]:
# Reduce to the join key and the flag, ready to be merged back onto the full list.
brittle3 = brittle2.loc[:, ['barcode', 'condition']]
brittle3

Unnamed: 0,barcode,condition
0,31951001182818P,BRT
0,31951001602303Z,BRT
0,31951P00279245K,BRT
0,31951T00359521N,BRT
0,31951T00050578O,BRT
...,...,...
0,31951P00279302Y,BRT
0,31951001024235A,BRT
0,31951P00315725J,BRT
0,31951D02049403K,BRT


In [105]:
# Attach the condition flag to the full item list with a left join on barcode.
# The right-hand side is de-duplicated on barcode first: a barcode that appears
# more than once there would otherwise fan out into extra rows in the result.
# The merge also returns a frame with a fresh 0..n-1 index.
mpm3 = pd.merge(left=mpm2, right=brittle3.drop_duplicates(subset='barcode'), how='left', on='barcode')
mpm3

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition
0,9947053940001701,(OCoLC)179375,31951T004790751,BOOK,v.1:pt.1 (New England Middle Atlantic East Nor...,TWILS,GOVX,GOVX,,,1,
1,9947053940001701,(OCoLC)179375,31951T00479077X,BOOK,v.1:pt.6 (Mountain and Pacific Statistics for ...,TWILS,GOVX,GOVX,,,1,
2,9947053940001701,(OCoLC)179375,31951D034881172,BOOK,v.2:pt.3 (Western States Value of Farm Products),TWILS,GOVX,GOVX,,,1,
3,9947053940001701,(OCoLC)179375,31951D00949719I,BOOK,v.3 (General Report Statistics by Subject),TWILS,GOVX,GOVX,,,1,
4,9947053360001701,(OCoLC)180935,31953000025419H,BOOK,T.2,DUMD,UMDBK,UMDBK,,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...
775590,9912926190001701,(OCoLC)3871681,319510010159927,BOOK,v. 4,TWILS,GEN,GEN,,,0,
775591,9912926190001701,(OCoLC)3871681,319510010159943,BOOK,v. 6,TWILS,GEN,GEN,,,0,
775592,9912930790001701,(OCoLC)21155156,31953000123974S,BOOK,v.1,DUMD,UMDBK,UMDBK,,,0,
775593,9912930450001701,(OCoLC)1067046,31951000887687K,BOOK,v.6,ZMLAC,GEN,GEN,,wils,0,


In [106]:
# Spot-check: show only the rows that received a condition flag in the merge.
mpm3.dropna(subset=['condition'])

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition
14,9947042610001701,(OCoLC)3976387,31951001182818P,BOOK,v.2,TWILS,CLS,CLS,,too brittle to bind - 10-89,0,BRT
2297,9961719240001701,(OCoLC)3367026,31951001602303Z,BOOK,v.2,TWILS,GEN,WDN,TECHNICAL,"nos 120919, wd or replace c.1, c1 is in fair c...",0,BRT
3282,9957974460001701,(OCoLC)2866547,31951P00279245K,BOOK,v.2,TSCI,GEN,GEN,,Microclimate Box -- Brittle Item / MORE BCODES...,0,BRT
7806,9946346170001701,(OCoLC)1735102,31951T00359521N,BOOK,pt.1,TWILS,GOVU,WDN,,... Operation Babylift & Humanitarian Needs: H...,1,BRT
8464,9942094520001701,(OCoLC)30689582,31951T00050578O,BOOK,v.1:pt.2-4,TLAW,GEN,WDN,,"more barcodes:31951D005991720; damaged, withdr...",0,BRT
...,...,...,...,...,...,...,...,...,...,...,...,...
741534,9934245670001701,(OCoLC)6028518,31951P00279302Y,BOOK,v.3:pt.1,TSCI,GEN,GEN,,Microclimate box--brittle item / MORE BCODES:3...,0,BRT
745285,9916087230001701,(OCoLC)5068456,31951001024235A,BOOK,text 1,TWILS,GEN,WDN,TECHNICAL,water damaged 2/07,0,BRT
747750,9943915870001701,(OCoLC)30767997,31951P00315725J,BOOK,v.2,TSCI,GEN,GEN,,Walter Serials Processing--Brittle Item / MORE...,0,BRT
754123,9973346005801701,(OCoLC)62381048,31951D02049403K,BOOK,v.2,TLAW,GENW,WDN,,"damaged/incomplete, withdrawn;",0,BRT


In [107]:
# Items count as withdrawn when either the permanent or the current location is 'WDN'.
_is_withdrawn = mpm3[['perm_loc', 'curr_loc']].eq('WDN').any(axis=1)
wdn = mpm3[_is_withdrawn]
wdn

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition
8,9947056480001701,(OCoLC)229815,319530000084762,BOOK,v.3,DUMD,UMDBK,WDN,TECHNICAL,,0,
9,9947055650001701,(OCoLC)197985,31953000022802X,BOOK,no.4,DUMD,UMDBK,WDN,TECHNICAL,,0,
59,9947008500001701,(OCoLC)3273753,31951001218685N,BOOK,v.1,TWILS,GEN,WDN,TECHNICAL,,0,
98,9949254370001701,(OCoLC)713765739,31951P007710350,BOOK,v.2,TWILS,CLS,WDN,TECHNICAL,MORE BCODES:31951002176416N,0,
113,9949254900001701,(OCoLC)152551416,319510021773942,BOOK,v.3,TWILS,AME,WDN,TECHNICAL,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...
775423,9960545710001701,(OCoLC)237723,31953000581577K,BOOK,v.2,DUMD,UMDBK,WDN,TECHNICAL,,0,
775480,9960517080001701,(OCoLC)9971044,31951D001409711,BOOK,v.5,TWILS,GEN,WDN,,,0,
775502,9912957220001701,(OCoLC)17505224,31951002352230J,BOOK,v.1,TWILS,CLS,WDN,TECHNICAL,,0,
775550,9912933050001701,(OCoLC)4211300,31951001016122J,BOOK,v. 3,TWILS,GEN,WDN,TECHNICAL,,0,


In [108]:
# Work on a copy and tag every withdrawn item with the holding status 'WD'.
wdn2 = wdn.copy()
wdn2['holding_status'] = 'WD'
wdn2

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status
8,9947056480001701,(OCoLC)229815,319530000084762,BOOK,v.3,DUMD,UMDBK,WDN,TECHNICAL,,0,,WD
9,9947055650001701,(OCoLC)197985,31953000022802X,BOOK,no.4,DUMD,UMDBK,WDN,TECHNICAL,,0,,WD
59,9947008500001701,(OCoLC)3273753,31951001218685N,BOOK,v.1,TWILS,GEN,WDN,TECHNICAL,,0,,WD
98,9949254370001701,(OCoLC)713765739,31951P007710350,BOOK,v.2,TWILS,CLS,WDN,TECHNICAL,MORE BCODES:31951002176416N,0,,WD
113,9949254900001701,(OCoLC)152551416,319510021773942,BOOK,v.3,TWILS,AME,WDN,TECHNICAL,,0,,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...
775423,9960545710001701,(OCoLC)237723,31953000581577K,BOOK,v.2,DUMD,UMDBK,WDN,TECHNICAL,,0,,WD
775480,9960517080001701,(OCoLC)9971044,31951D001409711,BOOK,v.5,TWILS,GEN,WDN,,,0,,WD
775502,9912957220001701,(OCoLC)17505224,31951002352230J,BOOK,v.1,TWILS,CLS,WDN,TECHNICAL,,0,,WD
775550,9912933050001701,(OCoLC)4211300,31951001016122J,BOOK,v. 3,TWILS,GEN,WDN,TECHNICAL,,0,,WD


In [109]:
# Reduce to the join key and the status, ready to be merged back onto the full list.
wdn3 = wdn2.loc[:, ['barcode', 'holding_status']]
wdn3

Unnamed: 0,barcode,holding_status
8,319530000084762,WD
9,31953000022802X,WD
59,31951001218685N,WD
98,31951P007710350,WD
113,319510021773942,WD
...,...,...
775423,31953000581577K,WD
775480,31951D001409711,WD
775502,31951002352230J,WD
775550,31951001016122J,WD


In [110]:
# Attach the withdrawn status to the full item list with a left join on barcode.
# The right-hand side is de-duplicated on barcode first: a barcode that appears
# more than once there would otherwise fan out into extra rows in the result.
# The merge also returns a frame with a fresh 0..n-1 index.
mpm4 = pd.merge(left=mpm3, right=wdn3.drop_duplicates(subset='barcode'), how='left', on='barcode')
mpm4

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status
0,9947053940001701,(OCoLC)179375,31951T004790751,BOOK,v.1:pt.1 (New England Middle Atlantic East Nor...,TWILS,GOVX,GOVX,,,1,,
1,9947053940001701,(OCoLC)179375,31951T00479077X,BOOK,v.1:pt.6 (Mountain and Pacific Statistics for ...,TWILS,GOVX,GOVX,,,1,,
2,9947053940001701,(OCoLC)179375,31951D034881172,BOOK,v.2:pt.3 (Western States Value of Farm Products),TWILS,GOVX,GOVX,,,1,,
3,9947053940001701,(OCoLC)179375,31951D00949719I,BOOK,v.3 (General Report Statistics by Subject),TWILS,GOVX,GOVX,,,1,,
4,9947053360001701,(OCoLC)180935,31953000025419H,BOOK,T.2,DUMD,UMDBK,UMDBK,,,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
775590,9912926190001701,(OCoLC)3871681,319510010159927,BOOK,v. 4,TWILS,GEN,GEN,,,0,,
775591,9912926190001701,(OCoLC)3871681,319510010159943,BOOK,v. 6,TWILS,GEN,GEN,,,0,,
775592,9912930790001701,(OCoLC)21155156,31953000123974S,BOOK,v.1,DUMD,UMDBK,UMDBK,,,0,,
775593,9912930450001701,(OCoLC)1067046,31951000887687K,BOOK,v.6,ZMLAC,GEN,GEN,,wils,0,,


In [111]:
# Spot-check: show only the rows that received a holding status in the merge.
mpm4.dropna(subset=['holding_status'])

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status
8,9947056480001701,(OCoLC)229815,319530000084762,BOOK,v.3,DUMD,UMDBK,WDN,TECHNICAL,,0,,WD
9,9947055650001701,(OCoLC)197985,31953000022802X,BOOK,no.4,DUMD,UMDBK,WDN,TECHNICAL,,0,,WD
59,9947008500001701,(OCoLC)3273753,31951001218685N,BOOK,v.1,TWILS,GEN,WDN,TECHNICAL,,0,,WD
98,9949254370001701,(OCoLC)713765739,31951P007710350,BOOK,v.2,TWILS,CLS,WDN,TECHNICAL,MORE BCODES:31951002176416N,0,,WD
113,9949254900001701,(OCoLC)152551416,319510021773942,BOOK,v.3,TWILS,AME,WDN,TECHNICAL,,0,,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...
775423,9960545710001701,(OCoLC)237723,31953000581577K,BOOK,v.2,DUMD,UMDBK,WDN,TECHNICAL,,0,,WD
775480,9960517080001701,(OCoLC)9971044,31951D001409711,BOOK,v.5,TWILS,GEN,WDN,,,0,,WD
775502,9912957220001701,(OCoLC)17505224,31951002352230J,BOOK,v.1,TWILS,CLS,WDN,TECHNICAL,,0,,WD
775550,9912933050001701,(OCoLC)4211300,31951001016122J,BOOK,v. 3,TWILS,GEN,WDN,TECHNICAL,,0,,WD


In [112]:
# Restrict to the items that have a process type recorded.
proctype = mpm4.dropna(subset=['process_type'])
proctype

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status
8,9947056480001701,(OCoLC)229815,319530000084762,BOOK,v.3,DUMD,UMDBK,WDN,TECHNICAL,,0,,WD
9,9947055650001701,(OCoLC)197985,31953000022802X,BOOK,no.4,DUMD,UMDBK,WDN,TECHNICAL,,0,,WD
59,9947008500001701,(OCoLC)3273753,31951001218685N,BOOK,v.1,TWILS,GEN,WDN,TECHNICAL,,0,,WD
98,9949254370001701,(OCoLC)713765739,31951P007710350,BOOK,v.2,TWILS,CLS,WDN,TECHNICAL,MORE BCODES:31951002176416N,0,,WD
113,9949254900001701,(OCoLC)152551416,319510021773942,BOOK,v.3,TWILS,AME,WDN,TECHNICAL,,0,,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...
775502,9912957220001701,(OCoLC)17505224,31951002352230J,BOOK,v.1,TWILS,CLS,WDN,TECHNICAL,,0,,WD
775513,9912959700001701,(OCoLC)2275739,31951002352055B,BOOK,v.1,ZMLAC,GEN,GEN,WORK_ORDER_DEPARTMENT,,0,,
775531,9912931970001701,(OCoLC)73555262,31951D026840861,BOOK,v.1,TMUSI,GEN,GEN,LOAN,,0,,
775550,9912933050001701,(OCoLC)4211300,31951001016122J,BOOK,v. 3,TWILS,GEN,WDN,TECHNICAL,,0,,WD


In [113]:
# Select items that look lost or missing and that have no holding status yet.
# NOTE(review): the search runs on proctype, so an item whose note mentions
# lost/missing but whose process_type is empty is not picked up -- confirm that
# this is intended.
# na=False makes a missing value count as "no match" explicitly, instead of
# relying on NaN being coerced inside the boolean operators.
_lost_by_process = proctype['process_type'].str.contains('missing|lost', case=False, na=False)
_lost_by_note = proctype['internal_note1'].str.contains('missing|lost', case=False, na=False)
_no_status_yet = proctype['holding_status'].isnull()
lost_msg = proctype[(_lost_by_process | _lost_by_note) & _no_status_yet]
lost_msg

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status
826,9928862370001701,(OCoLC)10558661,319510001216392,BOOK,v.2,TBIOM,GEN,GEN,MISSING,,0,,
2766,9948357770001701,(OCoLC)530321,31951000166922U,BOOK,v.1,TCOS,SN2,SN2,MISSING,,0,,
12851,9925640230001701,(OCoLC)133811,31951000479730X,BOOK,v.1,ZMLAC,GEN,GEN,MISSING,"Never accessioned at MLAC. Toggled to ""Missing...",0,,
18370,9913234850001701,(OCoLC)2663373,31951D00475903Q,BOOK,v.2,TLAW,GEN,GEN,MISSING,$115 lost book fee transferred to student acco...,0,,
19883,9940670520001701,(OCoLC)10723217,31956000227690,BOOK,v.2,MBRIG,GEN,GEN,MISSING,Z7963.A75 T84 1984 v.2,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
762591,9953425320001701,(OCoLC)26030861,31951D00750410T,BOOK,v.1-10,ZMLAC,GEN,GEN,MISSING,,0,,
767663,9925924940001701,(OCoLC)191927868,31956004912347,BOOK,v.1,MBRIG,GEN,GEN,LOST_LOAN,,0,,
770661,9955763450001701,(OCoLC)28974907,31951D01035914T,BOOK,v.1-4,ZMLAC,GEN,GEN,MISSING,auto,0,,
772742,9952500290001701,(OCoLC)1039528914,319530011925619,BOOK,v.7,DUMD,UMDBK,UMDBK,LOST_LOAN,,0,,


In [114]:
# Work on a copy and tag every lost/missing item with the holding status 'LM'.
lost_msg2 = lost_msg.copy()
lost_msg2['holding_status'] = 'LM'
lost_msg2

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status
826,9928862370001701,(OCoLC)10558661,319510001216392,BOOK,v.2,TBIOM,GEN,GEN,MISSING,,0,,LM
2766,9948357770001701,(OCoLC)530321,31951000166922U,BOOK,v.1,TCOS,SN2,SN2,MISSING,,0,,LM
12851,9925640230001701,(OCoLC)133811,31951000479730X,BOOK,v.1,ZMLAC,GEN,GEN,MISSING,"Never accessioned at MLAC. Toggled to ""Missing...",0,,LM
18370,9913234850001701,(OCoLC)2663373,31951D00475903Q,BOOK,v.2,TLAW,GEN,GEN,MISSING,$115 lost book fee transferred to student acco...,0,,LM
19883,9940670520001701,(OCoLC)10723217,31956000227690,BOOK,v.2,MBRIG,GEN,GEN,MISSING,Z7963.A75 T84 1984 v.2,0,,LM
...,...,...,...,...,...,...,...,...,...,...,...,...,...
762591,9953425320001701,(OCoLC)26030861,31951D00750410T,BOOK,v.1-10,ZMLAC,GEN,GEN,MISSING,,0,,LM
767663,9925924940001701,(OCoLC)191927868,31956004912347,BOOK,v.1,MBRIG,GEN,GEN,LOST_LOAN,,0,,LM
770661,9955763450001701,(OCoLC)28974907,31951D01035914T,BOOK,v.1-4,ZMLAC,GEN,GEN,MISSING,auto,0,,LM
772742,9952500290001701,(OCoLC)1039528914,319530011925619,BOOK,v.7,DUMD,UMDBK,UMDBK,LOST_LOAN,,0,,LM


In [115]:
# Copy the values from lost_msg2 into mpm4 IN PLACE; rows are matched on index label.
# overwrite=False only fills cells that are currently NaN in mpm4, so values that
# are already set are left alone.
mpm4.update(lost_msg2, overwrite=False)
mpm4

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status
0,9947053940001701,(OCoLC)179375,31951T004790751,BOOK,v.1:pt.1 (New England Middle Atlantic East Nor...,TWILS,GOVX,GOVX,,,1,,
1,9947053940001701,(OCoLC)179375,31951T00479077X,BOOK,v.1:pt.6 (Mountain and Pacific Statistics for ...,TWILS,GOVX,GOVX,,,1,,
2,9947053940001701,(OCoLC)179375,31951D034881172,BOOK,v.2:pt.3 (Western States Value of Farm Products),TWILS,GOVX,GOVX,,,1,,
3,9947053940001701,(OCoLC)179375,31951D00949719I,BOOK,v.3 (General Report Statistics by Subject),TWILS,GOVX,GOVX,,,1,,
4,9947053360001701,(OCoLC)180935,31953000025419H,BOOK,T.2,DUMD,UMDBK,UMDBK,,,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
775590,9912926190001701,(OCoLC)3871681,319510010159927,BOOK,v. 4,TWILS,GEN,GEN,,,0,,
775591,9912926190001701,(OCoLC)3871681,319510010159943,BOOK,v. 6,TWILS,GEN,GEN,,,0,,
775592,9912930790001701,(OCoLC)21155156,31953000123974S,BOOK,v.1,DUMD,UMDBK,UMDBK,,,0,,
775593,9912930450001701,(OCoLC)1067046,31951000887687K,BOOK,v.6,ZMLAC,GEN,GEN,,wils,0,,


In [116]:
# Spot-check: show the rows that now carry the 'LM' holding status.
mpm4[mpm4['holding_status'].eq('LM')]


Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status
826,9928862370001701,(OCoLC)10558661,319510001216392,BOOK,v.2,TBIOM,GEN,GEN,MISSING,,0,,LM
2766,9948357770001701,(OCoLC)530321,31951000166922U,BOOK,v.1,TCOS,SN2,SN2,MISSING,,0,,LM
12851,9925640230001701,(OCoLC)133811,31951000479730X,BOOK,v.1,ZMLAC,GEN,GEN,MISSING,"Never accessioned at MLAC. Toggled to ""Missing...",0,,LM
18370,9913234850001701,(OCoLC)2663373,31951D00475903Q,BOOK,v.2,TLAW,GEN,GEN,MISSING,$115 lost book fee transferred to student acco...,0,,LM
19883,9940670520001701,(OCoLC)10723217,31956000227690,BOOK,v.2,MBRIG,GEN,GEN,MISSING,Z7963.A75 T84 1984 v.2,0,,LM
...,...,...,...,...,...,...,...,...,...,...,...,...,...
762591,9953425320001701,(OCoLC)26030861,31951D00750410T,BOOK,v.1-10,ZMLAC,GEN,GEN,MISSING,,0,,LM
767663,9925924940001701,(OCoLC)191927868,31956004912347,BOOK,v.1,MBRIG,GEN,GEN,LOST_LOAN,,0,,LM
770661,9955763450001701,(OCoLC)28974907,31951D01035914T,BOOK,v.1-4,ZMLAC,GEN,GEN,MISSING,auto,0,,LM
772742,9952500290001701,(OCoLC)1039528914,319530011925619,BOOK,v.7,DUMD,UMDBK,UMDBK,LOST_LOAN,,0,,LM


In [117]:
# Everything that still has no holding status at this point.
_status_missing = mpm4['holding_status'].isna()
currhol = mpm4[_status_missing]
currhol

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status
0,9947053940001701,(OCoLC)179375,31951T004790751,BOOK,v.1:pt.1 (New England Middle Atlantic East Nor...,TWILS,GOVX,GOVX,,,1,,
1,9947053940001701,(OCoLC)179375,31951T00479077X,BOOK,v.1:pt.6 (Mountain and Pacific Statistics for ...,TWILS,GOVX,GOVX,,,1,,
2,9947053940001701,(OCoLC)179375,31951D034881172,BOOK,v.2:pt.3 (Western States Value of Farm Products),TWILS,GOVX,GOVX,,,1,,
3,9947053940001701,(OCoLC)179375,31951D00949719I,BOOK,v.3 (General Report Statistics by Subject),TWILS,GOVX,GOVX,,,1,,
4,9947053360001701,(OCoLC)180935,31953000025419H,BOOK,T.2,DUMD,UMDBK,UMDBK,,,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
775590,9912926190001701,(OCoLC)3871681,319510010159927,BOOK,v. 4,TWILS,GEN,GEN,,,0,,
775591,9912926190001701,(OCoLC)3871681,319510010159943,BOOK,v. 6,TWILS,GEN,GEN,,,0,,
775592,9912930790001701,(OCoLC)21155156,31953000123974S,BOOK,v.1,DUMD,UMDBK,UMDBK,,,0,,
775593,9912930450001701,(OCoLC)1067046,31951000887687K,BOOK,v.6,ZMLAC,GEN,GEN,,wils,0,,


In [118]:
# Work on a copy and tag every remaining item with the holding status 'CH'.
currhol2 = currhol.copy()
currhol2['holding_status'] = 'CH'
currhol2

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status
0,9947053940001701,(OCoLC)179375,31951T004790751,BOOK,v.1:pt.1 (New England Middle Atlantic East Nor...,TWILS,GOVX,GOVX,,,1,,CH
1,9947053940001701,(OCoLC)179375,31951T00479077X,BOOK,v.1:pt.6 (Mountain and Pacific Statistics for ...,TWILS,GOVX,GOVX,,,1,,CH
2,9947053940001701,(OCoLC)179375,31951D034881172,BOOK,v.2:pt.3 (Western States Value of Farm Products),TWILS,GOVX,GOVX,,,1,,CH
3,9947053940001701,(OCoLC)179375,31951D00949719I,BOOK,v.3 (General Report Statistics by Subject),TWILS,GOVX,GOVX,,,1,,CH
4,9947053360001701,(OCoLC)180935,31953000025419H,BOOK,T.2,DUMD,UMDBK,UMDBK,,,0,,CH
...,...,...,...,...,...,...,...,...,...,...,...,...,...
775590,9912926190001701,(OCoLC)3871681,319510010159927,BOOK,v. 4,TWILS,GEN,GEN,,,0,,CH
775591,9912926190001701,(OCoLC)3871681,319510010159943,BOOK,v. 6,TWILS,GEN,GEN,,,0,,CH
775592,9912930790001701,(OCoLC)21155156,31953000123974S,BOOK,v.1,DUMD,UMDBK,UMDBK,,,0,,CH
775593,9912930450001701,(OCoLC)1067046,31951000887687K,BOOK,v.6,ZMLAC,GEN,GEN,,wils,0,,CH


In [119]:
# Copy the values from currhol2 into mpm4 IN PLACE; rows are matched on index label.
# overwrite=False only fills cells that are currently NaN in mpm4, so values that
# are already set are left alone.
mpm4.update(currhol2, overwrite=False)
mpm4

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status
0,9947053940001701,(OCoLC)179375,31951T004790751,BOOK,v.1:pt.1 (New England Middle Atlantic East Nor...,TWILS,GOVX,GOVX,,,1,,CH
1,9947053940001701,(OCoLC)179375,31951T00479077X,BOOK,v.1:pt.6 (Mountain and Pacific Statistics for ...,TWILS,GOVX,GOVX,,,1,,CH
2,9947053940001701,(OCoLC)179375,31951D034881172,BOOK,v.2:pt.3 (Western States Value of Farm Products),TWILS,GOVX,GOVX,,,1,,CH
3,9947053940001701,(OCoLC)179375,31951D00949719I,BOOK,v.3 (General Report Statistics by Subject),TWILS,GOVX,GOVX,,,1,,CH
4,9947053360001701,(OCoLC)180935,31953000025419H,BOOK,T.2,DUMD,UMDBK,UMDBK,,,0,,CH
...,...,...,...,...,...,...,...,...,...,...,...,...,...
775590,9912926190001701,(OCoLC)3871681,319510010159927,BOOK,v. 4,TWILS,GEN,GEN,,,0,,CH
775591,9912926190001701,(OCoLC)3871681,319510010159943,BOOK,v. 6,TWILS,GEN,GEN,,,0,,CH
775592,9912930790001701,(OCoLC)21155156,31953000123974S,BOOK,v.1,DUMD,UMDBK,UMDBK,,,0,,CH
775593,9912930450001701,(OCoLC)1067046,31951000887687K,BOOK,v.6,ZMLAC,GEN,GEN,,wils,0,,CH


In [120]:
# List the distinct holding_status values now present in mpm4.
mpm4['holding_status'].unique()

array(['CH', 'WD', 'LM'], dtype=object)

In [121]:
# Library codes longer than 5 characters are treated as suspect.
# If the resulting dataframe is not empty, logic must be added to clean up the
# invalid library codes.
_lib_code_length = mpm4['perm_lib'].astype(str).str.len()
weird_lib = mpm4[_lib_code_length > 5]
weird_lib

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status


In [122]:
# List the distinct permanent-library codes that remain in mpm4.
mpm4['perm_lib'].unique()

array(['TWILS', 'DUMD', 'TLAW', 'ZMLAC', 'TMUSI', 'TSCI', 'TBIOM',
       'TMAGR', 'TARCH', 'TVET', 'MBRIG', 'TCOS', 'TAND', 'TBWAN',
       'TLAKE', 'TNRL', 'TAHL', 'TMATH', 'CUMC', 'TJOUR', 'RUMR', 'TFOR'],
      dtype=object)

In [123]:
#use this cell to query any weird lib for more information
#dnrri = mpm4[mpm4['perm_lib'].str.contains('dnrri', case=False)]
#dnrri

In [124]:
# A semicolon in the OCN field is taken to mean more than one OCN was recorded.
# If the resulting dataframe is not empty, logic must be added to clean these up.
_has_semicolon = mpm4['OCN'].str.contains(';')
double_ocn = mpm4.loc[_has_semicolon]
double_ocn

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status


In [125]:
#if the resulting dataframe is not empty, need to add logic to clean up OCNs with issues
# An OCN is flagged when it contains any whitespace or is longer than 19 characters.
# The pattern is a raw string: '\s' in a plain string literal is an invalid escape
# sequence that Python warns about. The separate check for ' ' is gone because
# \s already matches a space.
_has_whitespace = mpm4['OCN'].str.contains(r'\s')
_too_long = mpm4['OCN'].str.len() > 19
bad_ocn = mpm4[_has_whitespace | _too_long]
bad_ocn

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status


In [126]:
#look to see if there are any OCNs shorter than 9 characters in total,
#counting the 7-character "(OCoLC)" prefix
short_ocns = mpm4[mpm4['OCN'].str.len() < 9]
short_ocns

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status


In [127]:
# Flag OCNs where the "(OCoLC)" prefix is followed by a non-digit, or where the
# number after the prefix begins with 0.
# The patterns are raw strings: '\(' and '\)' in a plain string literal are invalid
# escape sequences that Python warns about.
text_check = re.compile(r'(\(OCoLC\))[^0-9]')
first_zero = re.compile(r'(\(OCoLC\))0')
fix_ocns = mpm4[(mpm4['OCN'].str.match(text_check)) | (mpm4['OCN'].str.match(first_zero))]
fix_ocns

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status


In [128]:
#if the resulting dataframe is not empty, need to add logic to clean up multiple MMS ID values
# An MMS ID should consist of digits only, so flag any value that contains a
# non-digit ANYWHERE. str.contains searches the whole string; str.match only tests
# from the first character and would miss a separator between two joined IDs.
mmsid_check = re.compile(r'\D')
multi_mmsid = mpm4[mpm4['MMS ID'].str.contains(mmsid_check.pattern)]
multi_mmsid

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status


In [129]:
# Items with no description at all.
_description_missing = mpm4['description'].isna()
no_description = mpm4[_description_missing]
no_description

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status


In [130]:
#find items that are described as CD, CD-ROM, or DVD
#this is a case-sensitive substring search, so any description that merely contains
#"CD" or "DVD" is returned as well
is_disc = mpm4['description'].str.contains('CD|DVD|CD-ROM')
disc_desc = mpm4[is_disc]
disc_desc

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status
784,9928899500001701,(OCoLC)33335400,31951D01186716R,BOOK,CD-4613,TMUSI,GEN,GEN,,,0,,CH
6897,9927816030001701,(OCoLC)47112155,31956004600595,BOOK,CD-ROM T.E.,MBRIG,TED,TED,,,0,,CH
8317,9965861500001701,(OCoLC)48040880,31951D02110077R,BOOK,CD,ZMLAC,GEN,WDN,TECHNICAL,,0,,WD
8827,9926713780001701,(OCoLC)300404759,31951T00393922B,BOOK,DVD,TWILS,GEN,GEN,,,0,,CH
15456,9943049980001701,(OCoLC)44929485,31951D02317312U,BOOK,CD-ROM 2003,TWILS,GEN,GEN,,,0,,CH
...,...,...,...,...,...,...,...,...,...,...,...,...,...
758710,9915047720001701,(OCoLC)45049524,31951D02340789W,BOOK,bd.2:CD,TWILS,GEN,GEN,,,0,,CH
759014,9957009160001701,(OCoLC)43245679,31951D016560138,BOOK,CD-ROM,TARCH,GEN,WDN,TECHNICAL,,0,,WD
760932,9927403320001701,(OCoLC)37947378,31951D01582549C,BOOK,CD,TMUSI,GEN,GEN,,,0,,CH
771891,9939416670001701,(OCoLC)38878696,31951D01582599X,BOOK,CD,TMUSI,GEN,GEN,,,0,,CH


In [131]:
#Drop CDs and DVDs: keep every row whose description does NOT match the disc pattern.
#Note the number of rows from the command above with currhol2, then compare the row counts:
#rows in mpm4 minus rows in disc_desc should equal rows in disc_drop.
#If so, proceed to make mpm4 = disc_drop
disc_drop = mpm4.loc[lambda df: ~df['description'].str.contains('CD|DVD|CD-ROM')]
disc_drop

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status
0,9947053940001701,(OCoLC)179375,31951T004790751,BOOK,v.1:pt.1 (New England Middle Atlantic East Nor...,TWILS,GOVX,GOVX,,,1,,CH
1,9947053940001701,(OCoLC)179375,31951T00479077X,BOOK,v.1:pt.6 (Mountain and Pacific Statistics for ...,TWILS,GOVX,GOVX,,,1,,CH
2,9947053940001701,(OCoLC)179375,31951D034881172,BOOK,v.2:pt.3 (Western States Value of Farm Products),TWILS,GOVX,GOVX,,,1,,CH
3,9947053940001701,(OCoLC)179375,31951D00949719I,BOOK,v.3 (General Report Statistics by Subject),TWILS,GOVX,GOVX,,,1,,CH
4,9947053360001701,(OCoLC)180935,31953000025419H,BOOK,T.2,DUMD,UMDBK,UMDBK,,,0,,CH
...,...,...,...,...,...,...,...,...,...,...,...,...,...
775590,9912926190001701,(OCoLC)3871681,319510010159927,BOOK,v. 4,TWILS,GEN,GEN,,,0,,CH
775591,9912926190001701,(OCoLC)3871681,319510010159943,BOOK,v. 6,TWILS,GEN,GEN,,,0,,CH
775592,9912930790001701,(OCoLC)21155156,31953000123974S,BOOK,v.1,DUMD,UMDBK,UMDBK,,,0,,CH
775593,9912930450001701,(OCoLC)1067046,31951000887687K,BOOK,v.6,ZMLAC,GEN,GEN,,wils,0,,CH


In [132]:
#accept the disc-free frame as the working frame (run only after the row counts were checked)
#from here on mpm4 no longer contains the rows matched by the disc pattern
mpm4 = disc_drop

In [133]:
#HT says the long description error doesn't prevent matching, it's just for user information
#list the rows whose description is longer than 60 characters for reference; nothing is changed
long_desc = mpm4.loc[lambda df: df['description'].str.len().gt(60)]
long_desc

Unnamed: 0,MMS ID,OCN,barcode,material type,description,perm_lib,perm_loc,curr_loc,process_type,internal_note1,gov_doc_flag,condition,holding_status
0,9947053940001701,(OCoLC)179375,31951T004790751,BOOK,v.1:pt.1 (New England Middle Atlantic East Nor...,TWILS,GOVX,GOVX,,,1,,CH
12643,9949302950001701,(OCoLC)8122085,31951D024629772,BOOK,11A (Transportation Systems Inland Waterways -...,TWILS,GOVU,GOVU,,,1,,CH
26154,9949302950001701,(OCoLC)8122085,31951D02462990A,BOOK,2L (German Military Government over Europe: th...,TWILS,GOVU,GOVU,,,1,,CH
26156,9949302950001701,(OCoLC)8122085,31951D02462979Y,BOOK,10E&F (Public Works and Utilities: Urban Trans...,TWILS,GOVU,GOVU,,,1,,CH
33816,9973120460001701,(OCoLC)7525652,31951D03747468H,BOOK,v.6:pt.1 (Selected Service Trades - Area Stati...,TWILS,GOVX,GOVX,,,1,,CH
88283,9949680240001701,(OCoLC)5204825,31958000768921,BOOK,Inscribed to Dustin Bergh by Conrad G Servig. ...,CUMC,ARCV,ARCV,,31958000803686 old barcode,0,,CH
103599,9932656890001701,(OCoLC)2271968,31951SE1008461J,BOOK,"3 page summary on ""A Vision"", by William Butle...",TAND,MSSLT,MSSLT,,,0,,CH
112028,9924878560001701,(OCoLC)56011447,31958000765422,BOOK,Atlas kept in the Atlas case on the second flo...,CUMC,REF,REF,,,0,,CH
116630,9959263490001701,(OCoLC)156814680,31951D027976182,BOOK,v.1; displayed on top of short shelves in Gene...,TARCH,GEN,GEN,,,0,,CH
122646,9961289150001701,(OCoLC)359943584,31951D025650128,BOOK,Enterobacter sakazakii (Cronobacter spp.) in P...,TBIOM,GEN,WDN,,,0,,WD


In [134]:
#relabel the columns that go into the export with their output headings;
#columns not listed here keep their current names
export_headings = {
    "OCN": "OCLC #",
    "MMS ID": "Partner's Local System ID",
    "condition": "Condition",
    "gov_doc_flag": "Government Documents Indicator",
    "description": "Item-specific enumeration and chronology",
    "holding_status": "Holding status",
}
#index=str passes every row label through str(), so the row index becomes strings
mpm5 = mpm4.rename(index=str, columns=export_headings)
mpm5

Unnamed: 0,Partner's Local System ID,OCLC #,barcode,material type,Item-specific enumeration and chronology,perm_lib,perm_loc,curr_loc,process_type,internal_note1,Government Documents Indicator,Condition,Holding status
0,9947053940001701,(OCoLC)179375,31951T004790751,BOOK,v.1:pt.1 (New England Middle Atlantic East Nor...,TWILS,GOVX,GOVX,,,1,,CH
1,9947053940001701,(OCoLC)179375,31951T00479077X,BOOK,v.1:pt.6 (Mountain and Pacific Statistics for ...,TWILS,GOVX,GOVX,,,1,,CH
2,9947053940001701,(OCoLC)179375,31951D034881172,BOOK,v.2:pt.3 (Western States Value of Farm Products),TWILS,GOVX,GOVX,,,1,,CH
3,9947053940001701,(OCoLC)179375,31951D00949719I,BOOK,v.3 (General Report Statistics by Subject),TWILS,GOVX,GOVX,,,1,,CH
4,9947053360001701,(OCoLC)180935,31953000025419H,BOOK,T.2,DUMD,UMDBK,UMDBK,,,0,,CH
...,...,...,...,...,...,...,...,...,...,...,...,...,...
775590,9912926190001701,(OCoLC)3871681,319510010159927,BOOK,v. 4,TWILS,GEN,GEN,,,0,,CH
775591,9912926190001701,(OCoLC)3871681,319510010159943,BOOK,v. 6,TWILS,GEN,GEN,,,0,,CH
775592,9912930790001701,(OCoLC)21155156,31953000123974S,BOOK,v.1,DUMD,UMDBK,UMDBK,,,0,,CH
775593,9912930450001701,(OCoLC)1067046,31951000887687K,BOOK,v.6,ZMLAC,GEN,GEN,,wils,0,,CH


In [135]:
#show the column labels of mpm5 to confirm the rename
mpm5.columns

Index(['Partner's Local System ID', 'OCLC #', 'barcode', 'material type',
       'Item-specific enumeration and chronology', 'perm_lib', 'perm_loc',
       'curr_loc', 'process_type', 'internal_note1',
       'Government Documents Indicator', 'Condition', 'Holding status'],
      dtype='object')

In [136]:
#keep only the six export columns, in the order they should appear in the output file
export_columns = [
    "OCLC #",
    "Partner's Local System ID",
    "Holding status",
    "Condition",
    "Item-specific enumeration and chronology",
    "Government Documents Indicator",
]
mpm6 = mpm5[export_columns]
mpm6

Unnamed: 0,OCLC #,Partner's Local System ID,Holding status,Condition,Item-specific enumeration and chronology,Government Documents Indicator
0,(OCoLC)179375,9947053940001701,CH,,v.1:pt.1 (New England Middle Atlantic East Nor...,1
1,(OCoLC)179375,9947053940001701,CH,,v.1:pt.6 (Mountain and Pacific Statistics for ...,1
2,(OCoLC)179375,9947053940001701,CH,,v.2:pt.3 (Western States Value of Farm Products),1
3,(OCoLC)179375,9947053940001701,CH,,v.3 (General Report Statistics by Subject),1
4,(OCoLC)180935,9947053360001701,CH,,T.2,0
...,...,...,...,...,...,...
775590,(OCoLC)3871681,9912926190001701,CH,,v. 4,0
775591,(OCoLC)3871681,9912926190001701,CH,,v. 6,0
775592,(OCoLC)21155156,9912930790001701,CH,,v.1,0
775593,(OCoLC)1067046,9912930450001701,CH,,v.6,0


In [137]:
#random multivolume checker just for validation purposes.
#This OCN should have four volumes and is not a gov doc.
#substring search: any 'OCLC #' value containing these digits is returned
multi_vol = mpm6.loc[lambda df: df['OCLC #'].str.contains('456656472')]
multi_vol

Unnamed: 0,OCLC #,Partner's Local System ID,Holding status,Condition,Item-specific enumeration and chronology,Government Documents Indicator
338320,(OCoLC)456656472,9930503240001701,CH,,v.2,0
338321,(OCoLC)456656472,9930503240001701,CH,,v.4,0
436267,(OCoLC)456656472,9930503240001701,CH,,v.3,0
503382,(OCoLC)456656472,9930503240001701,CH,,v.1,0


In [None]:
#update the date in export_stem before running this cell
#both output files are named from this one value so their dates cannot drift apart
export_stem = 'umn_multi-part_20200619'
#pickle copy of the final frame
mpm6.to_pickle(export_stem + '.pkl')
#tab-separated text file, written without the row index
mpm6.to_csv(export_stem + '.txt', sep='\t', index=False)
#shape is the last expression so the exported (rows, columns) count is actually displayed;
#as a bare expression in the middle of the cell it produced no output
mpm6.shape

In [None]:
#number of distinct "Partner's Local System ID" values in the exported frame
mpm6["Partner's Local System ID"].nunique()

In [None]:
#number of distinct "OCLC #" values in the exported frame
mpm6["OCLC #"].nunique()