# Validation of Two Data Sets

#### Import required libraries and data

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
#https://data.cityofchicago.org/Public-Safety/Police-ANOV-Misdemeanor-Report-ANOVs/bi66-5gy5
df = pd.read_csv('Police_ANOV_Misdemeanor_Report_-_ANOVs.csv')

#### Explore Data Set and Create Two Sets - one with artificial errors

In [3]:
df.describe()

Unnamed: 0,STREET NUMBER,UNIT NUMBER,Boundaries - ZIP Codes,Community Areas,Zip Codes,Census Tracts,Wards
count,649745.0,649745.0,559589.0,559588.0,561792.0,559916.0,559588.0
mean,3639.260941,98.917828,32.582227,41.853219,19245.651946,380.970687,23.532502
std,3500.278698,180.256228,17.734752,19.495181,5151.188757,235.844116,15.033917
min,0.0,0.0,1.0,1.0,2733.0,1.0,1.0
25%,800.0,7.0,19.0,27.0,21182.0,166.0,9.0
50%,3200.0,15.0,30.0,38.0,21559.0,368.0,23.0
75%,5800.0,191.0,48.0,61.0,22216.0,584.0,36.0
max,119000.0,933.0,61.0,77.0,26912.0,801.0,50.0


In [4]:
df.head(10)

Unnamed: 0,NOV NUMBER,ISSUED DATE,STREET NUMBER,STREET DIRECTION,STREET NAME,UNIT NUMBER,VIOLATION DESCRIPTION,LOCATION,Boundaries - ZIP Codes,Community Areas,Zip Codes,Census Tracts,Wards
0,P004751507,03/31/2015,0,E,95TH ST,19,1,,,,,,
1,P004815950,08/06/2015,0,S,LOTUS,15,8-4-030(A) DRINKING IN PUBLIC WAY.,,,,,,
2,P004822018,08/25/2015,0,E,LAKE,211,8-4-030(A) DRINKING IN PUBLIC WAY.,,,,,,
3,P004937330,11/17/2015,0,W,CTA PLATFORM,701,1,,,,,,
4,P004937134,12/15/2015,0,W,0,701,1,,,,,,
5,P004978399,03/18/2016,0,W,95ST,701,1,,,,,,
6,P004949791,04/30/2016,500,N,LERAMIE,15,8-4-030(A) DRINKING IN PUBLIC WAY.,,,,,,
7,P005051389,06/16/2016,0,W,69TH ST,701,1,,,,,,
8,P004998736,09/27/2016,5400,W,CHICAGO,15,9-76-180 SAFETY BELTS.,,,,,,
9,P005149325,01/02/2017,0,E,ROOSEVELT,701,10-8-526/98-126#2.8 SMOKING ON CTA,,,,,,


In [5]:
#count freq of a value occuring in a column, including NULL rows
#https://stackoverflow.com/questions/22391433/count-the-frequency-that-a-value-occurs-in-a-dataframe-column
df['Wards'].value_counts(dropna=False)

NaN     90157
2.0     45820
36.0    38332
23.0    35516
4.0     32648
32.0    25287
14.0    22940
31.0    22564
9.0     20667
46.0    16827
37.0    16263
10.0    14461
39.0    14294
45.0    14162
13.0    14107
33.0    13927
25.0    13889
43.0    13734
3.0     12829
22.0    12599
35.0     9676
1.0      9379
47.0     8631
26.0     8000
8.0      7775
50.0     7491
48.0     7157
7.0      6867
30.0     6520
28.0     6519
41.0     6019
11.0     5811
5.0      5707
6.0      5480
49.0     5443
34.0     5303
16.0     3904
12.0     3873
21.0     3796
17.0     3761
20.0     3743
15.0     3552
40.0     3135
38.0     3118
18.0     2967
29.0     2734
44.0     2691
19.0     2668
24.0     2647
27.0     2540
42.0     1815
Name: Wards, dtype: int64

In [6]:
#python pandas show rows where one column is nan
df[df['Wards'].isnull()]

Unnamed: 0,NOV NUMBER,ISSUED DATE,STREET NUMBER,STREET DIRECTION,STREET NAME,UNIT NUMBER,VIOLATION DESCRIPTION,LOCATION,Boundaries - ZIP Codes,Community Areas,Zip Codes,Census Tracts,Wards
0,P004751507,03/31/2015,0,E,95TH ST,19,1,,,,,,
1,P004815950,08/06/2015,0,S,LOTUS,15,8-4-030(A) DRINKING IN PUBLIC WAY.,,,,,,
2,P004822018,08/25/2015,0,E,LAKE,211,8-4-030(A) DRINKING IN PUBLIC WAY.,,,,,,
3,P004937330,11/17/2015,0,W,CTA PLATFORM,701,1,,,,,,
4,P004937134,12/15/2015,0,W,0,701,1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
649730,P005940165,10/06/2020,0,W,69TH ST,543,10-8-526/016-110-1.18 SMOKING ON CTA (INCLUDI...,,,,,,
649734,P005930695,07/23/2020,0,E,ILLINOIS,18,8-4-030(A) DRINKING IN PUBLIC WAY.,,,,,,
649735,P005752161,09/07/2020,3100,E,78TH ST,4,10-36-185 VII-B 2. PARK CURFEW.(CHICAGO PAR...,,,,,,
649740,P005823923,08/25/2020,0,W,79TH ST,19,0000 - NON-MATCHING VIOLATION RECEIVED FROM EX...,,,,,,


In [7]:
df.columns

Index(['NOV NUMBER', 'ISSUED DATE', 'STREET NUMBER', 'STREET DIRECTION',
       'STREET NAME', 'UNIT NUMBER', 'VIOLATION DESCRIPTION', 'LOCATION',
       'Boundaries - ZIP Codes', 'Community Areas', 'Zip Codes',
       'Census Tracts', 'Wards'],
      dtype='object')

In [8]:
#panda count unique values in column
#https://stackoverflow.com/questions/45759966/counting-unique-values-in-a-column-in-pandas-dataframe-like-in-qlik
df['NOV NUMBER'].count()

649743

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649745 entries, 0 to 649744
Data columns (total 13 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   NOV NUMBER              649743 non-null  object 
 1   ISSUED DATE             649745 non-null  object 
 2   STREET NUMBER           649745 non-null  int64  
 3   STREET DIRECTION        644122 non-null  object 
 4   STREET NAME             649745 non-null  object 
 5   UNIT NUMBER             649745 non-null  int64  
 6   VIOLATION DESCRIPTION   649745 non-null  object 
 7   LOCATION                561792 non-null  object 
 8   Boundaries - ZIP Codes  559589 non-null  float64
 9   Community Areas         559588 non-null  float64
 10  Zip Codes               561792 non-null  float64
 11  Census Tracts           559916 non-null  float64
 12  Wards                   559588 non-null  float64
dtypes: float64(5), int64(2), object(6)
memory usage: 64.4+ MB


#### Create a 2nd Data Set and replace the 1's with I's

In [10]:
#https://sparkbyexamples.com/pandas/pandas-replace-substring-in-dataframe/?expand_article=1

df2 = df.replace('1','I', regex=True)

In [11]:
df2

Unnamed: 0,NOV NUMBER,ISSUED DATE,STREET NUMBER,STREET DIRECTION,STREET NAME,UNIT NUMBER,VIOLATION DESCRIPTION,LOCATION,Boundaries - ZIP Codes,Community Areas,Zip Codes,Census Tracts,Wards
0,P00475I507,03/3I/20I5,0,E,95TH ST,19,I,,,,,,
1,P0048I5950,08/06/20I5,0,S,LOTUS,15,8-4-030(A) DRINKING IN PUBLIC WAY.,,,,,,
2,P0048220I8,08/25/20I5,0,E,LAKE,211,8-4-030(A) DRINKING IN PUBLIC WAY.,,,,,,
3,P004937330,II/I7/20I5,0,W,CTA PLATFORM,701,I,,,,,,
4,P004937I34,I2/I5/20I5,0,W,0,701,I,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
649740,P005823923,08/25/2020,0,W,79TH ST,19,0000 - NON-MATCHING VIOLATION RECEIVED FROM EX...,,,,,,
649741,P005889730,II/2I/2020,2200,S,MICHIGAN,1,7-28-227 CONSTRUCTION SITES-GARBAGE,POINT (-87.623802 4I.852652),40.0,34.0,21194.0,193.0,9.0
649742,P006005048,09/02/2020,0,W,95TH ST,701,I0-8-526/0I6-II0-I.I8 SMOKING ON CTA (INCLUDI...,,,,,,
649743,P005939260,07/28/2020,100,N,STATE ST,701,8-4-030(A) DRINKING IN PUBLIC WAY.,POINT (-87.62792I 4I.883328),41.0,38.0,14310.0,92.0,36.0


#### Replace Random # of rows from 1st Data with 2nd Data Set

In [12]:
#pandas dataframe replace random rows with rows from second data frame
#https://stackoverflow.com/questions/46450260/how-can-i-randomly-change-the-values-of-some-rows-in-a-pandas-dataframe
df3=df.copy(deep=True)
df3.update(df2.sample(frac=.25))
df3

Unnamed: 0,NOV NUMBER,ISSUED DATE,STREET NUMBER,STREET DIRECTION,STREET NAME,UNIT NUMBER,VIOLATION DESCRIPTION,LOCATION,Boundaries - ZIP Codes,Community Areas,Zip Codes,Census Tracts,Wards
0,P00475I507,03/3I/20I5,0.0,E,95TH ST,19.0,I,,,,,,
1,P004815950,08/06/2015,0.0,S,LOTUS,15.0,8-4-030(A) DRINKING IN PUBLIC WAY.,,,,,,
2,P004822018,08/25/2015,0.0,E,LAKE,211.0,8-4-030(A) DRINKING IN PUBLIC WAY.,,,,,,
3,P004937330,11/17/2015,0.0,W,CTA PLATFORM,701.0,1,,,,,,
4,P004937134,12/15/2015,0.0,W,0,701.0,1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
649740,P005823923,08/25/2020,0.0,W,79TH ST,19.0,0000 - NON-MATCHING VIOLATION RECEIVED FROM EX...,,,,,,
649741,P005889730,II/2I/2020,2200.0,S,MICHIGAN,1.0,7-28-227 CONSTRUCTION SITES-GARBAGE,POINT (-87.623802 4I.852652),40.0,34.0,21194.0,193.0,9.0
649742,P006005048,09/02/2020,0.0,W,95TH ST,701.0,10-8-526/016-110-1.18 SMOKING ON CTA (INCLUDI...,,,,,,
649743,P005939260,07/28/2020,100.0,N,STATE ST,701.0,8-4-030(A) DRINKING IN PUBLIC WAY.,POINT (-87.627921 41.883328),41.0,38.0,14310.0,92.0,36.0


### Join Original Dataset onto 'Corrupted' Dataset

#### In both Datasets's Column Headers - Replace spaces with underlines

In [13]:
#python dataframe column names replace space with underscore
df.columns

Index(['NOV NUMBER', 'ISSUED DATE', 'STREET NUMBER', 'STREET DIRECTION',
       'STREET NAME', 'UNIT NUMBER', 'VIOLATION DESCRIPTION', 'LOCATION',
       'Boundaries - ZIP Codes', 'Community Areas', 'Zip Codes',
       'Census Tracts', 'Wards'],
      dtype='object')

In [14]:
#https://stackoverflow.com/questions/65209035/renaming-column-names-from-a-data-set-in-pandas
#https://stackoverflow.com/questions/41476150/remove-or-replace-spaces-in-column-names
df.columns = [i.replace(' ','_') for i in df.columns]
df3.columns = [i.replace(' ','_') for i in df3.columns]

#### Update column names of 2nd dataset

In [15]:
df3.columns

Index(['NOV_NUMBER', 'ISSUED_DATE', 'STREET_NUMBER', 'STREET_DIRECTION',
       'STREET_NAME', 'UNIT_NUMBER', 'VIOLATION_DESCRIPTION', 'LOCATION',
       'Boundaries_-_ZIP_Codes', 'Community_Areas', 'Zip_Codes',
       'Census_Tracts', 'Wards'],
      dtype='object')

In [16]:
#list comprehension
['hyp_'+ str(col) for col in df.columns]

['hyp_NOV_NUMBER',
 'hyp_ISSUED_DATE',
 'hyp_STREET_NUMBER',
 'hyp_STREET_DIRECTION',
 'hyp_STREET_NAME',
 'hyp_UNIT_NUMBER',
 'hyp_VIOLATION_DESCRIPTION',
 'hyp_LOCATION',
 'hyp_Boundaries_-_ZIP_Codes',
 'hyp_Community_Areas',
 'hyp_Zip_Codes',
 'hyp_Census_Tracts',
 'hyp_Wards']

In [17]:
#pandas dataframe add prefix to column names
#https://stackoverflow.com/questions/34049618/how-to-add-a-suffix-or-prefix-to-each-column-name

df.columns = ['hyp_'+ str(col) for col in df.columns]
df3.columns = ['OAS_'+ str(col) for col in df3.columns]

#### Join the two Data Sets
- possibly outer join?
- 3rd set of columns will be indicator columns comparing Hyperion Column to OAS Column
- Final Data Set columns will be arranged:
    - Hyperion Columns
    - Comparison Columns
    - OAS Columns

In [18]:
#https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html
df_combo=df.merge(df3, how='outer',left_on='hyp_NOV_NUMBER', right_on='OAS_NOV_NUMBER')

In [19]:
#python pandas view all columns of dataframe
#https://towardsdatascience.com/how-to-show-all-columns-rows-of-a-pandas-dataframe-c49d4507fcf
pd.set_option('display.max_columns', None)
df_combo

Unnamed: 0,hyp_NOV_NUMBER,hyp_ISSUED_DATE,hyp_STREET_NUMBER,hyp_STREET_DIRECTION,hyp_STREET_NAME,hyp_UNIT_NUMBER,hyp_VIOLATION_DESCRIPTION,hyp_LOCATION,hyp_Boundaries_-_ZIP_Codes,hyp_Community_Areas,hyp_Zip_Codes,hyp_Census_Tracts,hyp_Wards,OAS_NOV_NUMBER,OAS_ISSUED_DATE,OAS_STREET_NUMBER,OAS_STREET_DIRECTION,OAS_STREET_NAME,OAS_UNIT_NUMBER,OAS_VIOLATION_DESCRIPTION,OAS_LOCATION,OAS_Boundaries_-_ZIP_Codes,OAS_Community_Areas,OAS_Zip_Codes,OAS_Census_Tracts,OAS_Wards
0,P004751507,03/31/2015,0.0,E,95TH ST,19.0,1,,,,,,,,,,,,,,,,,,,
1,P004815950,08/06/2015,0.0,S,LOTUS,15.0,8-4-030(A) DRINKING IN PUBLIC WAY.,,,,,,,P004815950,08/06/2015,0.0,S,LOTUS,15.0,8-4-030(A) DRINKING IN PUBLIC WAY.,,,,,,
2,P004822018,08/25/2015,0.0,E,LAKE,211.0,8-4-030(A) DRINKING IN PUBLIC WAY.,,,,,,,P004822018,08/25/2015,0.0,E,LAKE,211.0,8-4-030(A) DRINKING IN PUBLIC WAY.,,,,,,
3,P004937330,11/17/2015,0.0,W,CTA PLATFORM,701.0,1,,,,,,,P004937330,11/17/2015,0.0,W,CTA PLATFORM,701.0,1,,,,,,
4,P004937134,12/15/2015,0.0,W,0,701.0,1,,,,,,,P004937134,12/15/2015,0.0,W,0,701.0,1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725642,,,,,,,,,,,,,,P0058749I8,08/I8/2020,500.0,S,PULASKI,610.0,I0-8-526/0I6-II0-I.04 ALCOHOL CONSUMPTION OR ...,POINT (-87.7254I9 4I.874387),30.0,27.0,21572.0,736.0,23.0
725643,,,,,,,,,,,,,,P005849I29,08/05/2020,7400.0,S,EMERALD,192.0,8-8-060 STREET SOLICITATION.,POINT (-87.643064 4I.759525),11.0,66.0,21559.0,511.0,32.0
725644,,,,,,,,,,,,,,P0058374I8,I0/I9/2020,0.0,W,TERMINAL,50.0,9-II4-280 LIVERY - SOLICITATION OF PASSENG...,,,,,,
725645,,,,,,,,,,,,,,P0060033I0,I0/02/2020,3600.0,W,LAKE ST,701.0,I0-8-526/0I6-II0-I.27(4) TRESPASS / ENTERING ...,POINT (-87.7I6389 4I.8850I6),30.0,28.0,21572.0,176.0,23.0


In [20]:
df_combo['hyp_NOV_NUMBER'].count()

649743

In [21]:
#python pandas count null and non-null values in column
#python pandas count null values in column
#https://stackoverflow.com/questions/26266362/how-do-i-count-the-nan-values-in-a-column-in-pandas-dataframe
df_combo['hyp_NOV_NUMBER'].isna().sum()

75904

### Create indicator columns comparing Hyperion Column to OAS Column

In [22]:
df_combo.columns

Index(['hyp_NOV_NUMBER', 'hyp_ISSUED_DATE', 'hyp_STREET_NUMBER',
       'hyp_STREET_DIRECTION', 'hyp_STREET_NAME', 'hyp_UNIT_NUMBER',
       'hyp_VIOLATION_DESCRIPTION', 'hyp_LOCATION',
       'hyp_Boundaries_-_ZIP_Codes', 'hyp_Community_Areas', 'hyp_Zip_Codes',
       'hyp_Census_Tracts', 'hyp_Wards', 'OAS_NOV_NUMBER', 'OAS_ISSUED_DATE',
       'OAS_STREET_NUMBER', 'OAS_STREET_DIRECTION', 'OAS_STREET_NAME',
       'OAS_UNIT_NUMBER', 'OAS_VIOLATION_DESCRIPTION', 'OAS_LOCATION',
       'OAS_Boundaries_-_ZIP_Codes', 'OAS_Community_Areas', 'OAS_Zip_Codes',
       'OAS_Census_Tracts', 'OAS_Wards'],
      dtype='object')

#### Test code below

In [23]:
#python check first 3 characters of string
#if 1st 3 letters of column is "hyp" -then compare current column to oas column
for colu in df_combo.columns:
    #print(colu)
    #print(colu[0:3])
    if(colu[0:3]=='hyp'):
        print(colu[0:3])
        print(colu[4:len(colu)])
        print(colu)
        print('OAS_'+colu[4:len(colu)])
        print()
        #print('\n')

hyp
NOV_NUMBER
hyp_NOV_NUMBER
OAS_NOV_NUMBER

hyp
ISSUED_DATE
hyp_ISSUED_DATE
OAS_ISSUED_DATE

hyp
STREET_NUMBER
hyp_STREET_NUMBER
OAS_STREET_NUMBER

hyp
STREET_DIRECTION
hyp_STREET_DIRECTION
OAS_STREET_DIRECTION

hyp
STREET_NAME
hyp_STREET_NAME
OAS_STREET_NAME

hyp
UNIT_NUMBER
hyp_UNIT_NUMBER
OAS_UNIT_NUMBER

hyp
VIOLATION_DESCRIPTION
hyp_VIOLATION_DESCRIPTION
OAS_VIOLATION_DESCRIPTION

hyp
LOCATION
hyp_LOCATION
OAS_LOCATION

hyp
Boundaries_-_ZIP_Codes
hyp_Boundaries_-_ZIP_Codes
OAS_Boundaries_-_ZIP_Codes

hyp
Community_Areas
hyp_Community_Areas
OAS_Community_Areas

hyp
Zip_Codes
hyp_Zip_Codes
OAS_Zip_Codes

hyp
Census_Tracts
hyp_Census_Tracts
OAS_Census_Tracts

hyp
Wards
hyp_Wards
OAS_Wards



In [24]:
for colu in df_combo.columns:
    print(colu)

hyp_NOV_NUMBER
hyp_ISSUED_DATE
hyp_STREET_NUMBER
hyp_STREET_DIRECTION
hyp_STREET_NAME
hyp_UNIT_NUMBER
hyp_VIOLATION_DESCRIPTION
hyp_LOCATION
hyp_Boundaries_-_ZIP_Codes
hyp_Community_Areas
hyp_Zip_Codes
hyp_Census_Tracts
hyp_Wards
OAS_NOV_NUMBER
OAS_ISSUED_DATE
OAS_STREET_NUMBER
OAS_STREET_DIRECTION
OAS_STREET_NAME
OAS_UNIT_NUMBER
OAS_VIOLATION_DESCRIPTION
OAS_LOCATION
OAS_Boundaries_-_ZIP_Codes
OAS_Community_Areas
OAS_Zip_Codes
OAS_Census_Tracts
OAS_Wards


#### Create indicator columns

In [25]:
#need to first create a copy, because the columns of df_combo_2 will be changing when we add indicator columns
df_combo_2=df_combo.copy(deep=True)

#use enumerate(i,colu)
for colu in df_combo.columns:
    #print(colu)
    #print(colu[0:3])
    if(colu[0:3]=='hyp'):
        print('column_1')
        df_combo_2['diff_'+colu[4:len(colu)]] = df_combo.apply(lambda x: 0 if x[colu]==x['OAS_'+colu[4:len(colu)]] else 1, axis=1)

column_1
column_1
column_1
column_1
column_1
column_1
column_1
column_1
column_1
column_1
column_1
column_1
column_1


In [26]:
colu[5:len(colu)]

'ards'

### Print Sum of each indicator column

In [27]:
for colu in df_combo_2.columns:
    if(colu[0:4]=='diff'):
        print(colu[5:len(colu)],'\t\t\t',df_combo_2[colu].sum())

NOV_NUMBER 			 151804
ISSUED_DATE 			 236364
STREET_NUMBER 			 151802
STREET_DIRECTION 			 156761
STREET_NAME 			 155306
UNIT_NUMBER 			 151800
VIOLATION_DESCRIPTION 			 195752
LOCATION 			 304262
Boundaries_-_ZIP_Codes 			 231439
Community_Areas 			 231440
Zip_Codes 			 229454
Census_Tracts 			 231146
Wards 			 231440


### For each column pair, print 1st ten mismatched rows

In [28]:
df_combo_2.columns

Index(['hyp_NOV_NUMBER', 'hyp_ISSUED_DATE', 'hyp_STREET_NUMBER',
       'hyp_STREET_DIRECTION', 'hyp_STREET_NAME', 'hyp_UNIT_NUMBER',
       'hyp_VIOLATION_DESCRIPTION', 'hyp_LOCATION',
       'hyp_Boundaries_-_ZIP_Codes', 'hyp_Community_Areas', 'hyp_Zip_Codes',
       'hyp_Census_Tracts', 'hyp_Wards', 'OAS_NOV_NUMBER', 'OAS_ISSUED_DATE',
       'OAS_STREET_NUMBER', 'OAS_STREET_DIRECTION', 'OAS_STREET_NAME',
       'OAS_UNIT_NUMBER', 'OAS_VIOLATION_DESCRIPTION', 'OAS_LOCATION',
       'OAS_Boundaries_-_ZIP_Codes', 'OAS_Community_Areas', 'OAS_Zip_Codes',
       'OAS_Census_Tracts', 'OAS_Wards', 'diff_NOV_NUMBER', 'diff_ISSUED_DATE',
       'diff_STREET_NUMBER', 'diff_STREET_DIRECTION', 'diff_STREET_NAME',
       'diff_UNIT_NUMBER', 'diff_VIOLATION_DESCRIPTION', 'diff_LOCATION',
       'diff_Boundaries_-_ZIP_Codes', 'diff_Community_Areas', 'diff_Zip_Codes',
       'diff_Census_Tracts', 'diff_Wards'],
      dtype='object')

In [29]:
for colu in df_combo_2.columns:
    if(colu[0:3]=='hyp'):
        print(colu)
        print(colu[4:len(colu)])
        print(colu,'OAS_'+colu[4:len(colu)])
       # print(df_combo_2[[colu,'OAS_'+colu[4:len(colu)]]])
        df_pair=df_combo_2[[colu,'OAS_'+colu[4:len(colu)]]]
        print(df_pair[df_combo_2['diff_'+colu[4:len(colu)]]==1].head(10))
        print('\n')
        #print(df_combo_2[colu])

hyp_NOV_NUMBER
NOV_NUMBER
hyp_NOV_NUMBER OAS_NOV_NUMBER
   hyp_NOV_NUMBER OAS_NOV_NUMBER
0      P004751507            NaN
7      P005051389            NaN
11     P005215381            NaN
12     P005124603            NaN
15     P005831197            NaN
16     P004617910            NaN
21     P004574122            NaN
36     P004689519            NaN
39     P004712715            NaN
43     P004579713            NaN


hyp_ISSUED_DATE
ISSUED_DATE
hyp_ISSUED_DATE OAS_ISSUED_DATE
   hyp_ISSUED_DATE OAS_ISSUED_DATE
0       03/31/2015             NaN
7       06/16/2016             NaN
11      10/18/2017             NaN
12      01/13/2018             NaN
15      08/07/2019             NaN
16      01/16/2015             NaN
21      01/08/2015             NaN
23      02/07/2015      02/07/20I5
32      01/26/2015      0I/26/20I5
33      02/08/2015      02/08/20I5


hyp_STREET_NUMBER
STREET_NUMBER
hyp_STREET_NUMBER OAS_STREET_NUMBER
    hyp_STREET_NUMBER  OAS_STREET_NUMBER
0                 0.0  

#### Filter first for where primary keys match, to find mismatches on other columns besides simple primary Key Mismatches