# Data Verification Tool - Simplified

## 🎯 How to Use:

1. **Run Cell 1** - Test that data is loaded
2. **Run Cell 2** - Load the geocoding data from CSV
3. **Run Cell 3** - Add the year column 
4. **Run Cell 4** - Load the data selector interface
5. **Run Cell 6** - Load the verification window interface
6. **Use the workflow:**
   - Run `launch_selector()` to choose what data to verify
   - Run `launch_verification()` to open the verification window
   - Use Yes/No/Maybe buttons to verify each record
   - Run `save_results()` to save your work

## 🔧 What this tool does:

- **Loads geocoding data** comparing OCR (scanned text) vs Scraped (website data)
- **Provides a GUI** to manually verify if the matches are correct
- **Saves verification results** back to CSV files for further analysis

## ✨ Features:

- **Simple data selection** - Choose from most common flag reasons or random sample
- **Side-by-side comparison** - See OCR data vs Scraped data clearly
- **Quick verification** - Yes/No/Maybe buttons with auto-advance
- **External tools** - Google search and website opening for verification
- **Auto-save** - Results automatically saved when complete

## 📊 Workflow:

1. **Select data type** using the selector window
2. **Verify matches** using the comparison window  
3. **Save results** to CSV when done

In [5]:
import pandas as pd

# Load the main data
df = pd.read_csv(r'C:\Users\clint\Desktop\Geocoding_Task\Matching_WebScrape\8.csv')

# Filter for specific states only
df = df[df['OCR_state'].isin(['CA', 'UT', 'NV', 'AZ'])]

print(f"Data loaded: {len(df):,} rows for states CA, UT, NV, AZ")
df.head()

Data loaded: 4,185 rows for states CA, UT, NV, AZ


  df = pd.read_csv(r'C:\Users\clint\Desktop\Geocoding_Task\Matching_WebScrape\8.csv')


Unnamed: 0,OCR_Unnamed: 0,OCR_filename,OCR_record_num,OCR_clean_line1,OCR_clean_line2,OCR_line3,OCR_city,OCR_zip_code,OCR_label,OCR_phone,...,OCR_Address_Type,OCR_Exit_Number,Exit_From_Address,Exit_From_Label,Is_Unclear_OCR,OCR_Main_Road,OCR_Secondary_Road,Manually Verified?,Scraped_Year,Flag_Category
43950,23197,RVersFriend2006-115-ocr.csv,18,"Coalville , 84017 Holiday Hills ( 66 )",435-336-4421 1-80 Exit 162 ( UT 280 ),MO,Coalville,84017,Holiday Hills ( 66 ),435-336-4421,...,Exit,162,162,,False,I-80,,,2025,Successful Matches
43951,23197,RVersFriend2006-115-ocr.csv,18,"Coalville , 84017 Holiday Hills ( 66 )",435-336-4421 1-80 Exit 162 ( UT 280 ),MO,Coalville,84017,Holiday Hills ( 66 ),435-336-4421,...,Exit,162,162,,False,I-80,,,2025,Successful Matches
43952,23198,RVersFriend2007-104-ocr.csv,7,"Coalville , 84017 Hills ( 66 )",435-336-4421 1-80 Holiday Exit 162 ( UT 280 ),M <U+25A1>,Coalville,84017,Hills ( 66 ),435-336-4421,...,Exit,162,162,,False,I-80,,,2025,Successful Matches
43953,23198,RVersFriend2007-104-ocr.csv,7,"Coalville , 84017 Hills ( 66 )",435-336-4421 1-80 Holiday Exit 162 ( UT 280 ),M <U+25A1>,Coalville,84017,Hills ( 66 ),435-336-4421,...,Exit,162,162,,False,I-80,,,2025,Successful Matches
43954,23199,TF2008_258_271-0-ocr.csv,7,"D Coalville , 84017 Holiday Hills ( 66 ) )",4 435-336-4421 1-80 Exit 162 ( UT 280 ),M <U+2610> <U+2610> <U+2610>,Coalville,84017,Holiday Hills ( 66 ) ),435-336-4421,...,Exit,162,162,,False,I-80,,,2025,Successful Matches


In [2]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import tkinter as tk
from tkinter import ttk, messagebox
import threading

# Simplified Flag Reason Selector
class SimpleFlagSelector:
    def __init__(self, df):
        self.df = df
        self.selected_flag_reason = None
        self.sample_df = None
        
        # Create the window
        self.root = tk.Tk()
        self.root.title("🎯 Flag Reason Selector")
        self.root.geometry("600x400")
        self.root.configure(bg='#f0f0f0')
        
        self.setup_ui()
        
    def setup_ui(self):
        """Setup simple UI with main options"""
        main_frame = ttk.Frame(self.root)
        main_frame.pack(fill=tk.BOTH, expand=True, padx=15, pady=15)
        
        # Title
        title_label = tk.Label(main_frame, text="🎯 Select Data Type to Verify", 
                              font=('Arial', 16, 'bold'), bg='#f0f0f0')
        title_label.pack(pady=(0, 20))
        
        # Get flag reason counts
        flag_counts = self.df['Flag_Reason'].value_counts()
        
        # Create buttons for main flag reasons
        for flag_reason in flag_counts.head(10).index:  # Top 10 most common
            count = flag_counts[flag_reason]
            button_text = f"{flag_reason}\n({count:,} rows)"
            
            btn = tk.Button(main_frame, text=button_text,
                           command=lambda fr=flag_reason: self.select_flag_reason(fr),
                           font=('Arial', 10), pady=10, width=50)
            btn.pack(pady=5, fill=tk.X)
        
        # Random sample option
        random_btn = tk.Button(main_frame, text=f"🎲 Random Sample (20 rows)",
                              command=self.select_random_sample,
                              font=('Arial', 10, 'bold'), bg='#28a745', fg='white',
                              pady=10)
        random_btn.pack(pady=10, fill=tk.X)
    
    def select_flag_reason(self, flag_reason):
        """Select a specific flag reason"""
        self.selected_flag_reason = flag_reason
        filtered_df = self.df[self.df['Flag_Reason'] == flag_reason].copy()
        
        # Take sample (max 5 rows for quick verification)
        sample_size = min(5, len(filtered_df))
        self.sample_df = filtered_df.sample(n=sample_size) if len(filtered_df) > 0 else filtered_df
        
        # Add verification columns
        if 'Manually Verified?' not in self.sample_df.columns:
            self.sample_df['Manually Verified?'] = ''
        if 'Verify Reason' not in self.sample_df.columns:
            self.sample_df['Verify Reason'] = ''
        
        # Store globally
        globals()['sample_df'] = self.sample_df
        globals()['selected_flag_reason'] = flag_reason
        
        print(f"Selected: {flag_reason}")
        print(f"Sample size: {len(self.sample_df)} rows")
        
        self.root.destroy()
    
    def select_random_sample(self):
        """Select random sample"""
        self.selected_flag_reason = "Random Sample"
        sample_size = min(20, len(self.df))
        self.sample_df = self.df.sample(n=sample_size)
        
        # Add verification columns
        if 'Manually Verified?' not in self.sample_df.columns:
            self.sample_df['Manually Verified?'] = ''
        if 'Verify Reason' not in self.sample_df.columns:
            self.sample_df['Verify Reason'] = ''
        
        # Store globally
        globals()['sample_df'] = self.sample_df
        globals()['selected_flag_reason'] = self.selected_flag_reason
        
        print(f"Selected random sample: {len(self.sample_df)} rows")
        self.root.destroy()
    
    def run(self):
        """Run the selector"""
        self.root.mainloop()

# Function to launch selector
def launch_selector():
    """Launch the flag reason selector"""
    selector = SimpleFlagSelector(df)
    selector.run()
    return selector

print("🔧 Selector ready. Use launch_selector() to choose data to verify.")

🔧 Selector ready. Use launch_selector() to choose data to verify.


In [3]:
# Save verification results
def save_results(filename=None):
    """Save verification results to CSV"""
    if 'sample_df' not in globals():
        print("❌ No sample data found. Run verification first.")
        return None
    
    # Update main dataframe with verification results
    sample_df = globals()['sample_df']
    for idx in sample_df.index:
        if idx in df.index:
            df.loc[idx, 'Manually Verified?'] = sample_df.loc[idx, 'Manually Verified?']
            df.loc[idx, 'Verify Reason'] = sample_df.loc[idx, 'Verify Reason']
    
    # Generate filename if not provided
    if filename is None:
        from datetime import datetime
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"verified_data_{timestamp}.csv"
    
    # Save to file
    filepath = f"C:\\Users\\clint\\Desktop\\Geocoding_Task\\Matching_WebScrape\\{filename}"
    df.to_csv(filepath, index=False)
    
    # Show summary
    verified_count = len(sample_df[sample_df['Manually Verified?'] != ''])
    print(f"✅ Saved to: {filename}")
    print(f"📊 Verified {verified_count} out of {len(sample_df)} rows")
    
    return filepath

print("💾 Save function ready. Use save_results() after verification.")

💾 Save function ready. Use save_results() after verification.


In [4]:
import tkinter as tk
from tkinter import ttk, messagebox
import threading
import webbrowser
import urllib.parse
from datetime import datetime

class DataVerificationWindow:
    def __init__(self, sample_df):
        self.sample_df = sample_df.copy()
        self.current_row_index = 0
        
        # Create main window
        self.root = tk.Tk()
        self.root.title("🔍 Data Verification Tool")
        self.root.geometry("1000x700")
        self.root.configure(bg='#f0f0f0')
        
        self.setup_ui()
        self.load_current_row()
        
    def setup_ui(self):
        """Setup the verification window UI"""
        # Header
        header_frame = tk.Frame(self.root, bg='#f0f0f0')
        header_frame.pack(fill=tk.X, padx=20, pady=10)
        
        tk.Label(header_frame, text="🔍 Data Verification Tool", 
                font=('Arial', 18, 'bold'), bg='#f0f0f0').pack()
        
        self.progress_label = tk.Label(header_frame, text="", 
                                      font=('Arial', 12), bg='#f0f0f0')
        self.progress_label.pack(pady=5)
        
        # Navigation
        nav_frame = tk.Frame(header_frame, bg='#f0f0f0')
        nav_frame.pack(pady=10)
        
        self.prev_btn = tk.Button(nav_frame, text="⬅️ Previous", 
                                 command=self.previous_row, font=('Arial', 10))
        self.prev_btn.pack(side=tk.LEFT, padx=5)
        
        self.next_btn = tk.Button(nav_frame, text="Next ➡️", 
                                 command=self.next_row, font=('Arial', 10))
        self.next_btn.pack(side=tk.LEFT, padx=5)
        
        # Data comparison area
        data_frame = tk.Frame(self.root, bg='white', relief=tk.RAISED, bd=2)
        data_frame.pack(fill=tk.BOTH, expand=True, padx=20, pady=(0, 20))
        
        # Left (OCR) and Right (Scraped) panels
        left_frame = tk.Frame(data_frame, bg='white')
        left_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=10, pady=10)
        
        right_frame = tk.Frame(data_frame, bg='white')
        right_frame.pack(side=tk.RIGHT, fill=tk.BOTH, expand=True, padx=10, pady=10)
        
        tk.Label(left_frame, text="📄 OCR Data", font=('Arial', 14, 'bold')).pack()
        tk.Label(right_frame, text="🌐 Scraped Data", font=('Arial', 14, 'bold')).pack()
        
        self.ocr_text = tk.Text(left_frame, font=('Courier', 10), height=15)
        self.ocr_text.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
        
        self.scraped_text = tk.Text(right_frame, font=('Courier', 10), height=15)
        self.scraped_text.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
        
        # Verification section
        verify_frame = tk.Frame(self.root, bg='#e9ecef', relief=tk.RAISED, bd=2)
        verify_frame.pack(fill=tk.X, padx=20, pady=(0, 10))
        
        tk.Label(verify_frame, text="✅ Verification Decision", 
                font=('Arial', 14, 'bold'), bg='#e9ecef').pack(pady=10)
        
        # Decision buttons
        button_frame = tk.Frame(verify_frame, bg='#e9ecef')
        button_frame.pack(pady=10)
        
        tk.Button(button_frame, text="✅ YES - Good Match", 
                 command=lambda: self.make_decision('Yes'),
                 font=('Arial', 12, 'bold'), bg='#28a745', fg='white',
                 padx=20, pady=10).pack(side=tk.LEFT, padx=10)
        
        tk.Button(button_frame, text="❓ MAYBE - Uncertain", 
                 command=lambda: self.make_decision('Maybe'),
                 font=('Arial', 12, 'bold'), bg='#ffc107', fg='black',
                 padx=20, pady=10).pack(side=tk.LEFT, padx=10)
        
        tk.Button(button_frame, text="❌ NO - Bad Match", 
                 command=lambda: self.make_decision('No'),
                 font=('Arial', 12, 'bold'), bg='#dc3545', fg='white',
                 padx=20, pady=10).pack(side=tk.LEFT, padx=10)
        
        # Reason text box
        tk.Label(verify_frame, text="💭 Reason (optional):", 
                font=('Arial', 10), bg='#e9ecef').pack(anchor='w', padx=20)
        
        self.reason_entry = tk.Text(verify_frame, height=3, font=('Arial', 10))
        self.reason_entry.pack(fill=tk.X, padx=20, pady=(5, 10))
        
        # Action buttons
        action_frame = tk.Frame(self.root, bg='#f0f0f0')
        action_frame.pack(fill=tk.X, padx=20, pady=(0, 10))
        
        tk.Button(action_frame, text="🔍 Google Search", 
                 command=self.google_search, bg='#007bff', fg='white').pack(side=tk.LEFT, padx=5)
        
        tk.Button(action_frame, text="🌐 Open Website", 
                 command=self.open_website, bg='#17a2b8', fg='white').pack(side=tk.LEFT, padx=5)
        
        tk.Button(action_frame, text="💾 Save to CSV", 
                 command=self.save_to_csv, bg='#6f42c1', fg='white').pack(side=tk.RIGHT, padx=5)
    
    def make_decision(self, decision):
        """Handle verification decision"""
        current_index = self.sample_df.index[self.current_row_index]
        reason = self.reason_entry.get("1.0", tk.END).strip()
        
        if not reason:
            reason = f"Auto: {decision} via verification tool"
        
        # Update the dataframe
        self.sample_df.loc[current_index, 'Manually Verified?'] = decision
        self.sample_df.loc[current_index, 'Verify Reason'] = reason
        
        # Update global variable
        if 'sample_df' in globals():
            globals()['sample_df'].loc[current_index, 'Manually Verified?'] = decision
            globals()['sample_df'].loc[current_index, 'Verify Reason'] = reason
        
        print(f"✅ Recorded: {decision} for row {current_index}")
        
        # Auto-advance to next row
        self.root.after(1000, self.next_row)
    
    def load_current_row(self):
        """Load data for current row"""
        if self.current_row_index >= len(self.sample_df):
            self.show_completion()
            return
        
        current_index = self.sample_df.index[self.current_row_index]
        row_data = self.sample_df.loc[current_index]
        
        # Update progress
        self.progress_label.config(text=f"Row {self.current_row_index + 1} of {len(self.sample_df)}")
        
        # Clear and load OCR data
        self.ocr_text.delete("1.0", tk.END)
        ocr_fields = ['OCR_city', 'OCR_zip_code', 'OCR_label', 'OCR_phone', 
                     'OCR_address_standardized', 'OCR_state', 'OCR_chain']
        
        ocr_data = []
        for field in ocr_fields:
            if field in row_data:
                ocr_data.append(f"{field}: {row_data[field]}")
        
        self.ocr_text.insert("1.0", "\n".join(ocr_data))
        
        # Clear and load Scraped data
        self.scraped_text.delete("1.0", tk.END)
        scraped_fields = ['Scraped_name', 'Scraped_Highway', 'Scraped_Exit', 
                         'Scraped_Street Address', 'Scraped_City', 'Scraped_State', 
                         'Scraped_Postal Code', 'Scraped_Phone', 'Flag_Reason']
        
        scraped_data = []
        for field in scraped_fields:
            if field in row_data:
                scraped_data.append(f"{field}: {row_data[field]}")
        
        self.scraped_text.insert("1.0", "\n".join(scraped_data))
        
        # Clear reason box
        self.reason_entry.delete("1.0", tk.END)
        
        # Update navigation buttons
        self.prev_btn.config(state='normal' if self.current_row_index > 0 else 'disabled')
        self.next_btn.config(state='normal' if self.current_row_index < len(self.sample_df) - 1 else 'disabled')
    
    def next_row(self):
        """Move to next row"""
        if self.current_row_index < len(self.sample_df) - 1:
            self.current_row_index += 1
            self.load_current_row()
        else:
            self.show_completion()
    
    def previous_row(self):
        """Move to previous row"""
        if self.current_row_index > 0:
            self.current_row_index -= 1
            self.load_current_row()
    
    def google_search(self):
        """Open Google search"""
        current_index = self.sample_df.index[self.current_row_index]
        row_data = self.sample_df.loc[current_index]
        
        search_terms = []
        name = row_data.get('OCR_label', '')
        city = row_data.get('OCR_city', '')
        state = row_data.get('OCR_state', '')
        
        if name and str(name) not in ['nan', 'N/A', '']:
            search_terms.append(f'"{name}"')
        if city and str(city) not in ['nan', 'N/A', '']:
            search_terms.append(city)
        if state and str(state) not in ['nan', 'N/A', '']:
            search_terms.append(state)
        
        if search_terms:
            query = ' '.join(search_terms)
            url = f"https://www.google.com/search?q={urllib.parse.quote(query)}"
            webbrowser.open(url)
    
    def open_website(self):
        """Open scraped website"""
        current_index = self.sample_df.index[self.current_row_index]
        row_data = self.sample_df.loc[current_index]
        
        url = row_data.get('Scraped_full_url', '')
        if url and str(url) not in ['nan', 'N/A', '']:
            webbrowser.open(url)
        else:
            messagebox.showinfo("No URL", "No website URL available")
    
    def save_to_csv(self):
        """Save current progress to CSV"""
        # Update global variables
        globals()['sample_df'] = self.sample_df.copy()
        
        # Call the main save function
        if 'df' in globals():
            filepath = save_results()
            if filepath:
                filename = filepath.split('\\')[-1]
                messagebox.showinfo("Saved!", f"Data saved to: {filename}")
            else:
                messagebox.showerror("Error", "Failed to save data")
        else:
            messagebox.showerror("Error", "Main dataframe not found")
    
    def show_completion(self):
        """Show completion message and auto-save"""
        print("🎉 Verification complete! Auto-saving...")
        self.save_to_csv()
        
        verified_count = len(self.sample_df[self.sample_df['Manually Verified?'] != ''])
        messagebox.showinfo("Complete!", 
                           f"🎉 All {len(self.sample_df)} rows verified!\n"
                           f"📊 Total verified: {verified_count} rows\n"
                           f"Results saved to CSV.")
    
    def run(self):
        """Run the verification window"""
        self.root.mainloop()

# Launch function
def launch_verification():
    """Launch the verification window"""
    if 'sample_df' not in globals():
        print("❌ No sample data selected. Run launch_selector() first.")
        return None
    
    sample_df = globals()['sample_df']
    print(f"🚀 Launching verification for {len(sample_df)} rows...")
    
    def run_window():
        window = DataVerificationWindow(sample_df)
        window.run()
        globals()['sample_df'] = window.sample_df
    
    thread = threading.Thread(target=run_window, daemon=True)
    thread.start()
    
    print("✅ Verification window launched!")
    return thread

print("🔧 Verification window ready. Use launch_verification() after selecting data.")

🔧 Verification window ready. Use launch_verification() after selecting data.
