In [3]:
#https://play.google.com/store/apps/details?id=camera.timestamp.gpsmap.photo.gpscamera
#this script creates a text file for GPS data captured from above given app
import os
import re
import cv2
import pytesseract
from PIL import Image, ImageEnhance

# === Config ===
image_folder = r"G:\Group_2_Road_Defect_Analysis\Prediction_Output"
label_folder = r"G:\Group_2_Road_Defect_Analysis\Prediction_Output\labels"
output_folder = r"G:\Group_2_Road_Defect_Analysis\Prediction_Output\gps_text"
os.makedirs(output_folder, exist_ok=True)

pytesseract.pytesseract.tesseract_cmd = r"C:\Users\HP\tesseract.exe"
class_names = ['edge line', 'Potholes', 'crack', 'lane divider', 'objects', 'zebra crossing']

# === Helper Functions ===

def extract_text(image_path):
    img = Image.open(image_path).convert("L")  # Grayscale
    img = img.resize((img.width * 2, img.height * 2))  # Enlarge
    img = ImageEnhance.Contrast(img).enhance(2)  # Increase contrast
    text = pytesseract.image_to_string(img)
    return text

def parse_location_and_gps(text):
    location = None
    lat, lon = None, None

    # === Location pattern: 'Link Road Ahmednagar Maharashtra'
    loc_match = re.search(r'(Link Road [A-Za-z\s]+Maharashtra)', text)
    if loc_match:
        location = loc_match.group(1).strip()

    # === Decimal degrees pattern: Lat: xx.xxxx Long: xx.xxxx OR with pipe |
    match_dec = re.search(r'Lat[: ]+([\d.]+)\s*[|]?\s*Long[: ]+([\d.]+)', text)
    if match_dec:
        lat = float(match_dec.group(1))
        lon = float(match_dec.group(2))

    # === Optional fallback: DMS format (not needed for your current OCR but kept just in case)
    if not lat or not lon:
        match_dms = re.search(
            r'(\d{1,2})°\s*(\d{1,2})[′\'’]?\s*(\d{1,2})?[″"\s]*[Nn],?\s*(\d{1,3})°\s*(\d{1,2})[′\'’]?\s*(\d{1,2})?[″"\s]*[Ee]',
            text
        )
        if match_dms:
            d1, m1, s1 = int(match_dms.group(1)), int(match_dms.group(2)), int(match_dms.group(3) or 0)
            d2, m2, s2 = int(match_dms.group(4)), int(match_dms.group(5)), int(match_dms.group(6) or 0)
            lat = d1 + m1 / 60 + s1 / 3600
            lon = d2 + m2 / 60 + s2 / 3600

    return location, lat, lon

def read_labels(label_path):
    if not os.path.exists(label_path):
        return []
    with open(label_path, 'r') as f:
        lines = f.readlines()
        return list(set(class_names[int(line.split()[0])] for line in lines if line.strip()))

# === Main Processing ===

for file in os.listdir(image_folder):
    if not file.lower().endswith('.jpg'):
        continue

    image_path = os.path.join(image_folder, file)
    label_path = os.path.join(label_folder, file.replace('.jpg', '.txt'))
    output_txt_path = os.path.join(output_folder, file.replace('.jpg', '.txt'))

    ocr_text = extract_text(image_path)
    location, lat, lon = parse_location_and_gps(ocr_text)
    defects = read_labels(label_path)

    with open(output_txt_path, 'w', encoding='utf-8') as out:
        out.write(f"Frame: {file}\n")
        out.write(f"Location: {location or 'Unknown'}\n")
        out.write(f"Latitude: {lat if lat else 'NA'}\n")
        out.write(f"Longitude: {lon if lon else 'NA'}\n")
        out.write(f"Defects: {', '.join(defects) if defects else 'None'}\n")

print("✅ All frame data saved as .txt files.")


✅ All frame data saved as .txt files.


In [4]:
#this code creates a csv file for tha GPS data we have fetched in text file

import os
import csv

# === Configuration ===
gps_txt_dir = r"G:\Group_2_Road_Defect_Analysis\Prediction_Output\gps_text"
yolo_labels_dir = r"G:\Group_2_Road_Defect_Analysis\Prediction_Output\labels"
output_csv_path = r"G:\Group_2_Road_Defect_Analysis\Prediction_Output\labels\defect_summary.csv"

class_names = ['edge line', 'Potholes', 'crack', 'lane divider', 'objects', 'zebra crossing']

# === Functions ===

def extract_gps_info(filepath):
    lat, lon, location = None, None, None
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line.startswith("Latitude:"):
                value = line.split("Latitude:")[1].strip()
                lat = float(value) if value.replace('.', '', 1).isdigit() else None
            elif line.startswith("Longitude:"):
                value = line.split("Longitude:")[1].strip()
                lon = float(value) if value.replace('.', '', 1).isdigit() else None
            elif line.startswith("Location:"):
                location = line.split("Location:")[1].strip()
    return location, lat, lon

def extract_labels(yolo_file):
    if not os.path.exists(yolo_file):
        print(f"⚠️ Label file not found: {yolo_file}")
        return []

    class_ids = []
    with open(yolo_file, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            parts = line.strip().split()
            if parts and parts[0].isdigit():
                class_ids.append(int(parts[0]))
            else:
                print(f"⚠️ Skipping invalid label line: {line.strip()} in {yolo_file}")
    return [class_names[i] for i in class_ids if i < len(class_names)]

# === Main Processing ===
data = []
all_gps_files = os.listdir(gps_txt_dir)
print(f"📝 Found {len(all_gps_files)} GPS text files.")

for fname in all_gps_files:
    if not fname.endswith('.txt'):
        continue

    frame_name = os.path.splitext(fname)[0] + '.jpg'
    txt_path = os.path.join(gps_txt_dir, fname)
    yolo_label_path = os.path.join(yolo_labels_dir, os.path.splitext(fname)[0] + '.txt')

    location, lat, lon = extract_gps_info(txt_path)
    labels = extract_labels(yolo_label_path)

    print(f"📂 Frame: {frame_name} | Location: {location} | Lat: {lat}, Lon: {lon} | Labels: {labels}")

    if location and lat is not None and lon is not None and labels:
        unique_labels = sorted(set(labels))
        defect_str = ', '.join(unique_labels)
        count = len(labels)

        data.append({
            'frame': frame_name,
            'location': location,
            'latitude': lat,
            'longitude': lon,
            'defects': defect_str,
            'count': count
        })
    else:
        print(f"⚠️ Skipping {frame_name} due to missing data")

# === Save to CSV ===
with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['frame', 'location', 'latitude', 'longitude', 'defects', 'count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

print(f"✅ Saved {len(data)} valid entries to {output_csv_path}")


📝 Found 2357 GPS text files.
⚠️ Label file not found: G:\Group_2_Road_Defect_Analysis\Prediction_Output\labels\frame_00000.txt
📂 Frame: frame_00000.jpg | Location: Unknown | Lat: 19.0957, Lon: 74.75153 | Labels: []
⚠️ Skipping frame_00000.jpg due to missing data
📂 Frame: frame_00001.jpg | Location: Unknown | Lat: 19.0957, Lon: 74.75153 | Labels: ['edge line']
📂 Frame: frame_00002.jpg | Location: Unknown | Lat: 19.0957, Lon: 74.75153 | Labels: ['edge line']
📂 Frame: frame_00003.jpg | Location: Unknown | Lat: 19.0957, Lon: 74.75153 | Labels: ['edge line']
📂 Frame: frame_00004.jpg | Location: Unknown | Lat: 19.0957, Lon: 74.75153 | Labels: ['edge line']
📂 Frame: frame_00005.jpg | Location: Unknown | Lat: 19.0957, Lon: 74.75153 | Labels: ['edge line']
📂 Frame: frame_00006.jpg | Location: Unknown | Lat: 19.0957, Lon: 74.75153 | Labels: ['edge line']
📂 Frame: frame_00007.jpg | Location: Unknown | Lat: 19.0957, Lon: 74.75153 | Labels: ['edge line']
📂 Frame: frame_00008.jpg | Location: Unknown

In [5]:
# as we can see in above csv file only latitude and longitude is extracted
# this script extract the location from given co-ordinates
!pip install geopy



import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# --- Step 3: Load your data ---
# This line assumes your CSV file is named 'defect_summary_filled.csv' and is uploaded to Colab.
try:
    df = pd.read_csv(r"G:\Group_2_Road_Defect_Analysis\Prediction_Output\labels\defect_summary.csv")
    print("DataFrame loaded successfully.")
    print("First 5 rows of the loaded DataFrame:")
    print(df.head())
except FileNotFoundError:
    print("Error: 'defect_summary_filled.csv' not found.")
    print("Please ensure the file is uploaded to your Colab session storage (Step 2).")
    # If the file is not found, the rest of the script cannot proceed.
    exit() # Exit the script if the file isn't found

# --- Step 4: Initialize the geocoder ---
# We are using Nominatim, which is a geocoding service based on OpenStreetMap data.
# IMPORTANT: Provide a unique 'user_agent' string (e.g., your app name or email).
# This is required by Nominatim's usage policy to identify your application.
geolocator = Nominatim(user_agent="my_colab_geocoding_project")

# --- Step 5: Implement a rate limiter to avoid hitting API limits ---
# Geocoding services have usage policies and often rate limits (e.g., 1 request per second).
# The RateLimiter will automatically pause requests to ensure compliance.
# This is CRUCIAL to prevent your IP from being temporarily blocked by the service.
geocode = RateLimiter(geolocator.reverse, min_delay_seconds=1)

# --- Step 6: Define a helper function to get location for a single row ---
def get_location_from_coords(row):
    # Only attempt geocoding if both latitude and longitude are valid numbers (not NaN)
    if pd.notna(row['latitude']) and pd.notna(row['longitude']):
        try:
            # Create a tuple of coordinates (latitude, longitude)
            coords = (row['latitude'], row['longitude'])
            # Call the reverse geocoding service
            location = geocode(coords)
            # If a location is found, return its full address, otherwise return None
            return location.address if location else None
        except Exception as e:
            # Print any errors encountered during the geocoding request for debugging
            print(f"Error geocoding coordinates ({row['latitude']}, {row['longitude']}): {e}")
            return None
    return None # Return None if coordinates are missing or invalid

# --- Step 7: Prepare the 'location' column for filling ---
# Ensure that any existing 'Unknown' or 'NA' string values in the 'location' column
# are converted to actual pandas NaN (Not a Number) values. This allows `pd.isna()`
# to correctly identify which cells need to be filled.
df['location'] = df['location'].replace(['Unknown', 'NA'], pd.NA)

# --- Step 8: Apply the geocoding function to fill missing locations ---
print("\nStarting geocoding process. This might take a while depending on:")
print("  - The number of missing locations.")
print("  - The rate limit of the geocoding API (e.g., 1 second per request).")

# Identify rows where the 'location' column is currently missing (NaN)
missing_location_mask = df['location'].isna()

# Apply the 'get_location_from_coords' function ONLY to the rows
# where the 'location' is missing. This optimizes the process.
df.loc[missing_location_mask, 'location'] = df[missing_location_mask].apply(get_location_from_coords, axis=1)

print("\nGeocoding process complete.")

# --- Step 9: Display the updated DataFrame and check for any remaining missing values ---
print("\nDataFrame after attempting to fill missing locations:")
print(df.head())

print("\nNumber of missing values in 'location' column after geocoding attempt:")
print(df['location'].isna().sum())

# --- Step 10: (Optional) Save the updated DataFrame to a new CSV file ---
# You can download this new CSV file from Colab's file browser after it's created.
#df.to_csv('defect_summary_geocoded_updated.csv', index=False)
# print("\nUpdated DataFrame saved to 'defect_summary_geocoded_updated.csv'")

DataFrame loaded successfully.
First 5 rows of the loaded DataFrame:
             frame location  latitude  longitude    defects  count
0  frame_00001.jpg  Unknown   19.0957   74.75153  edge line      1
1  frame_00002.jpg  Unknown   19.0957   74.75153  edge line      1
2  frame_00003.jpg  Unknown   19.0957   74.75153  edge line      1
3  frame_00004.jpg  Unknown   19.0957   74.75153  edge line      1
4  frame_00005.jpg  Unknown   19.0957   74.75153  edge line      1

Starting geocoding process. This might take a while depending on:
  - The number of missing locations.
  - The rate limit of the geocoding API (e.g., 1 second per request).


  return cls(*args)


Error geocoding coordinates (19092632.0, 74.76649): Must be a coordinate pair or Point

Geocoding process complete.

DataFrame after attempting to fill missing locations:
             frame                                           location  \
0  frame_00001.jpg  Foi, अहिल्यानगर, Ahmednagar, Maharashtra, 4140...   
1  frame_00002.jpg  Foi, अहिल्यानगर, Ahmednagar, Maharashtra, 4140...   
2  frame_00003.jpg  Foi, अहिल्यानगर, Ahmednagar, Maharashtra, 4140...   
3  frame_00004.jpg  Foi, अहिल्यानगर, Ahmednagar, Maharashtra, 4140...   
4  frame_00005.jpg  Foi, अहिल्यानगर, Ahmednagar, Maharashtra, 4140...   

   latitude  longitude    defects  count  
0   19.0957   74.75153  edge line      1  
1   19.0957   74.75153  edge line      1  
2   19.0957   74.75153  edge line      1  
3   19.0957   74.75153  edge line      1  
4   19.0957   74.75153  edge line      1  

Number of missing values in 'location' column after geocoding attempt:
1


In [7]:
# You can download this new CSV file from Colab's file browser after it's created.
df.to_csv('G:\Group_2_Road_Defect_Analysis\Prediction_Output\labels\defect_summary_geocoded_updated.csv', index=False)
print("\nUpdated DataFrame saved to 'defect_summary_geocoded_updated.csv'")


Updated DataFrame saved to 'defect_summary_geocoded_updated.csv'


  df.to_csv('G:\Group_2_Road_Defect_Analysis\Prediction_Output\labels\defect_summary_geocoded_updated.csv', index=False)


In [8]:
#there is text written in marathi but as it only refers to ahmednagar itself we removed it using this script

import pandas as pd

# Load the CSV (without 'errors' argument)
input_file = r'G:\Group_2_Road_Defect_Analysis\Prediction_Output\labels\defect_summary_geocoded_updated.csv'  # Update path if needed
df = pd.read_csv(input_file, encoding='utf-8')

# Function to clean location text by removing non-ASCII parts
def remove_garbled_marathi(text):
    if isinstance(text, str):
        # Keep only parts that contain mostly ASCII characters
        parts = text.split(",")
        cleaned_parts = [part.strip() for part in parts if all(ord(c) < 128 for c in part)]
        return ", ".join(cleaned_parts)
    return text

# Apply the function to the 'location' column
df['location'] = df['location'].apply(remove_garbled_marathi)

# Save the cleaned DataFrame
output_file = r"G:\Group_2_Road_Defect_Analysis\Prediction_Output\labels\summary_cleaned.csv"
df.to_csv(output_file, index=False, encoding='utf-8')

print(f"✅ Cleaned CSV saved to: {output_file}")


✅ Cleaned CSV saved to: G:\Group_2_Road_Defect_Analysis\Prediction_Output\labels\summary_cleaned.csv


In [10]:
import pandas as pd

# Load your CSV file
csv_path = r"G:\Group_2_Road_Defect_Analysis\Prediction_Output\labels\summary_cleaned.csv"
df = pd.read_csv(csv_path)

# Remove "Foi, " if it appears at the start of the location
df['location'] = df['location'].str.replace(r'^Foi,\s*', '', regex=True)

# Save the cleaned CSV
df.to_csv(csv_path, index=False)
print("✅ Removed 'Foi,' from location column and saved updated CSV.")


✅ Removed 'Foi,' from location column and saved updated CSV.
