In [7]:

import json
import pandas as pd
def json_to_dataframe(file_path):
        """
        Read a JSON file and convert it to a Pandas DataFrame.

        Parameters:
        - file_path: str, path to the JSON file.
        - csv_file_name: str, name of the output CSV file (default is 'output.csv').

        Returns:
        - DataFrame containing the JSON data.
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        df = pd.DataFrame(data)

        print(df.head())


        return df

    # Example usage of json_to_dataframe()
    #file_path = "Outscraper-20241011183106xs96_fine_dining_restaurant.json"
    #file_path = ""
    #  path up^^^^^^
    #df = json_to_dataframe(file_path)


In [8]:
df = json_to_dataframe("/Users/remaalnssiry/code/Remafsa/geo-pic/geo-pic/data/sample.json")


          query          name              google_id  \
0  Jax District  Jax District  24.7451938,46.5353316   
1  Jax District  Jax District  24.7451938,46.5353316   
2  Jax District  Jax District  24.7451938,46.5353316   
3  Jax District  Jax District  24.7451938,46.5353316   
4  Jax District  Jax District  24.7451938,46.5353316   

                      place_id  \
0  ChIJ5dLZmB_hLj4R-XYc8tPWpJ8   
1  ChIJ5dLZmB_hLj4R-XYc8tPWpJ8   
2  ChIJ5dLZmB_hLj4R-XYc8tPWpJ8   
3  ChIJ5dLZmB_hLj4R-XYc8tPWpJ8   
4  ChIJ5dLZmB_hLj4R-XYc8tPWpJ8   

                                       location_link  \
0  https://maps.google.com/?cid=11503555553999484665   
1  https://maps.google.com/?cid=11503555553999484665   
2  https://maps.google.com/?cid=11503555553999484665   
3  https://maps.google.com/?cid=11503555553999484665   
4  https://maps.google.com/?cid=11503555553999484665   

                                            photo_id  \
0  AdCG2DM2RR9zbOaySKJHoCaCpmSOaOMN7wr-HXKioAOcSG...   
1  AdCG2D

In [9]:
df.columns

Index(['query', 'name', 'google_id', 'place_id', 'location_link', 'photo_id',
       'photo_url', 'photo_url_big', 'latitude', 'longitude', 'photo_date',
       'photo_upload_source', 'photo_source_video', 'photo_tags',
       'photo_tag_ids', 'original_photo_url'],
      dtype='object')

In [10]:
import json
import requests
import pandas as pd
import os

In [13]:

def download_images_from_df(df, url_column='photo_url', lat_column='latitude', lon_column='latitude', save_folder='Images', save_csv=False, csv_file_name='output.csv'):
        """
        Download images from a DataFrame and save them locally in a specified folder.

        Parameters:
        - df: DataFrame containing image URLs, latitudes, and longitudes.
        - url_column: str, name of the column containing the image URLs (default is 'photo_url').
        - lat_column: str, name of the column containing the latitudes (default is 'LAT').
        - lon_column: str, name of the column containing the longitudes (default is 'LON').
        - save_folder: str, folder where the images will be saved (default is 'Images').

        Returns:
        - df: The updated DataFrame with the 'IMG_FILE' column.
        """
        # Create a folder to save images if it doesn't already exist
        os.makedirs(save_folder, exist_ok=True)
        print(f"Images will be saved in: {save_folder}")

        # Dictionary to count occurrences of each latitude/longitude combination
        counter = {}

        # Iterate through the DataFrame row by row
        for index, row in df.iterrows():
            # Extract the image URL, latitude, and longitude from the current row
            image_url = row[url_column]
            latitude = row[lat_column]   # Updated reference
            longitude = row[lon_column]   # Updated reference

            # Create a unique key for the latitude and longitude
            location_key = f"{latitude}_{longitude}"

            # Increment the counter for each unique location
            if location_key in counter:
                counter[location_key] += 1
            else:
                counter[location_key] = 1

            # Define the image file name using latitude, longitude, and the counter
            image_name = os.path.join(save_folder, f"{latitude}_{longitude}_{counter[location_key]}.jpg")

            try:
                # Send a GET request to the image URL with a timeout of 10 seconds
                response = requests.get(image_url, timeout=10)

                # Check if the response indicates success (status code 200)
                if response.status_code == 200:
                    # Save the image content to the specified file
                    with open(image_name, 'wb') as file:
                        file.write(response.content)
                    print(f"Downloaded: {image_name}")

                    # Update the 'IMG_FILE' column in the DataFrame with the saved image path
                    df.at[index, 'IMG_FILE'] = image_name
                elif response.status_code == 400:
                    # Handle bad request errors
                    print(f"Skipping image from {image_url} (status code: {response.status_code})")
                else:
                    # Handle other unsuccessful status codes
                    print(f"Failed to download image from {image_url} (status code: {response.status_code})")
            except Exception as e:
                # Handle any exceptions that occur during the request
                print(f"Error downloading image from {image_url}: {e}")

        # If saving the updated DataFrame as a CSV file is desired, do that
        if save_csv:
            df.to_csv(csv_file_name, index=False)
            print(f"DataFrame saved to {csv_file_name}")

        return df
    # Example DataFrame setup (replace this with the actual DataFrame)
    # Call the function to download images from the DataFrame
    #df_p = download_images_from_df(df, save_csv=True, csv_file_name='geo_pic.csv')
    # Display the updated DataFrame
    #print(df_p)


In [14]:
df = download_images_from_df(df)

Images will be saved in: Images
Downloaded: Images/24.7451938_24.7451938_1.jpg
Downloaded: Images/24.7451938_24.7451938_2.jpg
Downloaded: Images/24.7451938_24.7451938_3.jpg
Downloaded: Images/24.7451938_24.7451938_4.jpg
Downloaded: Images/24.7451938_24.7451938_5.jpg
Downloaded: Images/24.7451938_24.7451938_6.jpg
Downloaded: Images/24.7451938_24.7451938_7.jpg
Downloaded: Images/24.7451938_24.7451938_8.jpg
Downloaded: Images/24.7368086_24.7368086_1.jpg
Downloaded: Images/24.7368086_24.7368086_2.jpg
Downloaded: Images/24.7368086_24.7368086_3.jpg
Downloaded: Images/24.7368086_24.7368086_4.jpg
Downloaded: Images/24.7368086_24.7368086_5.jpg
Downloaded: Images/24.7368086_24.7368086_6.jpg
Downloaded: Images/24.7368086_24.7368086_7.jpg
Downloaded: Images/24.7368086_24.7368086_8.jpg
Downloaded: Images/24.7368086_24.7368086_9.jpg
Downloaded: Images/24.7368086_24.7368086_10.jpg
Downloaded: Images/25.3170213_25.3170213_1.jpg
Downloaded: Images/25.3170213_25.3170213_2.jpg
Downloaded: Images/25.31702

In [17]:
def save_semi_clean_data(df, lat_column='latitude', lon_column='longitude', IMG_FILE_column='IMG_FILE', output_file='semi_clean_data.csv'):
        """
        Save a DataFrame with only latitude, longitude, and image path to a CSV file in the current directory.

        Parameters:
        - df: DataFrame containing the original data.
        - lat_column: str, name of the column containing latitudes (default is 'latitude').
        - lon_column: str, name of the column containing longitudes (default is 'longitude').
        - IMG_FILE_column: str, name of the column containing image paths (default is 'IMG_FILE').
        - output_file: str, name of the output CSV file (default is 'semi_clean_data.csv').
        """
        # select only the columns interested in
        semi_clean_df = df[[lat_column, lon_column, IMG_FILE_column]]

        # Get the current working directory  to save the file
        current_directory = os.getcwd()

        # Create the full path for the output file with the specified name
        output_path = os.path.join(current_directory, output_file)

        # Save the new DataFrame to a CSV file at the specified location
        semi_clean_df.to_csv(output_path, index=False)
        print(f"Semi-clean data saved to {output_path}")

    # Example usage of save_semi_clean_data():
    # I have 'geo_pic.csv' as my input file, which will be loaded into a DataFrame
    #file_path = "geo_pic.csv"
    #file_path = ""
    # Load the DataFrame from the CSV file
    #df = pd.read_csv(file_path)
    # Call the function to save the semi-clean data  just created
    #save_semi_clean_data(df_p)


In [18]:
save_semi_clean_data(df)

Semi-clean data saved to /Users/remaalnssiry/code/Remafsa/geo-pic/geo-pic/geoclip/semi_clean_data.csv


In [3]:
# Paths to your dataset
csv_file_path = '/Users/remaalnssiry/code/Remafsa/geo-pic/geo-pic/data/one_file.csv'
images_folder_path = '/Users/remaalnssiry/code/Remafsa/geo-pic/geo-pic/geoclip/Images'

In [4]:
from train.dataloader import *

train_transform = img_train_transform()

train_loader = GeoDataLoader(csv_file_path, images_folder_path, transform=img_train_transform())

Loading image paths and coordinates: 1it [00:00, 301.38it/s]


In [6]:
print(train_loader.__dict__)

{'dataset_folder': '/Users/remaalnssiry/code/Remafsa/geo-pic/geo-pic/geoclip/Images', 'transform': Compose(
    RandomResizedCrop(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bilinear, antialias=True)
    RandomHorizontalFlip(p=0.5)
    RandomApply(
    p=0.8
    ColorJitter(brightness=(0.6, 1.4), contrast=(0.6, 1.4), saturation=(0.6, 1.4), hue=(-0.1, 0.1))
)
    RandomGrayscale(p=0.2)
    PILToTensor()
    ConvertImageDtype()
    Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
), 'images': [], 'coordinates': []}


In [57]:
from model import GeoCLIP
model = GeoCLIP()

  self.load_state_dict(torch.load(f"{file_dir}/weights/location_encoder_weights.pth"))
  self.image_encoder.mlp.load_state_dict(torch.load(f"{self.weights_folder}/image_encoder_mlp_weights.pth"))
  self.location_encoder.load_state_dict(torch.load(f"{self.weights_folder}/location_encoder_weights.pth"))
  self.logit_scale = nn.Parameter(torch.load(f"{self.weights_folder}/logit_scale_weights.pth"))


In [58]:
from train import *

In [59]:
import torch
import torch.nn as nn  # Importing nn for loss functions and models
import torch.optim as optim  # Importing optim for optimizers
from tqdm import tqdm  # Importing tqdm for progress bar

In [68]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set batch size
batch_size = 20  # Example batch size

# Define your optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-4)  # Adjust learning rate as needed

# Number of epochs
num_epochs = 10  # Total number of epochs to train
epoch = 10
criterion = nn.CrossEntropyLoss()  # Initialize the loss function


In [69]:
train(train_loader, model, optimizer, epoch, batch_size, device, scheduler=None, criterion=nn.CrossEntropyLoss())

Starting Epoch 10


0it [00:00, ?it/s]
