# Install dependencies

In [1]:
! pip install --upgrade transformers --quiet
! pip install torch --upgrade --quiet
! pip install bitsandbytes accelerate --upgrade --quiet
! pip install einops --quiet
! pip install tqdm --quiet

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.4.0 requires torch==2.4.0, but you have torch 2.5.1 which is incompatible.
torchvision 0.19.0 requires torch==2.4.0, but you have torch 2.5.1 which is incompatible.[0m[31m
[0m

# Load Molmo

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig, BitsAndBytesConfig
from PIL import Image
import requests

load_in_8bit = False
load_in_4bit = True
mixed_precision = "bf16"

device_map = "auto"
torch_dtype = torch.bfloat16

quantization_config = BitsAndBytesConfig(
        load_in_8bit=load_in_8bit, load_in_4bit=load_in_4bit
)

# load the processor
processor = AutoProcessor.from_pretrained(
    'allenai/Molmo-7B-D-0924',
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.bfloat16,
)

# load the model
model = AutoModelForCausalLM.from_pretrained(
    'allenai/Molmo-7B-D-0924',
    trust_remote_code=True,
    quantization_config=quantization_config,
    device_map=device_map,
    torch_dtype=torch.bfloat16,
)




Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]

model-00003-of-00007.safetensors:  76%|#######6  | 3.60G/4.73G [00:00<?, ?B/s]

model-00004-of-00007.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00005-of-00007.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00006-of-00007.safetensors:   0%|          | 0.00/4.27G [00:00<?, ?B/s]

model-00007-of-00007.safetensors:   0%|          | 0.00/3.80G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

In [3]:
def prompt_molmo(image_path, artifacts):

  prompt = '''
  SYSTEM PROMPT:

  You are a helpful assistant that describes given errors and artifacts in images. Given the artifact, describe instances in the image where the error/artifact occurs.

  Strictly follow the json schema below when you respond.

  JSON schema:

  {"description": "..."}

  For example, if the user says:
  Here is the artifact:- Artifact_1

  You will give the user:-
  {"description": "<description of Artifact_1 in the image>"}

  DO NOT GIVE DESCRIPTION OF EXTRA ARTIFACTS.
  ONLY THE GIVEN ARTIFACT HAS TO BE DESCRIBED.
  
  Ensure that your description is precise. Restrict it to one or two lines.

  USER:
  Here is the artifact:- '''+artifacts


  img = Image.open(image_path)
  img = img.resize((128,128))
  inputs = processor.process(
      images=[img],
      text="{}".format(prompt),
  )
  inputs["images"] = inputs["images"].to(torch.bfloat16)

  inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
  with torch.no_grad():
          with torch.cuda.amp.autocast(dtype=torch.bfloat16):
                  output = model.generate_from_batch(
                      inputs,
                      GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
                      tokenizer=processor.tokenizer
                  )

                  # only get generated tokens; decode them to text
                  generated_tokens = output[0,inputs['input_ids'].size(1):]
                  generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
                  return generated_text

In [4]:
sr_artifacts = [
    "Incorrect reflection mapping",
    "Abruptly cut off objects",
    "Ghosting effects: Semi-transparent duplicates of elements",
    "Dental anomalies in mammals",
    "Anatomically incorrect paw structures",
    "Unrealistic eye reflections",
    "Misshapen ears or appendages",
    "Unnatural pose artifacts",
    "Biological asymmetry errors",
    "Impossible foreshortening in animal bodies",
    "Impossible mechanical connections",
    "Impossible mechanical joints",
    "Physically impossible structural elements",
    "Incorrect wheel geometry",
    "Implausible aerodynamic structures",
    "Misaligned body panels",
    "Distorted window reflections",
    "Anatomically impossible joint configurations",
    "Non-manifold geometries in rigid structures",
    "Asymmetric features in naturally symmetric objects",
    "Misaligned bilateral elements in animal faces",
    "Irregular proportions in mechanical components",
    "Inconsistent scale of mechanical parts",
    "Incorrect perspective rendering",
    "Scale inconsistencies within single objects",
    "Spatial relationship errors",
    "Scale inconsistencies within the same object class",
    "Depth perception anomalies"
]
non_sr_artifacts = [
    "Improper fur direction flows",
    "Incorrect skin tones",
    "Inconsistent object boundaries",
    "Blurred boundaries in fine details",
    "Over-sharpening artifacts",
    "Excessive sharpness in certain image regions",
    "Aliasing along high-contrast edges",
    "Jagged edges in curved structures",
    "Fake depth of field",
    "Artificial depth of field in object presentation",
    "Discontinuous surfaces",
    "Unnaturally glossy surfaces",
    "Metallic surface artifacts",
    "Texture bleeding between adjacent regions",
    "Texture repetition patterns",
    "Over-smoothing of natural textures",
    "Regular grid-like artifacts in textures",
    "Artificial noise patterns in uniform surfaces",
    "Random noise patterns in detailed areas",
    "Repeated element patterns",
    "Systematic color distribution anomalies",
    "Unnatural color transitions",
    "Color coherence breaks",
    "Frequency domain signatures",
    "Artificial smoothness",
    "Cinematization effects",
    "Movie-poster-like composition of ordinary scenes",
    "Exaggerated characteristic features",
    "Synthetic material appearance",
    "Floating or disconnected components",
    "Inconsistent material properties",
    "Depth perception anomalies",
    "Loss of fine detail in complex structures",
    "Resolution inconsistencies within regions",
    "Abruptly cut off objects",
    "Unrealistic specular highlights",
]

In [5]:
# Load the JSON file
import json
with open("Artifacts_detected.json", "r") as file:
    results = json.load(file)


# Generate Artifact description

In [6]:
import pandas as pd
import re
import json

# Define the pattern
pattern = r'"description":\s*"([^"]*)"'

# Load data
data = pd.read_csv("Artifact_descriptions/Artifact_description_Tuple.csv")

# Initialize submission dictionary
submission = {}

# Define image folder
image_folder = "input_imgs"

# Image counter
image_counter = 0

# Process each image
for image in results.keys():
    image_explanation = {}
    if image in submission.keys():
        continue
    for artifact in results[image]:
        
        # Get artifact description
        desc = data['Description'].loc[data['Artifact'] == artifact]
        
        # Generate explanation based on artifact type
        if artifact in non_sr_artifacts:
            explanation = prompt_molmo(f"very_big_img/{image[:-4]}"+"_resized_DRCT-L_X4.png", artifact)
        else:
            explanation = prompt_molmo(f"very_big_img/{image[:-4]}"+"_resized_DRCT-L_X4.png", artifact)
        
        # Extract explanation from JSON-like response
        match = re.search(pattern, explanation)
        explanation = match.group(1) if match else "No description available"

        # Store explanation
        image_explanation[artifact] = explanation
    
    # Store processed explanations for the image
    submission[image] = image_explanation
    # Increment image counter
    image_counter += 1

    # Save every 10 images
    if len(submission) % 10 == 0:
        # with open(f"submission_{image_counter // 10}.json", "w") as f:
        #     json.dump(submission, f, indent=4)
        # 
        print(f"{len(submission)} images processed and saved!")

# Final save for any remaining images
if submission:
    with open(f"submission_final.json", "w") as f:
        json.dump(submission, f, indent=4)
    print(f"Final save complete! Total images processed: {image_counter}")


  with torch.cuda.amp.autocast(dtype=torch.bfloat16):


10 images processed and saved!
Final save complete! Total images processed: 10


In [7]:
import json
input_data = submission
output_data = []

# Process the input data
for key, value in input_data.items():
    # Remove the file extension from the image name
    index = key.split('.')[0]
    
    # Append the transformed structure
    output_data.append({
        "index": index,
        "explanation": value
    })

output_json = "/artifacts_with_description_submission.json"
with open(output_json, 'w') as json_file:
        json.dump(output_data, json_file, indent=4)
print("JSON dumped successfully, please check artifacts_with_description_submission.json")


JSON dumped successfully, please check artifacts_with_description_submission.json
