MOOC Challenge DSTL a binary file consisting of a collection of JPEG files with additional binary data in between is given. 

In [21]:
# Name the binary file we need to decode
Original_code = "dstl_MOOC_Challenge_v1.bin"


In [22]:
# Code to find jpeg markers and returns three lists, markers, trailers and lengths between
def find_jpeg_markers(filename):
    try:
        # Open the binary file in read mode
        with open(filename, 'rb') as file:
            data = file.read()

            # Create two empty lists to fill with markers
            markers = []
            trailers = []

            # Find the first JPEG marker
            ## JPEG
            marker_start = data.find(b'\xFF\xD8\xFF')

            # while we have a marker start address to work with, look for the trailer after this
            while marker_start != -1:
                # Find the corresponding PNG trailer after the marker
                trailer_start = data.find(b'\xFF\xD9', marker_start)

                # If both marker and trailer were found, add their addresses to the lists
                if trailer_start != -1:
                    # add to lists 
                    markers.append(marker_start)
                    trailers.append(trailer_start)

                    # start the process again one character after the trailer was found
                    marker_start = data.find(b'\xFF\xD8\xFF', trailer_start + 1)
                else:
                    break

            # If markers were found, calculate the lengths of the files between them
            if markers:
                # length of file is the trailer address minus the marker start address plus the length of the trailer marker
                # do this for each entry in both the trailer and marker lists
                lengths_between = [trailers[i] - markers[i] + len(b'\xFF\xD9') for i in range(len(markers))]
                
                # Return the lists of marker and trailer addresses along with the lengths
                return markers, trailers, lengths_between
            else:
                # If no markers were found, return None
                return None
    except FileNotFoundError: # in case there is no binary file with that name - error handling is important!
        print("File not found.")
        return None


Extract individual image files

In [23]:
# split into individual JPEGs by using find_jpeg_markers, and then using the position of each marker plus the lengths of each jpeg.
def split_jpegs(filename, output_prefix="output"):
    # Find JPEG markers and trailers
    result = find_jpeg_markers(filename)

    if result is not None:
        markers, trailers, lengths_between = result

        # Open the binary file
        with open(filename, 'rb') as file:
            for i in range(len(markers)):
                # Seek to the position after the start marker
                file.seek(markers[i] + 2)  # Skip the \xFF\xD8 bytes

                # Read and extract the JPEG data between the markers
                jpeg_data = file.read(lengths_between[i] - 2)  # Exclude the start marker

                # Create a separate JPEG file with a custom name
                output_filename = f'{output_prefix}_{i+1}.jpg'
                with open(output_filename, 'wb') as jpeg_file:
                    jpeg_file.write(b'\xFF\xD8')  # Write the start marker of the JPEG
                    jpeg_file.write(jpeg_data)    # Write the extracted data
                    jpeg_file.write(b'\xFF\xD9')  # Write the end marker f the JPEG
                
                print(f"Extracted JPG saved as '{output_filename}'")

        print(f"{len(markers)} JPG files have been extracted.")
    else:
        print("No JPG markers found in the file.")

split_jpegs(Original_code)

Extracted JPG saved as 'output_1.jpg'
Extracted JPG saved as 'output_2.jpg'
Extracted JPG saved as 'output_3.jpg'
Extracted JPG saved as 'output_4.jpg'
Extracted JPG saved as 'output_5.jpg'
Extracted JPG saved as 'output_6.jpg'
Extracted JPG saved as 'output_7.jpg'
Extracted JPG saved as 'output_8.jpg'
Extracted JPG saved as 'output_9.jpg'
Extracted JPG saved as 'output_10.jpg'
Extracted JPG saved as 'output_11.jpg'
Extracted JPG saved as 'output_12.jpg'
Extracted JPG saved as 'output_13.jpg'
Extracted JPG saved as 'output_14.jpg'
Extracted JPG saved as 'output_15.jpg'
Extracted JPG saved as 'output_16.jpg'
Extracted JPG saved as 'output_17.jpg'
Extracted JPG saved as 'output_18.jpg'
Extracted JPG saved as 'output_19.jpg'
Extracted JPG saved as 'output_20.jpg'
Extracted JPG saved as 'output_21.jpg'
Extracted JPG saved as 'output_22.jpg'
Extracted JPG saved as 'output_23.jpg'
Extracted JPG saved as 'output_24.jpg'
Extracted JPG saved as 'output_25.jpg'
Extracted JPG saved as 'output_26.

Extract binary between image files

In [24]:
def extract_non_jpeg_binary(input_file_name, output_file_name):
    try:
        # Read the content of the input binary file
        with open(input_file_name, 'rb') as input_file:
            input_data = input_file.read()
        
        non_jpeg_data = b""  # Initialize an empty binary string for non-JPEG data

        # Split the binary data based on JPEG magic numbers (SOI and EOI)
        jpeg_magic_start = b"\xFF\xD8"  # Start of Image (SOI) marker
        jpeg_magic_end = b"\xFF\xD9"    # End of Image (EOI) marker

        start_index = 0
        while True:
            # Find the start and end indices of JPEG data
            jpeg_start = input_data.find(jpeg_magic_start, start_index)
            jpeg_end = input_data.find(jpeg_magic_end, start_index)
            
            if jpeg_start == -1 or jpeg_end == -1:
                break
            
            # Append the non-JPEG data between JPEG images
            non_jpeg_data += input_data[start_index:jpeg_start]
            start_index = jpeg_end + 2  # Move the start index to the end of the current JPEG
            
        # Write the non-JPEG binary data to the output file
        with open(output_file_name, 'wb') as output_file:
            output_file.write(non_jpeg_data)
        
        print(f'Non-JPEG data extracted and saved to {output_file_name}')
    
    except FileNotFoundError as e:
        print(f"Error: {e}")

# Now extract the binary data from our Original_code 

extract_non_jpeg_binary(Original_code, 'non_jpeg_challenge.bin')


Non-JPEG data extracted and saved to non_jpeg_challenge.bin


Having identified MP3 markers, extract the MP3 files in the same way as we extracted JPG files

In [25]:
# Code to find MP3 markers and returns three lists, markers, trailers and lengths between
def find_MP3_markers(filename):
    try:
        # Open the binary file in read mode
        with open(filename, 'rb') as file:
            data = file.read()

            # Create two empty lists to fill with markers
            markers = []
            trailers = []

            # Find the first MP3 marker
            ## MP3
            marker_start = data.find(b'\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xBB\xFF\xBB\xFF\xBB\xFF\xBB')

            # while we have a marker start address to work with, look for the trailer after this
            while marker_start != -1:
                # Find the corresponding MP3 trailer after the marker
                trailer_start = data.find(b'\x28\x03\x4D\x42\x00\x00\x4C\x42\x00\x00\x70\x41\x6E\xA3\x31\x41\x9B\x8E\xD8\xBF\x00\x00\x80\xBF\x00\x00\x24\x42\x6D\x56\xF5\x41\x00\x00\xFA\x43\x00\x7B\x92\x4C\x42\x00\x00\x4C\x42\x00\x00\x00\x41\xC6\xDC\x0B\x42\x79\xEA\xED\xBF\x00\x00\x80\xBF\x00\x00\x4C\x42\x12\x14\xFB\x41\x00\x80\x3B\x45\x00\x5A\x86\x4C\x42\x00\x00\x4C\x42\x00\x00\xE0\x40\x82\x47\x57\x42\x6F\x11\xDA\xBF\x00\x00\x80\xBF\x00\x00\x28\x42\xFC\xB9\x91\x41\x00\x00\x7A\x45\x01\x97\x1B\x4D\x42\x00\x00\x4C\x42\x00\x00\x80\x41\x5D\xFE\x13\x42\x49\xF4\xEC\xBF\x00\x00\x80\xBF\x00\x00\x4C\x42\xE5\xD0\x8A\x40\x00\x00\x48\x43\x00', marker_start)

                # If both marker and trailer were found, add their addresses to the lists
                if trailer_start != -1:
                    # add to lists 
                    markers.append(marker_start)
                    trailers.append(trailer_start)

                    # start the process again one character after the trailer was found
                    marker_start = data.find(b'\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xBB\xFF\xBB\xFF\xBB\xFF\xBB', trailer_start + 1)
                else:
                    break

            # If markers were found, calculate the lengths of the files between them
            if markers:
                # length of file is the trailer address minus the marker start address plus the length of the trailer marker
                # do this for each entry in both the trailer and marker lists
                lengths_between = [trailers[i] - markers[i] + len(b'\x28\x03\x4D\x42\x00\x00\x4C\x42\x00\x00\x70\x41\x6E\xA3\x31\x41\x9B\x8E\xD8\xBF\x00\x00\x80\xBF\x00\x00\x24\x42\x6D\x56\xF5\x41\x00\x00\xFA\x43\x00\x7B\x92\x4C\x42\x00\x00\x4C\x42\x00\x00\x00\x41\xC6\xDC\x0B\x42\x79\xEA\xED\xBF\x00\x00\x80\xBF\x00\x00\x4C\x42\x12\x14\xFB\x41\x00\x80\x3B\x45\x00\x5A\x86\x4C\x42\x00\x00\x4C\x42\x00\x00\xE0\x40\x82\x47\x57\x42\x6F\x11\xDA\xBF\x00\x00\x80\xBF\x00\x00\x28\x42\xFC\xB9\x91\x41\x00\x00\x7A\x45\x01\x97\x1B\x4D\x42\x00\x00\x4C\x42\x00\x00\x80\x41\x5D\xFE\x13\x42\x49\xF4\xEC\xBF\x00\x00\x80\xBF\x00\x00\x4C\x42\xE5\xD0\x8A\x40\x00\x00\x48\x43\x00') for i in range(len(markers))]
                
                # Return the lists of marker and trailer addresses along with the lengths
                return markers, trailers, lengths_between
            else:
                # If no markers were found, return None
                return None
    except FileNotFoundError: # in case there is no binary file with that name - error handling is important!
        print("File not found.")
        return None


Extract each individual MP3 file

In [26]:
# split into individual MP3s by using find_MP3_markers, and then using the position of each marker plus the lengths of each jpeg.
def split_MP3s(filename, output_prefix="output"):
    # Find MP3 markers and trailers
    result = find_MP3_markers(filename)

    if result is not None:
        markers, trailers, lengths_between = result

        # Open the binary file
        with open(filename, 'rb') as file:
            for i in range(len(markers)):
                # Seek to the position after the start marker
                file.seek(markers[i] + 2)  # Skip the \xFF\xD8 bytes

                # Read and extract the JPEG data between the markers
                MP3_data = file.read(lengths_between[i] - 2)  # Exclude the start marker

                # Create a separate MP3 file with a custom name
                output_filename = f'{output_prefix}_{i+1}.MP3'
                with open(output_filename, 'wb') as MP3_file:
                    MP3_file.write(b'\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xBB\xFF\xBB\xFF\xBB\xFF\xBB') # Write the start marker of the MP3
                    MP3_file.write(MP3_data)    # Write the extracted data
                    MP3_file.write(b'\x28\x03\x4D\x42\x00\x00\x4C\x42\x00\x00\x70\x41\x6E\xA3\x31\x41\x9B\x8E\xD8\xBF\x00\x00\x80\xBF\x00\x00\x24\x42\x6D\x56\xF5\x41\x00\x00\xFA\x43\x00\x7B\x92\x4C\x42\x00\x00\x4C\x42\x00\x00\x00\x41\xC6\xDC\x0B\x42\x79\xEA\xED\xBF\x00\x00\x80\xBF\x00\x00\x4C\x42\x12\x14\xFB\x41\x00\x80\x3B\x45\x00\x5A\x86\x4C\x42\x00\x00\x4C\x42\x00\x00\xE0\x40\x82\x47\x57\x42\x6F\x11\xDA\xBF\x00\x00\x80\xBF\x00\x00\x28\x42\xFC\xB9\x91\x41\x00\x00\x7A\x45\x01\x97\x1B\x4D\x42\x00\x00\x4C\x42\x00\x00\x80\x41\x5D\xFE\x13\x42\x49\xF4\xEC\xBF\x00\x00\x80\xBF\x00\x00\x4C\x42\xE5\xD0\x8A\x40\x00\x00\x48\x43\x00')  # Write the end marker of the MP3
                
                print(f"Extracted MP3 saved as '{output_filename}'")

        print(f"{len(markers)} MP3 files have been extracted.")
    else:
        print("No MP3 markers found in the file.")

split_MP3s("non_jpeg_challenge.bin")

Extracted MP3 saved as 'output_1.MP3'
Extracted MP3 saved as 'output_2.MP3'
Extracted MP3 saved as 'output_3.MP3'
Extracted MP3 saved as 'output_4.MP3'
Extracted MP3 saved as 'output_5.MP3'
Extracted MP3 saved as 'output_6.MP3'
Extracted MP3 saved as 'output_7.MP3'
Extracted MP3 saved as 'output_8.MP3'
Extracted MP3 saved as 'output_9.MP3'
Extracted MP3 saved as 'output_10.MP3'
Extracted MP3 saved as 'output_11.MP3'
Extracted MP3 saved as 'output_12.MP3'
Extracted MP3 saved as 'output_13.MP3'
Extracted MP3 saved as 'output_14.MP3'
Extracted MP3 saved as 'output_15.MP3'
Extracted MP3 saved as 'output_16.MP3'
Extracted MP3 saved as 'output_17.MP3'
Extracted MP3 saved as 'output_18.MP3'
Extracted MP3 saved as 'output_19.MP3'
Extracted MP3 saved as 'output_20.MP3'
Extracted MP3 saved as 'output_21.MP3'
Extracted MP3 saved as 'output_22.MP3'
Extracted MP3 saved as 'output_23.MP3'
Extracted MP3 saved as 'output_24.MP3'
Extracted MP3 saved as 'output_25.MP3'
Extracted MP3 saved as 'output_26.

Now Extract the remaining data between the MP3 files

In [27]:
def extract_non_MP3_binary(input_file_name, output_file_name):
    try:
        # Read the content of the input binary file
        with open(input_file_name, 'rb') as input_file:
            input_data = input_file.read()
        
        non_MP3_data = b""  # Initialize an empty binary string for non-JPEG data

        # Split the binary data based on JPEG magic numbers (SOI and EOI)
        MP3_magic_start = b"\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xAA\xFF\xBB\xFF\xBB\xFF\xBB\xFF\xBB"  # Start of MP3 marker
        MP3_magic_end = b"\x28\x03\x4D\x42\x00\x00\x4C\x42\x00\x00\x70\x41\x6E\xA3\x31\x41\x9B\x8E\xD8\xBF\x00\x00\x80\xBF\x00\x00\x24\x42\x6D\x56\xF5\x41\x00\x00\xFA\x43\x00\x7B\x92\x4C\x42\x00\x00\x4C\x42\x00\x00\x00\x41\xC6\xDC\x0B\x42\x79\xEA\xED\xBF\x00\x00\x80\xBF\x00\x00\x4C\x42\x12\x14\xFB\x41\x00\x80\x3B\x45\x00\x5A\x86\x4C\x42\x00\x00\x4C\x42\x00\x00\xE0\x40\x82\x47\x57\x42\x6F\x11\xDA\xBF\x00\x00\x80\xBF\x00\x00\x28\x42\xFC\xB9\x91\x41\x00\x00\x7A\x45\x01\x97\x1B\x4D\x42\x00\x00\x4C\x42\x00\x00\x80\x41\x5D\xFE\x13\x42\x49\xF4\xEC\xBF\x00\x00\x80\xBF\x00\x00\x4C\x42\xE5\xD0\x8A\x40\x00\x00\x48\x43\x00"    # End of MP3 marker

        start_index = 0
        while True:
            # Find the start and end indices of MP3 data
            MP3_start = input_data.find(MP3_magic_start, start_index)
            MP3_end = input_data.find(MP3_magic_end, start_index)
            
            if MP3_start == -1 or MP3_end == -1:
                break
            
            # Append the non-MP3 data between MP3 images
            non_MP3_data += input_data[start_index:MP3_start]
            start_index = MP3_end + 2  # Move the start index to the end of the current MP3
            
        # Write the non-MP3 binary data to the output file
        with open(output_file_name, 'wb') as output_file:
            output_file.write(non_MP3_data)
        
        print(f'Non-MP3 data extracted and saved to {output_file_name}')
    
    except FileNotFoundError as e:
        print(f"Error: {e}")

# Now extract the binary data from binary data with JPEGs removed.

extract_non_MP3_binary("non_jpeg_challenge.bin", 'non_MP3_challenge.bin')


Non-MP3 data extracted and saved to non_MP3_challenge.bin


non_MP3_chall

JSON to GPX for viewing

Create video

In [28]:
import json #import module

# Extracted protocol data
#taking a python dictionary and coverting it to a JSON object
protocol_data = { #this represents extracted data/rules
#key - value pairs 
#keys are just the names "ExtractedData1..." and values can be anything (ints/strings etc)
#This set of rules represents no real data and is just saying that the keys are named this and the data type found can be of a string(value1) or a int etc(42)
    "ExtractedData1": "value1",
    "ExtractedData2": 42,
    "ExtractedData3": 41,
    "ExtractedData4": ["item1", "item2"],
    # we can add more fields as needed
}

# Create a JSON object from the extracted data
json_data = json.dumps(protocol_data, indent=4) #converting python into json readable #indent=4 is for adding nesting (more human readable)

# Save the JSON to a file
with open("output.json", "w") as json_file: #so the data is viewable but this is just the rules for now
    json_file.write(json_data)

In [30]:
import re
import json

# Read the hex dump from a file
with open("non_MP3_challenge.bin", "rb") as file:
    hex_dump = file.read()

start_sync_word = b'\x4D\x42\x00\x00\x4C\x42'
end_sync_words = [b'\x00\x00\x7A\x45\x01\x97\x1B', b'\x00\x00\x48\x43\x00']

# Split data into blocks using the start and end sync words
data_blocks = re.split(b'|'.join([re.escape(start_sync_word), re.escape(end_sync_words[0]), re.escape(end_sync_words[1])]), hex_dump)

# Define a function to convert bytes to integers, strings, etc.
def bytes_to_int(data_bytes):
    return int.from_bytes(data_bytes, byteorder='big')

parsed_data = []


# ...
for block in data_blocks[1:]:  # Skip the first empty block
    if len(block) >= 40:  # Ensure the block is at least 40 bytes long
        extracted_data = []
        for i in range(10):  # Extract 10 "ExtractedData" fields
            data_bytes = block[i * 4:(i + 1) * 4]
            extracted_data.append(bytes_to_int(data_bytes))

        # Create a JSON object for this data block
        json_data = {f"ExtractedData{i + 1}": value for i, value in enumerate(extracted_data)}

        parsed_data.append(json_data)
# ...
# Serialize the JSON data
json_string = json.dumps(parsed_data, indent=4)

# Save or process the JSON data
print(json_string)  # Print JSON data for demonstration


# Save the JSON data to a file
with open("extracted_10.json", "w") as json_file:
    json_file.write(json_string)

print("JSON data saved to 'extracted_10.json'")


[
    {
        "ExtractedData1": 28737,
        "ExtractedData2": 1856188737,
        "ExtractedData3": 2609830079,
        "ExtractedData4": 32959,
        "ExtractedData5": 9282,
        "ExtractedData6": 1834415425,
        "ExtractedData7": 64067,
        "ExtractedData8": 8098380,
        "ExtractedData9": 1107296332,
        "ExtractedData10": 1107296256
    },
    {
        "ExtractedData1": 28737,
        "ExtractedData2": 1856188737,
        "ExtractedData3": 2609830079,
        "ExtractedData4": 32959,
        "ExtractedData5": 9282,
        "ExtractedData6": 1834415425,
        "ExtractedData7": 64067,
        "ExtractedData8": 8098380,
        "ExtractedData9": 1107296332,
        "ExtractedData10": 1107296256
    },
    {
        "ExtractedData1": 28737,
        "ExtractedData2": 1856188737,
        "ExtractedData3": 2609830079,
        "ExtractedData4": 32959,
        "ExtractedData5": 9282,
        "ExtractedData6": 1834415425,
        "ExtractedData7": 64067,
        "

In [33]:
import re
import json

# Read the hex dump from a file
with open("non_MP3_challenge.bin", "rb") as file:
    hex_dump = file.read()

start_sync_word = b'\x4D\x42\x00\x00\x4C\x42'
end_sync_words = [b'\x00\x00\x7A\x45\x01\x97\x1B', b'\x00\x00\x48\x43\x00']

# Split data into blocks using the start and end sync words
data_blocks = re.split(b'|'.join([re.escape(start_sync_word), re.escape(end_sync_words[0]), re.escape(end_sync_words[1])]), hex_dump)

# Define a function to convert bytes to integers, strings, etc.
def bytes_to_int(data_bytes):
    return int.from_bytes(data_bytes, byteorder='big')

parsed_data = []


# ...
for block in data_blocks[1:]:  # Skip the first empty block
    if len(block) <= 39:  # Ensure the block is at least 40 bytes long
        extracted_data = []
        for i in range(10):  # Extract 10 "ExtractedData" fields
            data_bytes = block[i * 4:(i + 1) * 4]
            extracted_data.append(bytes_to_int(data_bytes))

        # Create a JSON object for this data block
        json_data = {f"ExtractedData{i + 1}": value for i, value in enumerate(extracted_data)}

        parsed_data.append(json_data)
# ...
# Serialize the JSON data
json_string = json.dumps(parsed_data, indent=4)

# Save or process the JSON data
print(json_string)  # Print JSON data for demonstration


# Save the JSON data to a file
with open("extracted_small.json", "w") as json_file:
    json_file.write(json_string)

print("JSON data saved to 'extracted_small.json'")

[
    {
        "ExtractedData1": 0,
        "ExtractedData2": 0,
        "ExtractedData3": 0,
        "ExtractedData4": 0,
        "ExtractedData5": 0,
        "ExtractedData6": 0,
        "ExtractedData7": 0,
        "ExtractedData8": 0,
        "ExtractedData9": 0,
        "ExtractedData10": 0
    },
    {
        "ExtractedData1": 32833,
        "ExtractedData2": 1576932162,
        "ExtractedData3": 1240788159,
        "ExtractedData4": 32959,
        "ExtractedData5": 19522,
        "ExtractedData6": 3855649344,
        "ExtractedData7": 0,
        "ExtractedData8": 0,
        "ExtractedData9": 0,
        "ExtractedData10": 0
    },
    {
        "ExtractedData1": 0,
        "ExtractedData2": 0,
        "ExtractedData3": 0,
        "ExtractedData4": 0,
        "ExtractedData5": 0,
        "ExtractedData6": 0,
        "ExtractedData7": 0,
        "ExtractedData8": 0,
        "ExtractedData9": 0,
        "ExtractedData10": 0
    },
    {
        "ExtractedData1": 0,
        "Extrac

In [32]:
import json

# Read the JSON file
with open("extracted_10.json", "r") as json_file:
    json_data = json.load(json_file)

# Check if the JSON data is an array
if isinstance(json_data, list):
    num_blocks = len(json_data)
    print(f"Number of data blocks in the JSON file: {num_blocks}")
else:
    print("The JSON data is not in an array format.")

# Now you have the count of separate data blocks in the JSON file.

Number of data blocks in the JSON file: 87


In [34]:
import json

# Read the JSON file
with open("extracted_small.json", "r") as json_file:
    json_data = json.load(json_file)

# Check if the JSON data is an array
if isinstance(json_data, list):
    num_blocks = len(json_data)
    print(f"Number of data blocks in the JSON file: {num_blocks}")
else:
    print("The JSON data is not in an array format.")

# Now you have the count of separate data blocks in the JSON file.

Number of data blocks in the JSON file: 261


In [36]:
import re
import os

# Read the hex dump from a file
with open("dstl_MOOC_Challenge_v1.bin", "rb") as file:
    hex_dump = file.read()

# Create a directory to save the extracted bitmaps
output_directory = "bitmap_images"
os.makedirs(output_directory, exist_ok=True)

# Use regular expressions to find bitmap data
bitmap_data = re.findall(b'\x42\x4D(.*?)(?=\x42\x4D|\Z)', hex_dump, re.DOTALL)

for i, data in enumerate(bitmap_data):
    # Create a bitmap file by adding the bitmap file header (BM) to the data
    bitmap_file = b'\x42\x4D' + data
    
    with open(os.path.join(output_directory, f"bitmap_{i}.bmp"), "wb") as bmp_file:
        bmp_file.write(bitmap_file)

print(f"Extracted {len(bitmap_data)} bitmaps to {output_directory}")


Extracted 199 bitmaps to bitmap_images


In [1]:
# Create some example binary to demonstrate how to find markers and trailers

# Define the PNG marker and trailer
png_marker = b'\xFF\xD8\xFF'
png_trailer = b'\xFF\xD9'

# Create a binary file and write the PNG marker and trailer to it
with open('test.bin', 'wb') as file:
    file.write(png_marker)
    # Add some data between the marker and trailer (optional)
    file.write(b'Hello, this is some data between the marker and trailer.')
    file.write(png_trailer)

print("Test PNG file 'test.bin' has been created.")

Test PNG file 'test.bin' has been created.


In [2]:
def find_jpeg_markers(filename):
    try:
        # Open the binary file in read mode
        with open(filename, 'rb') as file:
            data = file.read()

            # Find the address of the JPEG marker using .find() method
            jpeg_marker = data.find(b'\xFF\xD8')  # Start of Image (SOI) marker

            # Find the address of the JPEG trailer using .rfind() method - looks from the end of the document.
            jpeg_trailer = data.rfind(b'\xFF\xD9')  # End of Image (EOI) marker

            # If both markers were found, calculate the length between them
            # Note: -1 is returned if .find() doesn't find anything 
            if jpeg_marker != -1 and jpeg_trailer != -1:
                length_between = jpeg_trailer - jpeg_marker + len(b'\xFF\xD9')

                # Return the addresses and length
                return jpeg_marker, jpeg_trailer, length_between
            else:
                # If one or both markers were not found, return None
                return None
    except FileNotFoundError:
        print("File not found.")
        return None


In [3]:
# Using this function on our  example file:

filename = 'test.bin'  # Replace with the path to your binary file
result = find_jpeg_markers(filename)

if result is not None:
    marker_address, trailer_address, length_between = result
    print(f"jpeg Marker Address: {marker_address}")
    print(f"jpeg Trailer Address: {trailer_address}")
    print(f"Length between markers and trailer: {length_between} bytes")

jpeg Marker Address: 0
jpeg Trailer Address: 59
Length between markers and trailer: 61 bytes


In [11]:
# Define the JPEG marker and trailer
jpeg_marker = b'\xFF\xD8'
jpeg_trailer = b'\xFF\xD9'

# Create a binary file and write the JPEG marker and trailer to it
with open('test_jpeg.bin', 'wb') as file:
    # Add the first JPEG file
    file.write(jpeg_marker)
    # Add some data between the marker and trailer (optional)
    file.write(b'Hello, this is some data between the marker and trailer for the first JPEG file.')
    file.write(jpeg_trailer)

    # Add another JPEG file
    file.write(jpeg_marker)
    # Add some data between the marker and trailer (optional)
    file.write(b'Hello, this is some data between the marker and trailer for the second JPEG file.')
    file.write(jpeg_trailer)

print("Test_jpeg binary file 'test_jpeg.bin' has been created with multiple JPEG files.")


Test_jpeg binary file 'test_jpeg.bin' has been created with multiple JPEG files.


In [13]:
import sys

def find_jpeg_markers(filename):
    try:
        with open(filename, 'rb') as file:
            data = file.read()

            jpeg_marker = b'\xFF\xD8'  # Start of Image (SOI) marker
            jpeg_trailer = b'\xFF\xD9'  # End of Image (EOI) marker

            marker_positions = [i for i in range(len(data)) if data[i:i+2] == jpeg_marker]
            trailer_positions = [i for i in range(len(data)) if data[i:i+2] == jpeg_trailer]

            if marker_positions and trailer_positions:
                print("JPEG markers found at positions:", marker_positions)
                print("JPEG trailers found at positions:", trailer_positions)
                return
            else:
                print("No JPEG markers or trailers found in the file.")
    except FileNotFoundError:
        print("File not found.")

if len(sys.argv) == 2:
    find_jpeg_markers(sys.argv[1])
else:
    print("Usage: python find_jpeg_markers.py your_binary_file.bin")


Usage: python find_jpeg_markers.py your_binary_file.bin


In [15]:
# Using this function on our our second example file:
# Unfortunately it only finds the very first and very last addresses, not the individual files! We ahve to take a different approach:

filename = 'dstl_MOOC_Challenge_v1.bin'  # Replace with the path to your binary file
result = find_jpeg_markers(filename)

if result is not None:
    marker_address, trailer_address, length_between = result
    print(f"PNG Marker Address: {marker_address}")
    print(f"PNG Trailer Address: {trailer_address}")
    print(f"Length between markers and trailer: {length_between} bytes")

JPEG markers found at positions: [16, 167428, 335534, 503452, 671375, 839300, 1006519, 1173398, 1340910, 1508005, 1674711, 1841695, 2008846, 2177017, 2344790, 2513338, 2681382, 2849835, 3018046, 3185284, 3351983, 3518322, 3684529, 3850764, 4016468, 4181915, 4349347, 4517498, 4685650, 4854220, 5021788, 5189509, 5356509, 5523480, 5690686, 5858015, 6025540, 6193201, 6361005, 6530783, 6700309, 6869482, 7039115, 7208799, 7378178, 7546763, 7713895, 7880806, 8047359, 8214021, 8381180, 8548637, 8715974, 8883313, 9050589, 9217793, 9385973, 9553373, 9720519, 9887819, 10054331, 10219811, 10385382, 10551122, 10717046, 10883159, 11049725, 11216442, 11383298, 11550586, 11717727, 11884937, 12051803, 12218402, 12384248, 12550251, 12716652, 12883807, 13051770, 13220598, 13388994, 13556841, 13724691, 13892553, 14060068, 14223901, 14386495, 14548865, 14711329]
JPEG trailers found at positions: [167187, 335293, 503211, 671134, 839059, 1006278, 1173157, 1340669, 1507764, 1674470, 1841454, 2008605, 2176776,

In [None]:
# Using this function on our our second example file:
# Unfortunately it only finds the very first and very last addresses, not the individual files! We ahve to take a different approach:

filename = 'test_2.bin'  # Replace with the path to your binary file
result = find_png_markers(filename)

if result is not None:
    marker_address, trailer_address, length_between = result
    print(f"PNG Marker Address: {marker_address}")
    print(f"PNG Trailer Address: {trailer_address}")
    print(f"Length between markers and trailer: {length_between} bytes")

In [16]:
# Create some example binary to demonstrate how to find markers and trailers

# Define the PNG marker and trailer
png_marker = b'\xFF\xAA\xFF\xAA\xFF\xAA\xFF'
png_trailer = b'\xFF\xBB\xFF\xBB\xFF\xBB'

# Create a binary file and write the PNG marker and trailer to it
with open('practice', 'wb') as file:
    file.write(png_marker)
    # Add some data between the marker and trailer (optional)
    file.write(b'Hello, this is some data between the marker and trailer.')
    file.write(png_trailer)

print("Test PNG file 'test.bin' has been created.")

Test PNG file 'test.bin' has been created.


In [23]:
def more(filename):
    try:
        # Open the binary file in read mode
        with open(filename, 'rb') as file:
            data = file.read()

            # Find the address of the JPEG marker using .find() method
            jpeg_marker = data.find(b'\xFF\xAA\xFF\xAA\xFF\xAA\xFF')  # Start of Image (SOI) marker

            # Find the address of the JPEG trailer using .rfind() method - looks from the end of the document.
            jpeg_trailer = data.rfind(b'\xFF\xBB\xFF\xBB\xFF\xBB')  # End of Image (EOI) marker

            # If both markers were found, calculate the length between them
            # Note: -1 is returned if .find() doesn't find anything 
            if jpeg_marker != -1 and jpeg_trailer != -1:
                length_between = jpeg_trailer - jpeg_marker + len(b'\xFF\xBB\xFF\xBB\xFF\xBB')

                # Return the addresses and length
                return jpeg_marker, jpeg_trailer, length_between
            else:
                # If one or both markers were not found, return None
                return None
    except FileNotFoundError:
        print("File not found.")
        return None


In [24]:
import sys

def find_jpeg(filename):
    try:
        with open(filename, 'rb') as file:
            data = file.read()

            jpeg_marker = b'\xFF\xAA\xFF\xAA\xFF\xAA\xFF'  # Start of Image (SOI) marker
            jpeg_trailer = b'\xFF\xBB\xFF\xBB\xFF\xBB'  # End of Image (EOI) marker

            marker_positions = [i for i in range(len(data)) if data[i:i+2] == jpeg_marker]
            trailer_positions = [i for i in range(len(data)) if data[i:i+2] == jpeg_trailer]

            if marker_positions and trailer_positions:
                print("JPEG markers found at positions:", marker_positions)
                print("JPEG trailers found at positions:", trailer_positions)
                return
            else:
                print("No JPEG markers or trailers found in the file.")
    except FileNotFoundError:
        print("File not found.")

if len(sys.argv) == 2:
    find_jpeg(sys.argv[1])
else:
    print("Usage: python find_jpeg_markers.py your_binary_file.bin")


Usage: python find_jpeg_markers.py your_binary_file.bin


In [25]:
# Using this function on our our second example file:
# Unfortunately it only finds the very first and very last addresses, not the individual files! We ahve to take a different approach:

filename = 'non_jpeg_challenge.bin'  # Replace with the path to your binary file
result = find_jpeg(filename)

if result is not None:
    marker_address, trailer_address, length_between = result
    print(f"PNG Marker Address: {marker_address}")
    print(f"PNG Trailer Address: {trailer_address}")
    print(f"Length between markers and trailer: {length_between} bytes")

No JPEG markers or trailers found in the file.


In [30]:
# Function to split a binary file containing multiple JPEGs into individual JPEG files with custom names
def split_jpegs(filename, output_prefix="output"):
    # Find JPEG markers and trailers
    result = find_jpeg_markers(filename)

    if result is not None:
        markers, trailers, lengths_between = result

        # Open the binary file
        with open(filename, 'rb') as file:
            for i in range(len(markers)):
                # Seek to the position after the start marker - remembering to include the length of the start marker!
                file.seek(markers[i] + len(b'\xFF\xD8'))

                # Read and extract the JPEG data between the markers
                jpeg_data = file.read(lengths_between[i] - len(b'\xFF\xD8'))

                # Create a separate JPEG file with a custom name
                output_filename = f'{output_prefix}_{i+1}.jpeg'  # Use .jpeg extension for JPEG files
                with open(output_filename, 'wb') as jpeg_file:
                    jpeg_file.write(b'\xFF\xD8')  # Write the JPEG marker to begin
                    jpeg_file.write(jpeg_data)    # Write the extracted data
                    jpeg_file.write(b'\xFF\xD9')  # Write the JPEG trailer

                print(f"Extracted JPEG saved as '{output_filename}'")

        print(f"{len(markers)} JPEG files have been extracted.")
    else:
        print("No JPEG markers found in the file.")



In [32]:
split_jpegs('dstl_MOOC_Challenge_v1.bin')

JPEG markers found at positions: [16, 167428, 335534, 503452, 671375, 839300, 1006519, 1173398, 1340910, 1508005, 1674711, 1841695, 2008846, 2177017, 2344790, 2513338, 2681382, 2849835, 3018046, 3185284, 3351983, 3518322, 3684529, 3850764, 4016468, 4181915, 4349347, 4517498, 4685650, 4854220, 5021788, 5189509, 5356509, 5523480, 5690686, 5858015, 6025540, 6193201, 6361005, 6530783, 6700309, 6869482, 7039115, 7208799, 7378178, 7546763, 7713895, 7880806, 8047359, 8214021, 8381180, 8548637, 8715974, 8883313, 9050589, 9217793, 9385973, 9553373, 9720519, 9887819, 10054331, 10219811, 10385382, 10551122, 10717046, 10883159, 11049725, 11216442, 11383298, 11550586, 11717727, 11884937, 12051803, 12218402, 12384248, 12550251, 12716652, 12883807, 13051770, 13220598, 13388994, 13556841, 13724691, 13892553, 14060068, 14223901, 14386495, 14548865, 14711329]
JPEG trailers found at positions: [167187, 335293, 503211, 671134, 839059, 1006278, 1173157, 1340669, 1507764, 1674470, 1841454, 2008605, 2176776,

In [34]:
# Initialize a list to store the user data
users = []

# Open the binary file and read the data
with open('non_jpeg_challenge.bin', 'rb') as file:
    while True:
        # Read data from the file
        data = file.read(4)  # Assuming data is stored in 4-byte chunks, adjust as needed

        # Break the loop if there's no more data to read
        if not data:
            break

        # Process the data (you need to know the data format in the binary file)
        # Example: assuming the data is an integer
        user_id = int.from_bytes(data, byteorder='little')

        # Append the user data to the list
        users.append(user_id)

# Now, the 'users' list contains the data read from the binary file
# You can iterate through 'users' to process the data further
for user in users:
    print(user)





2868882175
2868882175
2868882175
2868882175
3154099199
3154099199
1112301044
3219583135
210
3989
1094839173
0
858927154
758133037
824195120
942815801
3224378
4344832
2235604992
4321914
12550144
3510779904
675377911
4345091
4344832
1849782272
2604741027
12572814
12550144
1833051136
4322646
4454912
1112314491
1112276992
1090519040
1108073670
3220040313
3212836864
1112276992
1106973714
1161527296
1283873280
1275068482
3758096450
1464304192
3658575682
2147483839
671088831
2444885058
2046820417
462881093
16973
16972
4267524480
4098441747
49132
49024
3504685644
16522
4278207304
4289396650
4289396650
4289396650
4289396650
4290510779
4005298107
4181871710
3535791845
2499805184
2583691279
4279705
843022336
758329904
808267825
959520824
976761402
12851
16972
3987816608
16902
49024
1075200576
52969824
16973
16972
2741911920
2392539441
49112
49024
1450000932
16885
2063614970
4344978
4344832
3326148608
2034371548
12578282
12550144
306334720
4324116
4537216
1112311386
1112276992
1088421888
111301619

In [40]:
# Initialize a list to store the user data
users = []

# Open the binary file and read the data
with open('non_jpeg_challenge.bin', 'rb') as file:
    while True:
        # Read data from the file
        data = file.read(65)  # Assuming data is stored in 65-byte chunks (adjust as needed)

        # Break the loop if there's no more data to read
        if not data:
            break

        # Process the data based on the specified pattern
        # Here, we assume the data pattern you provided:
        user_data = [
            # Process the data based on the specified pattern

    data[0:19],  # 19-byte string (leave as bytes)
    data[19:29].decode('utf-8', 'replace').ljust(10),  # 10-byte string (decode as UTF-8 and pad)
    data[29:39].decode('utf-8', 'replace').ljust(10),  # 10-byte string (decode as UTF-8 and pad)
    int.from_bytes(data[39:42], byteorder='little'),  # 3-byte int
    data[42:47].decode('utf-8', 'replace').ljust(5),  # 5-byte string (decode as UTF-8 and pad)
    int.from_bytes(data[47:48], byteorder='little'),  # 1-byte int
    int.from_bytes(data[48:52], byteorder='little'),  # 4-byte int
    data[52:64].decode('utf-8', 'replace').ljust(12),  # 12-byte string (decode as UTF-8 and pad)
        ]

        # Append the user data to the list
        users.append(user_data)

# Now, the 'users' list contains the processed user data
# You can iterate through 'users' to further process or save the data as needed

for user in users:
    print(user)



[b'\xff\xaa\xff\xaa\xff\xaa\xff\xaa\xff\xaa\xff\xaa\xff\xaa\xff\xaa\xff\xbb\xff', '������]LB�', '���\x00\x00\x00�\x0f\x00 ', 15435008, 'AA\x00\x00\x00', 0, 858927154, '-10-08 19:28']
[b'31\x00\x00LB\x00\x00\xa0@\x85z\xf2A\x00\x00\x80\xbf\x00', '\x00@B��rA(\x03M', 'B\x00\x00LB\x00\x00pAn', 4272547, '��ؿ\x00 ', 0, 49024, '$BmV�A\x00\x00�C\x00{']
[b'LB\x00\x00LB\x00\x00\x00A\xc6\xdc\x0bBy\xea\xed\xbf\x00', '\x00��\x00\x00LB\x12\x14�', 'A\x00�;E\x00Z�LB', 4980736, 'B\x00\x00�@', 130, 1866618695, '\x11ڿ\x00\x00��\x00\x00(B� ']
[b'\x91A\x00\x00zE\x01\x97\x1bMB\x00\x00LB\x00\x00\x80A', ']�\x13BI��\x00\x00 ', '��\x00\x00LB�Њ@ ', 4718592, 'C\x00���', 170, 2868882175, '������������']
[b'\xbb\xff\xbb\xee^LB\xf9\xe5\xe6\xbf\xd2\x00\x00\x00\x95\x0f\x00\x00', '��MA\x00\x00�?20', '23-10-08 1', 3291705, '8:32\x00', 0, 16972, '�@��\x06B\x00\x00��\x00\x00']
[b'B\x16@`A(\x03MB\x00\x00LB\x00\x00pAn\xa3', '1A��ؿ\x00\x00�� ', '\x00\x00$BmV�A\x00\x00', 17402, '{�LB\x00', 0, 16972, '\x00A��\x0bBy���\x00\x00']

In [44]:
import struct
# Pack the data and write to a binary file
# '15s20s20s10si?' is our data structure - 15 characters (15s), 20 characters (20s), 
# 20 characters(20s), 10 characters(10s), integer (i), boolean (?)
# Notice how we have standardised the length of all the string data using ljust 

with open("non_jpeg_challenge", "wb") as file:
    for user in users:
        packed_data = struct.pack(
            '19s10s10s3s5s1i4s22s?',
            user[0],  # 19-byte string
            user[1],  # 10-byte string
            user[2],  # 10-byte string
            user[3].to_bytes(3, byteorder='little'),  # 3-byte int
            user[4],  # 5-byte string
            user[5].to_bytes(1, byteorder='little'),  # 1-byte int
            user[6].to_bytes(4, byteorder='little'),  # 4-byte int
            user[7].ljust(22),  # 12-byte string
        )
        file.write(packed_data)

error: pack expected 29 items for packing (got 8)

In [45]:
import struct

# Define the format string based on your pattern
format_string = '19s10s10s3s5s1s4s12s12s'
data_block_size = struct.calcsize(format_string)

# Open the binary file for reading
with open('non_jpeg_challenge.bin', 'rb') as file:
    # Loop through the data blocks
    for block_number in range(1, 89):  # Assuming you have 88 data blocks
        data = file.read(data_block_size)

        if len(data) < data_block_size:
            # Handle the case where you have fewer data blocks than expected
            break

        # Unpack the data based on the format string
        unpacked_data = struct.unpack(format_string, data)

        # Create a separate binary file for each data block
        output_filename = f'output_data_block_{block_number}.bin'

        with open(output_filename, 'wb') as output_file:
            # Pack the data and write to the output file
            packed_data = struct.pack(format_string, *unpacked_data)
            output_file.write(packed_data)

        print(f"Data from block {block_number} saved to '{output_filename}'")


Data from block 1 saved to 'output_data_block_1.bin'
Data from block 2 saved to 'output_data_block_2.bin'
Data from block 3 saved to 'output_data_block_3.bin'
Data from block 4 saved to 'output_data_block_4.bin'
Data from block 5 saved to 'output_data_block_5.bin'
Data from block 6 saved to 'output_data_block_6.bin'
Data from block 7 saved to 'output_data_block_7.bin'
Data from block 8 saved to 'output_data_block_8.bin'
Data from block 9 saved to 'output_data_block_9.bin'
Data from block 10 saved to 'output_data_block_10.bin'
Data from block 11 saved to 'output_data_block_11.bin'
Data from block 12 saved to 'output_data_block_12.bin'
Data from block 13 saved to 'output_data_block_13.bin'
Data from block 14 saved to 'output_data_block_14.bin'
Data from block 15 saved to 'output_data_block_15.bin'
Data from block 16 saved to 'output_data_block_16.bin'
Data from block 17 saved to 'output_data_block_17.bin'
Data from block 18 saved to 'output_data_block_18.bin'
Data from block 19 saved to 

In [51]:
import struct

# Define the format string based on your pattern
format_string = '19s10s10s3s5s1s4s12s12s'
data_block_size = struct.calcsize(format_string)

# Open the binary data file
with open('non_jpeg_challenge.bin', 'rb') as data_file:
    data = data_file.read()

# Calculate the number of data blocks
num_blocks = len(data) // data_block_size

# Unpack and save each data block
for block_number in range(num_blocks):
    # Unpack the data based on the format string for the current block
    offset = block_number * data_block_size
    block_data = data[offset:offset + data_block_size]
    unpacked_data = struct.unpack(format_string, block_data)

    # Create a separate text file for each data block
    output_filename = f'output_data_block_{block_number}.txt'

    with open(output_filename, 'w') as output_file:
        # Write each piece of data with visible separation
        for item in unpacked_data:
            if isinstance(item, bytes):
                try:
                    decoded_item = item.decode("utf-8")
                    output_file.write(f'{decoded_item}\n')
                except UnicodeDecodeError:
                    # Handle non-decodable binary data
                    output_file.write(f'Non-decodable binary data: {item.hex()}\n')
            else:
                output_file.write(f'{item}\n')

    print(f"Data from block {block_number} saved to '{output_filename}'")



Data from block 0 saved to 'output_data_block_0.txt'
Data from block 1 saved to 'output_data_block_1.txt'
Data from block 2 saved to 'output_data_block_2.txt'
Data from block 3 saved to 'output_data_block_3.txt'
Data from block 4 saved to 'output_data_block_4.txt'
Data from block 5 saved to 'output_data_block_5.txt'
Data from block 6 saved to 'output_data_block_6.txt'
Data from block 7 saved to 'output_data_block_7.txt'
Data from block 8 saved to 'output_data_block_8.txt'
Data from block 9 saved to 'output_data_block_9.txt'
Data from block 10 saved to 'output_data_block_10.txt'
Data from block 11 saved to 'output_data_block_11.txt'
Data from block 12 saved to 'output_data_block_12.txt'
Data from block 13 saved to 'output_data_block_13.txt'
Data from block 14 saved to 'output_data_block_14.txt'
Data from block 15 saved to 'output_data_block_15.txt'
Data from block 16 saved to 'output_data_block_16.txt'
Data from block 17 saved to 'output_data_block_17.txt'
Data from block 18 saved to 'o

UnicodeEncodeError: 'charmap' codec can't encode character '\u06bf' in position 1: character maps to <undefined>

In [52]:
import struct

# Define the format string based on your pattern
format_string = '19s10s10s3s5s1s4s12s12s'
data_block_size = struct.calcsize(format_string)

# Open the binary data file
with open('non_jpeg_challenge.bin', 'rb') as data_file:
    data = data_file.read()

# Calculate the number of data blocks based on the file size and block size
num_blocks = len(data) // data_block_size

# Unpack and save each data block
for block_number in range(num_blocks):
    # Unpack the data based on the format string for the current block
    offset = block_number * data_block_size
    block_data = data[offset:offset + data_block_size]
    unpacked_data = struct.unpack(format_string, block_data)

    # Create a separate text file for each data block
    output_filename = f'output_data_block_{block_number}.txt'

    with open(output_filename, 'w') as output_file:
        # Write each piece of data with visible separation
        for item in unpacked_data:
            if isinstance(item, bytes):
                try:
                    decoded_item = item.decode("utf-8")
                    output_file.write(f'{decoded_item}\n')
                except UnicodeDecodeError:
                    # Handle non-decodable binary data
                    output_file.write(f'Non-decodable binary data: {item.hex()}\n')
            else:
                output_file.write(f'{item}\n')

    print(f"Data from block {block_number} saved to '{output_filename}'")


Data from block 0 saved to 'output_data_block_0.txt'
Data from block 1 saved to 'output_data_block_1.txt'
Data from block 2 saved to 'output_data_block_2.txt'
Data from block 3 saved to 'output_data_block_3.txt'
Data from block 4 saved to 'output_data_block_4.txt'
Data from block 5 saved to 'output_data_block_5.txt'
Data from block 6 saved to 'output_data_block_6.txt'
Data from block 7 saved to 'output_data_block_7.txt'
Data from block 8 saved to 'output_data_block_8.txt'
Data from block 9 saved to 'output_data_block_9.txt'
Data from block 10 saved to 'output_data_block_10.txt'
Data from block 11 saved to 'output_data_block_11.txt'
Data from block 12 saved to 'output_data_block_12.txt'
Data from block 13 saved to 'output_data_block_13.txt'
Data from block 14 saved to 'output_data_block_14.txt'
Data from block 15 saved to 'output_data_block_15.txt'
Data from block 16 saved to 'output_data_block_16.txt'
Data from block 17 saved to 'output_data_block_17.txt'
Data from block 18 saved to 'o

UnicodeEncodeError: 'charmap' codec can't encode character '\u06bf' in position 1: character maps to <undefined>