### Importing required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Load Data and Feature Engineering:
Load the extracted data from the CSV file.

In [2]:
# Load the extracted data from CSV
df = pd.read_csv("cleaned.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,List A,List B
0,0,"[2929, 1727, 3056, 1801]","[714, 3826, 784, 4033]"
1,1,"[1970, 2461, 2028, 2654]","[5690, 2156, 5801, 2247]"
2,2,"[4026, 3674, 4138, 3781]","[1281, 1999, 1385, 2186]"
3,3,"[4027, 3083, 4106, 3211]","[1923, 2858, 2011, 2959]"
4,4,"[5693, 2332, 5777, 2435]","5693, 3398, 5778, 3511]"


In [4]:
df.columns

Index(['Unnamed: 0', 'List A', 'List B'], dtype='object')

In [5]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [6]:
df

Unnamed: 0,List A,List B
0,"[2929, 1727, 3056, 1801]","[714, 3826, 784, 4033]"
1,"[1970, 2461, 2028, 2654]","[5690, 2156, 5801, 2247]"
2,"[4026, 3674, 4138, 3781]","[1281, 1999, 1385, 2186]"
3,"[4027, 3083, 4106, 3211]","[1923, 2858, 2011, 2959]"
4,"[5693, 2332, 5777, 2435]","5693, 3398, 5778, 3511]"
...,...,...
2523,"[4134, 3279, 4356, 3408]","[3376, 1859, 3594, 1939]"
2524,"[3389, 1179, 3599, 1249]","[3382, 1574, 3594, 1640]"
2525,"[2384, 1571, 2620, 1640]","[1967, 1443, 2036, 1697]"
2526,"[4139, 3498, 4343, 3576]","[5802, 2513, 6034, 2642]"


In [7]:
# Function to add missing brackets to a list
def add_missing_brackets(lst):
    if lst.startswith('[') and not lst.endswith(']'):
        lst += ']'
    elif not lst.startswith('[') and lst.endswith(']'):
        lst = '[' + lst
    elif not lst.startswith('[') and not lst.endswith(']'):
        lst = '[' + lst + ']'
    return lst

# Apply the function to each value in the DataFrame

df['List A'] = df['List A'].apply(add_missing_brackets)
df['List B'] = df['List B'].apply(add_missing_brackets)

In [8]:
df

Unnamed: 0,List A,List B
0,"[2929, 1727, 3056, 1801]","[714, 3826, 784, 4033]"
1,"[1970, 2461, 2028, 2654]","[5690, 2156, 5801, 2247]"
2,"[4026, 3674, 4138, 3781]","[1281, 1999, 1385, 2186]"
3,"[4027, 3083, 4106, 3211]","[1923, 2858, 2011, 2959]"
4,"[5693, 2332, 5777, 2435]","[5693, 3398, 5778, 3511]"
...,...,...
2523,"[4134, 3279, 4356, 3408]","[3376, 1859, 3594, 1939]"
2524,"[3389, 1179, 3599, 1249]","[3382, 1574, 3594, 1640]"
2525,"[2384, 1571, 2620, 1640]","[1967, 1443, 2036, 1697]"
2526,"[4139, 3498, 4343, 3576]","[5802, 2513, 6034, 2642]"


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2528 entries, 0 to 2527
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   List A  2528 non-null   object
 1   List B  2528 non-null   object
dtypes: object(2)
memory usage: 39.6+ KB


In [10]:

import re


# Define a function to clean the data
def clean_data(s):
    return re.sub('[^0-9,\[\]\,]', '', s)

# Apply the function to the columns 'List A' and 'List B'
df['List A'] = df['List A'].apply(clean_data)
df['List B'] = df['List B'].apply(clean_data)

# # Save the cleaned DataFrame to a new CSV file
# df.to_csv('cleaned_file.csv', index=False)

In [15]:
df.to_csv('cleaned_file2.csv', index=False)

In [None]:
# df['List A'] = df['List A'].apply(lambda x: str(x).replace('(', '').replace(')', ''))
# df['List B'] = df['List B'].apply(lambda x: str(x).replace('(', '').replace(')', ''))

In [None]:
# df['List A'] = df['List A'].apply(lambda x: str(x).replace('{', '').replace('}', ''))
# df['List B'] = df['List B'].apply(lambda x: str(x).replace('{', '').replace('}', ''))

In [None]:
# df['List A'] = df['List A'].apply(lambda x: str(x).replace('|', ''))
# df['List B'] = df['List B'].apply(lambda x: str(x).replace('|', ''))

In [None]:
# df['List A'] = df['List A'].apply(lambda x: str(x).replace('\'', ''))
# df['List B'] = df['List B'].apply(lambda x: str(x).replace('\'', ''))

In [None]:
# df['List A'] = df['List A'].apply(lambda x: str(x).replace(':', ''))
# df['List B'] = df['List B'].apply(lambda x: str(x).replace(':', ''))

In [None]:
# import ast

# def convert_to_list(s):
#     try:
#         return ast.literal_eval(s)
#     except ValueError:
#         return s  # Return the original string if it can't be parsed

# # Assuming df is your DataFrame and 'column' is the name of your column



# df['List A'] = df['List A'].apply(convert_to_list)
# df['List B'] = df['List B'].apply(convert_to_list)

In [13]:
df.isna().sum()

List A    0
List B    0
dtype: int64

In [None]:
# df['List A'] = df['List A'].astype('object')
# df['List B'] = df['List B'].astype('object')

In [14]:
# import cv2

def compute_iou(box1, box2):
    # Compute intersection coordinates
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])

    # Compute area of intersection
    intersection_area = max(0, x2 - x1) * max(0, y2 - y1)

    # Compute area of union
    area_box1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area_box2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union_area = area_box1 + area_box2 - intersection_area

    # Compute IoU
    iou = intersection_area / union_area

    return iou

def merge_related_boxes(list_a, list_b, threshold=0.5):
    merged_boxes = []
    for box_a in list_a:
        for box_b in list_b:
            if len(box_a) == 4 and len(box_b) == 4:  # Check if both boxes have valid coordinates
                if compute_iou(box_a, box_b) > threshold:
                    # Merge the boxes
                    merged_boxes.append([
                        min(box_a[0], box_b[0]),
                        min(box_a[1], box_b[1]),
                        max(box_a[2], box_b[2]),
                        max(box_a[3], box_b[3])
                    ])
    return merged_boxes

list_a = df['List A']
list_b = df['List B'] 
merged_result = merge_related_boxes(list_a, list_b)

print("Merged bounding boxes:", merged_result)


Merged bounding boxes: []


In [None]:

# We don't have a column "merge_decision" (1 for merge, 0 for not merge)
# createing a binary label based on our criteria
# For example, if overlap area > threshold, label as 1 (merge), else label as 0 (not merge)
# df["merge_decision"] = np.random.choice([0, 1], size=len(df))

In [None]:
# import ast

# df['List A'] = df['List A'].apply(ast.literal_eval)
# df['List B'] = df['List B'].apply(ast.literal_eval)

In [None]:
# # Split data into features (X) and target (y)
# X = df[["List A", "List B"]]  # Replace with actual features

# y = df["merge_decision"]  # Binary label (1 for merge, 0 for not merge)

# # Split into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# # Standardize features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_val_scaled = scaler.transform(X_val)


In [None]:
y_train

### Initialize and Train GMM:
Choose the number of components (clusters) for the GMM (e.g., n_components = 2).
Fit the GMM to the scaled training data.

In [None]:
# Initialize and train a Gaussian Mixture Model (GMM)
n_components = 2  # Number of components (clusters)
gmm = GaussianMixture(n_components=n_components, random_state=42)
gmm.fit(X_train)
