In [1]:
# !export FIFTYONE_DEFAULT_APP_ADDRESS=cadmium
# !export FIFTYONE_DATABASE_URI=mongodb://cadmium:27017

# mongodb://[username:password@]host[:port]

In [3]:
import fiftyone as fo
import pandas as pd
import numpy as np
import math 
import fiftyone.brain as fob
from fiftyone import ViewField as F
import fiftyone.zoo as foz
import cv2

In [4]:
#!pip install "notebook>=5.3" "ipywidgets>=7.5"

# Notebook to split the dataset using different approaches 
See https://esmartcontrol.atlassian.net/wiki/spaces/ES/pages/2342420485/YoloV5+Multitasks#Data-splits

In [5]:
dataset = fo.load_dataset("esmart_wip")

In [6]:
fo.launch_app(dataset)

Dataset:          esmart_wip
Media type:       image
Num samples:      6725
Selected samples: 0
Selected labels:  0
Session URL:      http://localhost:5151/

In [78]:
def analyze_sequences_fiftyone():
    """Analyze the different sequences in the 51 dataset 
    Returns a df with a count of frames per sequence.
    """
    filepath_list = []
    seq_list = []
    frame_list = []
    for sample in dataset:
        file_path = sample.filepath
        file_name = sample.filepath.split("/")[-1]
#         print(file_name)
        file_name_splitted = file_name.split("_")
#         print(file_name_splitted)
        sequence_ls = file_name_splitted[:-1]
        sequence = "".join(map(str, sequence_ls))
        frame = file_name_splitted[-1].split(".")[:-1]
        filepath_list.append(file_path)
        seq_list.append(sequence)
        frame_list.append(frame)
#         break
    sequences_tuples = list(zip(filepath_list, seq_list, frame_list))
    sequences_df = pd.DataFrame(sequences_tuples, columns=["Filepath", "Sequence", "Frame"])
#     sequences_df.sort_values(["Sequence", "Frame"], inplace=True)
#     print(sequences_df.groupby(["Sequence"]).count().shape)
# convert the frame id from a list of string to an integer
    sequences_df['Frame'] = sequences_df['Frame'].apply(lambda x: int(x[0]))
    sequences_df_grouped = sequences_df.groupby(["Sequence"]).count()
    return sequences_df

In [79]:
sequences_df = analyze_sequences_fiftyone()

In [80]:
sequences_df

Unnamed: 0,Filepath,Sequence,Frame
0,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11004
1,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11010
2,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11232
3,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11856
4,/home/admin/data/esmart_wip/20210629_173500_12...,20210629173500,12156
...,...,...,...
6720,/home/raphael/esmart/esmart-ai-datasets/data/e...,20210630175841,248
6721,/home/raphael/esmart/esmart-ai-datasets/data/e...,20210630175841,290
6722,/home/selim/Desktop/esmart-ai-datasets/data/es...,Log-20220318-064528 Data Log,248
6723,/home/selim/Desktop/esmart-ai-datasets/data/es...,Log-20220318-064528 Data Log,296


## 1. Split manually by selecting some validation sequences -- OOD

In [None]:
def split_train_val_fiftyone_sequences():
    """
    Split the dataset by sequence number (by manually inspecting the properties of the sequences).
    Allows to not have frames from the same sequence in both the training and validation set.
    This is crucial to check whether the algorithm is performant in "new scenes" and doesn't 
    just learns spurrious correlations. 
    """
    list_sequences_val_set = [
        "Log-20220123-124034 Data Log",  # for some snowy and/or wet
        "Log-20220130-104657 Data Log",  # for some snowy and/or wet
        "Log-20220120-181824 Data Log",  # for some snowy and/or wet
        "20210630191429",  # for some dry
        "20210630180005",  # for some dry
        "20210630192517",  # for some dry
        "20211129193841",  # for some dry
        "20210715091801",  # for some dry
        "20211129205422",  # for some dry
        "20210724133321",  # for some dry
        "20210715175810",  # for some dry
        "20210630195611",  # for some dry
        "20210630191006",  # for some dry
    ]

    counter_train = {}
    counter_train["wet"] = 0
    counter_train["dry"] = 0
    counter_train["snowy"] = 0

    counter_val = {}
    counter_val["wet"] = 0
    counter_val["dry"] = 0
    counter_val["snowy"] = 0

    for sample in dataset:
        try:
            condition = sample.gt_road_condition.classifications[0].label
            seq = str(sample.tags[-1])
            if seq in list_sequences_val_set:
                counter_val[condition] += 1
                sample.tags.append("VAL_ROAD_COND")
                sample.save()
            else:
                condition = sample.gt_road_condition.classifications[0].label
                counter_train[condition] += 1
                sample.tags.append("TRAIN_ROAD_COND")
                sample.save()
        except:
            a = 1
    print("TRAIN ", counter_train)
    print("train total", sum(counter_train.values()))
    print("VAL ", counter_val)
    print("val total", sum(counter_val.values()))

if SPLIT_TRAIN_VAL_IN_51:
    split_train_val_fiftyone_sequences()

## 2. Use Subsequences for splitting -- ID

In [193]:
def check_time_between_2_frames(frame_1, frame_2, num_minutes):
    """
    Function to check whether 2 frames can be used in 2 data sets.
    e.g We want frame_1 in the training set and frame_2 in the validation set. We need to make 
    sure that these are not too close to each other in terms of time otherwise they could be very
    similar and the validation set would be biased. This is not the perfect technique to check the
    independence between frames but that's an approach.
    args: 
        the frame numbers from a same sequence 
        the number of minutes that we want between 2 frames to consider them as "far enough"
            from eaach other (independent enough).
    returns:
        True if we can use these 2 images in 2 sets
        False otherwise
    """
    assert type(frame_1) == type(frame_2) == np.int64
    num_frame_btw = abs(frame_2 - frame_1)
    # approximation
    threshold = num_minutes * 60 * 24
#     print("The chosen time as threshold is", num_minutes,"minutes.")
#     print("There must be at least", threshold,"frames between 2 images to split these in 2 sets.")
    if num_frame_btw >= threshold:
        return True 
    else:
        return False 

In [194]:
# get all the different sequence names
sequences_names = sequences_df['Sequence'].unique()
print("There is", len(sequences_names), "sequences.")

There is 53 sequences.


In [230]:
num_minutes_threshold = 2

empty_list = list(zip([],[],[],[]))
frames_splitted_df = pd.DataFrame(empty_list, columns=["Filepath", "Sequence", "Frame","Set"])

frames_splitted_df

for seq in sequences_names:
    # loc all the rows of a specific sequence
    sequence = sequences_df.loc[sequences_df['Sequence'] == seq].sort_values(['Frame'], 
                                                                             ignore_index=True)
    num_frames = sequence.shape[0]
    print("The sequence", seq, "contains", num_frames, "frames.")
    max_index_training_set = math.floor(num_frames * 0.7)-1  # 70 % of the frames in this data 
    sequence['Set'] = 0
    sequence.loc[:max_index_training_set, 'Set'] = 'train'
    found_first_index_val_set = False
    i = 1
    frame_1 = sequence['Frame'][max_index_training_set]
    print("The maximum index for the train set is", max_index_training_set,". It's the frame", frame_1)
    # print("Frame 1:",frame_1)
    index = max_index_training_set + 1
    while not found_first_index_val_set and index < num_frames:
        frame_2 = sequence['Frame'][index]
        found_first_index_val_set = check_time_between_2_frames(frame_1, frame_2, num_minutes_threshold)
        if found_first_index_val_set:
            min_index_val_set = index
            print("The minimum index for the validation set is", min_index_val_set,". It's the frame", frame_2)
            sequence.loc[min_index_val_set:, 'Set'] = 'val'
            break
        i += 1
        index += i
    if not found_first_index_val_set:
        print("No minimum index found. Try reducing the percentage of data in the training set or the time threshold.")
    frames_to_concat = [frames_splitted_df, sequence]
    frames_splitted_df = pd.concat(frames_to_concat)
    print()

The sequence 20210629173500 contains 103 frames.
The maximum index for the train set is 71 . It's the frame 8442
The minimum index for the validation set is 99 . It's the frame 13326

The sequence 20210629174553 contains 114 frames.
The maximum index for the train set is 78 . It's the frame 17061
The minimum index for the validation set is 99 . It's the frame 24183

The sequence 20210630055611 contains 226 frames.
The maximum index for the train set is 157 . It's the frame 17304
The minimum index for the validation set is 172 . It's the frame 20943

The sequence 20210630083253 contains 265 frames.
The maximum index for the train set is 184 . It's the frame 36222
The minimum index for the validation set is 212 . It's the frame 40755

The sequence 20210630175841 contains 3 frames.
The maximum index for the train set is 1 . It's the frame 570
No minimum index found. Try reducing the percentage of data in the training set or the time threshold.

The sequence 20210630180005 contains 27 fram

In [231]:
assert frames_splitted_df.shape[0] == sequences_df.shape[0]

In [279]:
frames_splitted_df

Unnamed: 0,Filepath,Sequence,Frame,Set
0,/home/admin/data/esmart_wip/20210629_173500_20...,20210629173500,2034,train
1,/home/admin/data/esmart_wip/20210629_173500_20...,20210629173500,2064,train
2,/home/admin/data/esmart_wip/20210629_173500_21...,20210629173500,2100,train
3,/home/admin/data/esmart_wip/20210629_173500_22...,20210629173500,2214,train
4,/home/admin/data/esmart_wip/20210629_173500_22...,20210629173500,2256,train
...,...,...,...,...
69,/home/raphael/esmart/esmart-ai-datasets/data/e...,Log-20220121-143049 Data Log,9656,0
70,/home/raphael/esmart/esmart-ai-datasets/data/e...,Log-20220121-143049 Data Log,9749,0
71,/home/raphael/esmart/esmart-ai-datasets/data/e...,Log-20220121-143049 Data Log,9873,0
72,/home/raphael/esmart/esmart-ai-datasets/data/e...,Log-20220121-143049 Data Log,10214,0


In [266]:
count_split = frames_splitted_df['Set'].value_counts()
df_count = pd.DataFrame(count_split).transpose()
df_count.rename(columns={'train':'Training set', 'val':'Validation set', 0:'Left out'}, inplace=True)

In [275]:
df_count

Unnamed: 0,Training set,Validation set,Left out
Set,4451,1156,785


In [59]:
import plotly.express as px
fig = px.bar(df_count, title='Distribution of the new data split')
fig.show()

ValueError: DataFrame constructor not properly called!

### Save splits in fiftyone

In [323]:
# training/validation road condition automatic based on time 
mapping = {'train':'TRAIN_RC_AUTO_TIME', 'val':'VAL_RC_AUTO_TIME'} 
for sample in dataset:
    data_set = str(list(frames_splitted_df[frames_splitted_df['Filepath'] == sample.filepath]['Set'])[0])
#     print(data_set)
    if data_set != '0':
        sample.tags.append(mapping[data_set])
#         print(sample.tags)
        sample.save()

### Delete splits in fiftyone (if needed sometimes)

In [322]:
to_check = list(mapping.values())
for sample in dataset:
    for item in to_check:
        if item in sample.tags:
            sample.tags.remove(str(item))
    sample.save()

['COCO', 'TRAIN_ROAD_COND', 'TRAINING']
VAL_RC_AUTO_TIME
['COCO', 'TRAIN_ROAD_COND', 'TRAINING']


## 7. Split by location - OOD

In [14]:
dataset = fo.load_dataset("esmart_wip")

session = fo.launch_app(dataset)

# A list of ``[longitude, latitude]`` coordinates
locations = dataset.values("location.point.coordinates")
# locations = list(filter(None, locations))  # remove None locations from the list
# print(len(locations))
# print(locations)

# Scalar `uniqueness` values for each sample
ids_list = dataset.values('id')
ids = np.array(dataset.values('id'))
unique = np.unique(ids)
num_values = len(unique)
# print(num_values)

# # The number of ground truth objects in each sample
num_objects = dataset.values(F("ground_truth.detections").length())
# print(len(num_objects))

# assert len(locations) == len(num_objects) == num_values


data = list(zip(locations, ids_list, num_objects))
data_df = pd.DataFrame(data, columns=["location", "id", "num_object"])
data_df.dropna(inplace=True)
locations = list(data_df['location'])
ids = list(data_df['id'])
num_objects = list(data_df['num_object'])


# # Create scatterplot
plot = fo.location_scatterplot(
    locations=locations,
    labels=ids,      # color points by their `uniqueness` values
    sizes=num_objects,      # scale point sizes by number of objects
    labels_title="uniqueness",
    sizes_title="objects",
)
plot.show(height=720)
#session.view()
#session.plots.attach(plot)


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds







FigureWidget({
    'data': [{'hovertemplate': ('<b>uniqueness: %{text}</b><br>' ... 'lon: %{lon:.5f}<extra></e…

ValueError: Plots must be subclasses of <class 'fiftyone.core.plots.base.ResponsivePlot'>; but found <class 'fiftyone.core.plots.plotly.PlotlyNotebookPlot'>. You may be working in an environment that does not support interactivity.

See https://voxel51.com/docs/fiftyone/user_guide/plots.html#overview for more information

In [5]:
#!jupyter nbextension enable --py widgetsnbextension --sys-prefix

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [None]:
# WIP

## 3. Shuffle the images from all sequences and split by images (randomly) + apply similarity threshold for include/exclude in the val set — ID

### Compute the embeddings of the train set 

In [None]:
train_set = dataset.match_tags(".....")
val_set = dataset.match_tags(".....")

In [17]:
# The BDD dataset must be manually downloaded. See the zoo docs for details
#source_dir = "/path/to/dir-with-bdd100k-files"

# Load dataset
#dataset = foz.load_zoo_dataset(
#    "bdd100k", split="validation", source_dir=source_dir,
#)

# Compute embeddings
# You will likely want to run this on a machine with GPU, as this requires
# running inference on 10,000 images
model = foz.load_zoo_model("mobilenet-v2-imagenet-torch")
embeddings = train_set.compute_embeddings(model)

# Compute visualization
results = fob.compute_visualization(train_set, embeddings=embeddings, seed=51)

# Launch App instance
session = fo.launch_app(dataset)

Downloading model from 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth'...
 100% |████|  108.4Mb/108.4Mb [5.2s elapsed, 0s remaining, 47.9Mb/s]      



The parameter 'pretrained' is deprecated since 0.13 and will be removed in 0.15, please use 'weights' instead.


Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=MobileNet_V2_Weights.IMAGENET1K_V1`. You can also use `weights=MobileNet_V2_Weights.DEFAULT` to get the most up-to-date weights.

Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /Users/selimgilon/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth


  0%|          | 0.00/13.6M [00:00<?, ?B/s]

In [None]:
# Generate scatterplot
plot = results.visualize(
    labels="timeofday.label",
    labels_title="time of day",
    axis_equal=True,
)
plot.show(height=512)

# Connect to session
session.plots.attach(plot)

In [None]:
# Index images by similarity
fob.compute_similarity(train_set, brain_key="image_sim")

# Launch App
session = fo.launch_app(train_set)

# In the App... select some image(s) and use the similarity menu to sort!

In [None]:
# Choose a random image from the dataset
query_id = dataset.take(1).first().id

# Programmatically construct a view containing the 5 most similar images
view = dataset.sort_by_similarity(query_id, k=5, brain_key="image_sim")

# View results in App
session.view = view

In [19]:
#######

In [15]:
# Construct a `num_samples x num_pixels` array of images
images = np.array([
    cv2.imread(f, cv2.IMREAD_UNCHANGED).ravel()
    for f in dataset.values("filepath")
])

# Compute 2D embeddings
results = fob.compute_visualization(dataset, embeddings=images, seed=51)

# Visualize embeddings, colored by ground truth label
plot = results.visualize(labels="ground_truth.label")
plot.show(height=720)

[ WARN:0@935.948] global /Users/xperience/actions-runner/_work/opencv-python/opencv-python/opencv/modules/imgcodecs/src/loadsave.cpp (239) findDecoder imread_('/home/admin/data/esmart_wip/20210629_173500_11004.jpg'): can't open/read file: check file path/integrity


AttributeError: 'NoneType' object has no attribute 'ravel'

### Find possible annotation mistakes 

In [None]:
dataset = fo.load_dataset(...)

# Get samples for which we added predictions
h_view = dataset.match_tags("....")

# Compute mistakenness
fob.compute_mistakenness(h_view, model_name, label_field="ground_truth", use_logits=True)

# Sort by likelihood of mistake (most likely first)
mistake_view = (dataset
    .match_tags("processed")
    .sort_by("mistakenness", reverse=True)
)

# Print some information about the view
print(mistake_view)

# Inspect the first few samples
print(mistake_view.head())

# Show the samples we processed in rank order by the mistakenness
session.view = mistake_view

session.freeze() # screenshot the active App for sharing

### Compute hardness of samples

In [None]:
dataset = fo.load_dataset(...)

fob.compute_hardness(dataset, "predictions")

In [22]:
# WIP

## 5. Shuffle the images from all sequences and split by images (randomly) — fully – ID

In [81]:
#sequences_df.to_csv('sequences.csv', header=None)
random_set_df = sequences_df
random_set_df.head()

Unnamed: 0,Filepath,Sequence,Frame
0,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11004
1,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11010
2,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11232
3,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11856
4,/home/admin/data/esmart_wip/20210629_173500_12...,20210629173500,12156


In [82]:
rand_val_set = random_set_df.sample(frac=0.2, replace=False)  # random frames in validation set using a %
rand_val_set['Set'] = 'val'

In [83]:
rand_train_set = random_set_df.loc[~random_set_df.index.isin(rand_val_set.index)]
rand_train_set['Set'] = 'train'



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [84]:
print(rand_val_set.shape)
rand_val_set.head()

(1345, 4)


Unnamed: 0,Filepath,Sequence,Frame,Set
2521,/home/admin/data/esmart_wip/20210724_141043_18...,20210724141043,18864,val
4933,/home/admin/data/esmart_wip/run_9_8026.jpg,run9,8026,val
1464,/home/admin/data/esmart_wip/20210711_110911_43...,20210711110911,43656,val
1885,/home/admin/data/esmart_wip/20210715_085351_16...,20210715085351,16356,val
61,/home/admin/data/esmart_wip/20210629_173500_70...,20210629173500,7098,val


In [85]:
print(rand_train_set.shape)
rand_train_set.head()

(5380, 4)


Unnamed: 0,Filepath,Sequence,Frame,Set
0,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11004,train
1,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11010,train
3,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11856,train
5,/home/admin/data/esmart_wip/20210629_173500_12...,20210629173500,12480,train
6,/home/admin/data/esmart_wip/20210629_173500_13...,20210629173500,13140,train


In [86]:
frames_to_concat = [rand_train_set, rand_val_set]
random_split = pd.concat(frames_to_concat)
random_split

Unnamed: 0,Filepath,Sequence,Frame,Set
0,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11004,train
1,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11010,train
3,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11856,train
5,/home/admin/data/esmart_wip/20210629_173500_12...,20210629173500,12480,train
6,/home/admin/data/esmart_wip/20210629_173500_13...,20210629173500,13140,train
...,...,...,...,...
2108,/home/admin/data/esmart_wip/20210715_175810_18...,20210715175810,180,val
2480,/home/admin/data/esmart_wip/20210724_141043_11...,20210724141043,11790,val
62,/home/admin/data/esmart_wip/20210629_173500_74...,20210629173500,7446,val
2191,/home/admin/data/esmart_wip/20210724_113301_18...,20210724113301,18102,val


In [87]:
count_split = random_split['Set'].value_counts()
df_count = pd.DataFrame(count_split).transpose()
df_count

Unnamed: 0,train,val
Set,5380,1345


In [88]:
fig = px.bar(df_count, title='Distribution of the new data split')
fig.show()

## 6. Split Automatically (computer) by randomly selecting some validation sequences -- OOD 

In [89]:
sequences_df.head()

Unnamed: 0,Filepath,Sequence,Frame
0,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11004
1,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11010
2,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11232
3,/home/admin/data/esmart_wip/20210629_173500_11...,20210629173500,11856
4,/home/admin/data/esmart_wip/20210629_173500_12...,20210629173500,12156


In [90]:
rand_seq_split_df = sequences_df

In [117]:
seq_df = rand_seq_split_df.groupby('Sequence').count()
seq_df.head()

Unnamed: 0_level_0,Filepath,Frame
Sequence,Unnamed: 1_level_1,Unnamed: 2_level_1
20210629173500,103,103
20210629174553,114,114
20210630055611,226,226
20210630083253,265,265
20210630175841,7,7


In [113]:
val_seq_df = seq_df.sample(frac=0.2, replace=False)  
val_seq_df['Set'] = 'val'
print("There are", val_seq_df.shape[0], "sequences and",val_seq_df['Filepath'].sum(), "frames in the validation set.")

There are 12 sequences and  1870 frames in the validation set.


In [114]:
train_seq_df = seq_df.loc[~seq_df.index.isin(val_seq_df.index)]
train_seq_df['Set'] = 'train'
print("There are", train_seq_df.shape[0], "sequences and", train_seq_df['Filepath'].sum(), "frames in the training set.")

There are 47 sequences and 4855 frames in the training set.




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [116]:
frames_to_concat = [train_seq_df, val_seq_df]
random_split_seq = pd.concat(frames_to_concat)
random_split_seq.head()

Unnamed: 0_level_0,Filepath,Frame,Set
Sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20210629173500,103,103,train
20210629174553,114,114,train
20210630055611,226,226,train
20210630083253,265,265,train
20210630175841,7,7,train


In [118]:
# TODO: check the classes distribution in these random splits

### Save splits in Fiftyone

In [None]:
# training/validation road condition on random sequence selection
mapping = {'train':'TRAIN_RC_RND_SEQ', 'val':'VAL_RC_RND_SEQ'} 
for sample in dataset:
    data_set = str(list(random_split_seq['Filepath'] == sample.filepath]['Set'])[0])
#     print(data_set)
    if data_set != '0':
        sample.tags.append(mapping[data_set])
#         print(sample.tags)
        # sample.save()
    break

### Delete splits in fiftyone (if needed sometimes)

In [None]:
to_check = list(mapping.values())
for sample in dataset:
    for item in to_check:
        if item in sample.tags:
            sample.tags.remove(str(item))
    sample.save()

## 4. Use the sequences from the same location but under different road conditions in the validation set -- ID

In [None]:
# TODO 