In [3]:
### Import required libraries

import pandas as pd
import numpy as np
import os
import shutil
import zipfile

import warnings
# Globally silence every warning category for the rest of the session.
# NOTE(review): this also hides any warnings pandas may raise in later cells;
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

In [4]:
# Set the working directory that holds the image and label folders; every
# path used later in the notebook is relative to it.
# The location can be overridden without editing the notebook by setting the
# PROJECT_DATA_DIR environment variable; the original path stays the default.
DATA_DIR = os.environ.get("PROJECT_DATA_DIR", r"C:\Applied_Machine_Learning\Project\Data")

os.chdir(DATA_DIR)

## Data Ingestion - 2021 & 2022

In [5]:
def data_ingestion():
    """Read the 2021 and 2022 label CSV files relative to the working directory.

    Returns:
        A 6-tuple of DataFrames in the order
        ``(train_21, val_21, test_21, train_22, val_22, test_22)``.
        The 2021 frames are reduced to the ``image``, ``class`` and ``bbox``
        columns; the 2022 frames are returned exactly as read. The 2022
        "test" frame is read from ``sample_submission.csv``.
    """
    columns_2021 = ['image', 'class', 'bbox']

    def read_labels_2021(csv_path):
        # Only the image name, class and bounding box are kept for 2021.
        return pd.read_csv(csv_path)[columns_2021]

    def read_labels_2022(csv_path):
        # 2022 label files are used with all of their columns.
        return pd.read_csv(csv_path)

    # 2021 splits
    train_data_21 = read_labels_2021('2021/label/train_labels.csv')
    val_data_21 = read_labels_2021('2021/label/validate_labels.csv')
    test_data_21 = read_labels_2021('2021/label/test_labels.csv')

    # 2022 splits
    train_data_22 = read_labels_2022('2022/label/train.csv')
    val_data_22 = read_labels_2022('2022/label/val.csv')
    test_data_22 = read_labels_2022('2022/label/sample_submission.csv')

    return (
        train_data_21, val_data_21, test_data_21,
        train_data_22, val_data_22, test_data_22,
    )

## Data Preparation - 2021

In [6]:
# Load all six label tables once, then preview the three 2021 splits.
(
    train_data_21, val_data_21, test_data_21,
    train_data_22, val_data_22, test_data_22,
) = data_ingestion()

display(train_data_21, val_data_21, test_data_21)

Unnamed: 0,image,class,bbox
0,image_10299_img.png,CubeSat,"[296, 536, 353, 581]"
1,image_12016_img.png,Jason,"[580, 434, 712, 753]"
2,image_06332_img.png,Debris,"[437, 406, 503, 463]"
3,image_02173_img.png,TRMM,"[0, 151, 312, 538]"
4,image_11207_img.png,Cloudsat,"[450, 425, 509, 481]"
...,...,...,...
89995,image_24362_img.png,Debris,"[447, 440, 501, 478]"
89996,image_08834_img.png,Sentinel-6,"[291, 209, 485, 407]"
89997,image_00226_img.png,Cloudsat,"[506, 472, 556, 519]"
89998,image_11172_img.png,Jason,"[422, 438, 515, 631]"


Unnamed: 0,image,class,bbox
0,image_10563_img.png,Cloudsat,"[262, 283, 323, 367]"
1,image_06536_img.png,Terra,"[476, 469, 542, 497]"
2,image_04676_img.png,Sentinel-6,"[520, 445, 584, 516]"
3,image_02965_img.png,Sentinel-6,"[377, 606, 692, 945]"
4,image_03907_img.png,CubeSat,"[523, 421, 560, 464]"
...,...,...,...
29995,image_04350_img.png,CubeSat,"[714, 573, 761, 606]"
29996,image_07779_img.png,CubeSat,"[360, 657, 381, 681]"
29997,image_05426_img.png,CubeSat,"[537, 737, 567, 783]"
29998,image_06952_img.png,Cloudsat,"[420, 402, 466, 449]"


Unnamed: 0,image,class,bbox
0,image_00000_img.png,1,"[457, 524, 684, 733]"
1,image_00001_img.png,6,"[58, 193, 299, 495]"
2,image_00002_img.png,0,"[411, 406, 491, 490]"
3,image_00003_img.png,7,"[346, 640, 763, 892]"
4,image_00004_img.png,9,"[436, 574, 734, 842]"
...,...,...,...
29995,image_29995_img.png,9,"[242, 404, 460, 515]"
29996,image_29996_img.png,4,"[445, 358, 541, 412]"
29997,image_29997_img.png,3,"[715, 408, 1024, 560]"
29998,image_29998_img.png,5,"[246, 594, 311, 667]"


## Handling Class Imbalance - 2021 & 2022

In [7]:
#def data_preparation():
def valueCounts(data, lst):
    """Print the value counts of every column of ``data`` named in ``lst``.

    For each column the name is printed first, followed by the result of
    ``value_counts()`` and a blank separator. Nothing is returned.
    """
    for column_name in lst:
        print(f"{column_name} \n")
        column_counts = data[column_name].value_counts()
        print(column_counts)
        print('\n')


# Example calls, kept disabled:
# valueCounts(train_data_21, ['class'])
# valueCounts(val_data_21, ['class'])
# valueCounts(test_data_21, ['class'])

# In the test dataset, class 6 has been identified as debris.

# Ordinal code assigned to each 2021 class name.
order_2021 = {
    'CubeSat': 1,
    'Jason': 2,
    'TRMM': 3,
    'Cloudsat': 4,
    'Terra': 5,
    'Debris': 6,
    'Sentinel-6': 7,
    'AcrimSat': 8,
    'Aura': 9,
    'Aquarius': 10,
    'Calipso': 11,
}

# Ordinal encoding - store the numeric code of each class name in a working
# column next to the original label.
train_data_21['class_copy'] = train_data_21['class'].map(order_2021)
val_data_21['class_copy'] = val_data_21['class'].map(order_2021)

def imbalancement_handling(data, debris_count, satellite_count, year):
    """Down-sample ``data`` to a fixed number of rows per class and add a binary label.

    Args:
        data: DataFrame with a string ``class`` column. It is not modified.
        debris_count: number of rows to draw from the debris class.
        satellite_count: number of rows to draw from every other class.
        year: accepted for backward compatibility. It was only used to pick
            the spelling of the debris class ('Debris' for 2021, 'debris'
            otherwise); since every comparison is case-insensitive it has no
            influence on the result.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the sampled debris rows
        first, then the sampled rows of each other class in order of first
        appearance in ``data``. The ``class_copy`` column holds 0 for debris
        rows and 1 for all other rows.

    Raises:
        ValueError: from ``DataFrame.sample`` when a class has fewer rows
            than the number requested.
    """
    debris_label = 'debris'
    is_debris = data['class'].str.lower() == debris_label

    # Collect every sample in a list and concatenate once; growing a frame
    # with pd.concat inside the loop re-copies all previous rows each time.
    sampled_parts = [data[is_debris].sample(n=debris_count, random_state=102)]
    for satellite_class in data.loc[~is_debris, 'class'].unique():
        class_rows = data[data['class'] == satellite_class]
        sampled_parts.append(class_rows.sample(n=satellite_count, random_state=102))

    balanced_df = pd.concat(sampled_parts, ignore_index=True)

    # Label encoding for debris (0) and satellites (1)
    balanced_df['class_copy'] = (
        balanced_df['class'].str.lower() != debris_label
    ).astype('int64')

    return balanced_df

# Balance every labelled split: a fixed number of debris rows plus a fixed
# number of rows from each satellite class.
train_data_21 = imbalancement_handling(
    data=train_data_21, debris_count=15000, satellite_count=1500, year=2021
)
val_data_21 = imbalancement_handling(
    data=val_data_21, debris_count=5000, satellite_count=500, year=2021
)

train_data_22 = imbalancement_handling(
    data=train_data_22, debris_count=6000, satellite_count=600, year=2022
)
val_data_22 = imbalancement_handling(
    data=val_data_22, debris_count=2000, satellite_count=200, year=2022
)

# Collapse the numeric 2021 test labels to two values: 6 -> 0, anything else -> 1.
# NOTE: not idempotent - re-running this cell maps every label to 1, because
# the 0/1 values written here are themselves different from 6.
test_data_21['class'] = test_data_21['class'].ne(6).astype('int64')

display(train_data_21, val_data_21, test_data_21, train_data_22, val_data_22)

Unnamed: 0,image,class,bbox,class_copy
0,image_22142_img.png,Debris,"[562, 488, 616, 547]",0
1,image_04520_img.png,Debris,"[529, 552, 545, 567]",0
2,image_19428_img.png,Debris,"[747, 668, 881, 818]",0
3,image_17605_img.png,Debris,"[194, 454, 337, 582]",0
4,image_14818_img.png,Debris,"[451, 620, 488, 683]",0
...,...,...,...,...
29995,image_10510_img.png,Calipso,"[0, 726, 248, 964]",1
29996,image_11535_img.png,Calipso,"[613, 362, 681, 460]",1
29997,image_09127_img.png,Calipso,"[444, 350, 479, 435]",1
29998,image_09094_img.png,Calipso,"[258, 145, 334, 373]",1


Unnamed: 0,image,class,bbox,class_copy
0,image_13657_img.png,Debris,"[587, 483, 645, 556]",0
1,image_16869_img.png,Debris,"[569, 404, 601, 452]",0
2,image_19057_img.png,Debris,"[450, 502, 469, 547]",0
3,image_08279_img.png,Debris,"[390, 399, 531, 537]",0
4,image_06653_img.png,Debris,"[569, 442, 617, 495]",0
...,...,...,...,...
9995,image_02440_img.png,Jason,"[570, 564, 605, 673]",1
9996,image_11697_img.png,Jason,"[217, 393, 535, 749]",1
9997,image_02384_img.png,Jason,"[656, 302, 704, 438]",1
9998,image_09530_img.png,Jason,"[447, 603, 667, 768]",1


Unnamed: 0,image,class,bbox
0,image_00000_img.png,1,"[457, 524, 684, 733]"
1,image_00001_img.png,0,"[58, 193, 299, 495]"
2,image_00002_img.png,1,"[411, 406, 491, 490]"
3,image_00003_img.png,1,"[346, 640, 763, 892]"
4,image_00004_img.png,1,"[436, 574, 734, 842]"
...,...,...,...
29995,image_29995_img.png,1,"[242, 404, 460, 515]"
29996,image_29996_img.png,1,"[445, 358, 541, 412]"
29997,image_29997_img.png,1,"[715, 408, 1024, 560]"
29998,image_29998_img.png,1,"[246, 594, 311, 667]"


Unnamed: 0,filename,class,bbox,class_copy
0,img020297.png,debris,"[480, 643, 650, 771]",0
1,img025277.png,debris,"[663, 218, 756, 283]",0
2,img025098.png,debris,"[770, 605, 843, 674]",0
3,img025621.png,debris,"[805, 309, 870, 380]",0
4,img020295.png,debris,"[663, 534, 857, 719]",0
...,...,...,...,...
11995,img034008.png,double_star,"[27, 375, 536, 608]",1
11996,img036956.png,double_star,"[447, 508, 541, 651]",1
11997,img030150.png,double_star,"[187, 166, 864, 916]",1
11998,img036704.png,double_star,"[390, 528, 685, 658]",1


Unnamed: 0,filename,class,bbox,class_copy
0,img021146.png,debris,"[539, 229, 685, 391]",0
1,img022562.png,debris,"[471, 261, 531, 340]",0
2,img027925.png,debris,"[511, 614, 585, 681]",0
3,img027323.png,debris,"[441, 687, 518, 748]",0
4,img028632.png,debris,"[789, 462, 859, 521]",0
...,...,...,...,...
3995,img033343.png,double_star,"[420, 153, 684, 611]",1
3996,img039462.png,double_star,"[476, 306, 625, 572]",1
3997,img037319.png,double_star,"[417, 425, 624, 759]",1
3998,img039601.png,double_star,"[538, 386, 692, 589]",1


## Data Preparation - 2021

In [8]:
# Appending satellite name and year to original image name, changing file extension and relocating it
def image_relocalize(data, tag):
    """Copy the 2021 images of one split into ``master_data/<tag>`` under new names.

    The label names are rewritten in place on ``data``:

    * ``image``   - last 4 characters replaced by ``.jpg`` (the source file name).
    * ``image_c`` - last 7 characters removed, then ``<class>_2021.jpg`` appended
      for 'train'/'validate', or ``2021.jpg`` for 'test' (the destination name).
      Assumes names end in ``img.png`` as in the label files, e.g.
      ``image_10299_img.png`` -> ``image_10299_Debris_2021.jpg``.

    Sources are read from ``2021/<tag>_rgb/<class>/`` for 'train'/'validate'
    and from ``2021/test_rgb/`` for 'test'. Rows whose source image does not
    exist are removed from ``data``.

    Args:
        data: DataFrame with an ``image`` column and, for 'train'/'validate',
            a string ``class`` column. Modified in place.
        tag: one of 'train', 'validate' or 'test'.

    Returns:
        The same ``data`` object. For an unknown ``tag`` the frame is
        returned untouched after printing "Invalid Tag".
    """
    # Reject unknown tags before touching the frame so that an invalid call
    # cannot leave the labels half-renamed.
    if tag not in ('train', 'validate', 'test'):
        print("Invalid Tag")
        return data

    original_names = data['image']
    name_prefix = original_names.str[:-7]
    data['image'] = original_names.str[:-4] + '.jpg'

    if tag == 'test':
        data['image_c'] = name_prefix + '2021.jpg'
        source_dirs = ['2021/test_rgb'] * len(data)
    else:
        data['image_c'] = name_prefix + data['class'] + '_2021.jpg'
        source_dirs = ('2021/' + tag + '_rgb/' + data['class']).tolist()

    # Create the destination up front: without it shutil.copy raises
    # FileNotFoundError for the *destination*, which would be miscounted as a
    # missing source image and drop every row.
    destination_dir = 'master_data/' + tag
    os.makedirs(destination_dir, exist_ok=True)

    # counters for checking non-existing images
    found_count = 0
    missing_rows = []

    for index, source_dir, actual_image_name, changing_image_name in zip(
        data.index, source_dirs, data['image'], data['image_c']
    ):
        original_image_path = os.path.join(source_dir, actual_image_name)
        destination_image_path = os.path.join(destination_dir, changing_image_name)
        try:
            # Copy the image file to another location
            shutil.copy(original_image_path, destination_image_path)
            found_count += 1
        except FileNotFoundError:
            missing_rows.append(index)

    # Drop the unmatched labels once, after the loop, instead of mutating the
    # frame while it is being iterated.
    if missing_rows:
        data.drop(index=missing_rows, inplace=True)

    print(tag, f"Files found: {found_count}")
    print(tag, f"Files not found: {len(missing_rows)}")

    return data

# Rename and copy the images of each 2021 split; rows without an image are dropped.
train_data_21 = image_relocalize(data=train_data_21, tag='train')
val_data_21 = image_relocalize(data=val_data_21, tag='validate')
test_data_21 = image_relocalize(data=test_data_21, tag='test')

train Files found: 30000
train Files not found: 0
validate Files found: 10000
validate Files not found: 0
test Files found: 30000
test Files not found: 0


In [9]:
# Per split: remove the helper columns that are not exported, then tag the
# rows with their usage and year.
train_data_21.drop(columns=['class_copy', 'image'], inplace=True)
train_data_21['usage'] = 'train'
train_data_21['year'] = 2021

val_data_21.drop(columns=['class_copy', 'image'], inplace=True)
val_data_21['usage'] = 'val'
val_data_21['year'] = 2021

test_data_21.drop(columns=['image'], inplace=True)
test_data_21['usage'] = 'test'
test_data_21['year'] = 2021

# Column layout shared by the three splits
desired_order = ['image_c', 'bbox', 'year', 'usage', 'class']

Filtered_train = train_data_21[desired_order]
Filtered_val = val_data_21[desired_order]
test_data = test_data_21[desired_order]

# Stack train, val and test into one table with a fresh 0..n-1 index and
# expose the relocated image name as `filename`.
# NOTE(review): `class` holds class names for train/val but the 0/1 values
# assigned earlier for test, so the column mixes two label schemes.
data_label_2021 = pd.concat(
    [Filtered_train, Filtered_val, test_data], ignore_index=True
).rename(columns={'image_c': 'filename'})

data_label_2021

Unnamed: 0,filename,bbox,year,usage,class
0,image_22142_Debris_2021.jpg,"[562, 488, 616, 547]",2021,train,Debris
1,image_04520_Debris_2021.jpg,"[529, 552, 545, 567]",2021,train,Debris
2,image_19428_Debris_2021.jpg,"[747, 668, 881, 818]",2021,train,Debris
3,image_17605_Debris_2021.jpg,"[194, 454, 337, 582]",2021,train,Debris
4,image_14818_Debris_2021.jpg,"[451, 620, 488, 683]",2021,train,Debris
...,...,...,...,...,...
69995,image_29995_2021.jpg,"[242, 404, 460, 515]",2021,test,1
69996,image_29996_2021.jpg,"[445, 358, 541, 412]",2021,test,1
69997,image_29997_2021.jpg,"[715, 408, 1024, 560]",2021,test,1
69998,image_29998_2021.jpg,"[246, 594, 311, 667]",2021,test,1


## Data Preparation - 2022

In [10]:
# Preview the balanced 2022 training and validation labels.
display(train_data_22, val_data_22)

Unnamed: 0,filename,class,bbox,class_copy
0,img020297.png,debris,"[480, 643, 650, 771]",0
1,img025277.png,debris,"[663, 218, 756, 283]",0
2,img025098.png,debris,"[770, 605, 843, 674]",0
3,img025621.png,debris,"[805, 309, 870, 380]",0
4,img020295.png,debris,"[663, 534, 857, 719]",0
...,...,...,...,...
11995,img034008.png,double_star,"[27, 375, 536, 608]",1
11996,img036956.png,double_star,"[447, 508, 541, 651]",1
11997,img030150.png,double_star,"[187, 166, 864, 916]",1
11998,img036704.png,double_star,"[390, 528, 685, 658]",1


Unnamed: 0,filename,class,bbox,class_copy
0,img021146.png,debris,"[539, 229, 685, 391]",0
1,img022562.png,debris,"[471, 261, 531, 340]",0
2,img027925.png,debris,"[511, 614, 585, 681]",0
3,img027323.png,debris,"[441, 687, 518, 748]",0
4,img028632.png,debris,"[789, 462, 859, 521]",0
...,...,...,...,...
3995,img033343.png,double_star,"[420, 153, 684, 611]",1
3996,img039462.png,double_star,"[476, 306, 625, 572]",1
3997,img037319.png,double_star,"[417, 425, 624, 759]",1
3998,img039601.png,double_star,"[538, 386, 692, 589]",1


In [11]:
# Appending year to original image name and changing file extension:
# the last 4 characters (the extension) are removed and '_2022.jpg' is added,
# e.g. img020297.png -> img020297_2022.jpg.
# An already present '_2022' suffix is stripped first, so re-running this
# cell leaves the names unchanged instead of producing '*_2022_2022.jpg'.
train_data_22['filename'] = (
    train_data_22['filename'].str[:-4].str.replace(r'_2022$', '', regex=True)
    + '_2022.jpg'
)

val_data_22['filename'] = (
    val_data_22['filename'].str[:-4].str.replace(r'_2022$', '', regex=True)
    + '_2022.jpg'
)

In [12]:
def image_transfer(data, flag):
    """Copy the 2022 images listed in ``data['filename']`` into the master folders.

    Each entry of ``data['filename']`` is the destination name. The source
    name is derived from it by removing the last 9 characters (the
    ``_2022.jpg`` suffix added to the labels) and appending ``.jpg``.

    Args:
        data: DataFrame with a ``filename`` column. It is not modified.
        flag: 'train' copies from ``2022/train`` to ``master_data/train``;
            any other value copies from ``2022/val`` to ``master_data/validate``.

    Returns:
        None. Prints "Images transferred." when done, preceded by a notice
        with the number of skipped images if any source file was missing.
    """
    if flag == 'train':
        source_dir, destination_dir = "2022/train", "master_data/train"
    else:
        source_dir, destination_dir = "2022/val", "master_data/validate"

    # Make sure the destination exists so the copy cannot fail on a fresh setup.
    os.makedirs(destination_dir, exist_ok=True)

    skipped = 0
    for final_name in data['filename']:
        source_name = final_name[:-9] + '.jpg'
        source_path = os.path.join(source_dir, source_name)

        if os.path.exists(source_path):
            shutil.copy(source_path, os.path.join(destination_dir, final_name))
        else:
            # Missing sources are still skipped, but no longer silently.
            skipped += 1

    if skipped:
        print(f"{flag}: {skipped} source image(s) not found and skipped.")
    print("Images transferred.")

# Copy the 2022 training and validation images into the master folders.
image_transfer(data=train_data_22, flag='train')
image_transfer(data=val_data_22, flag='validate')

Images transferred.
Images transferred.


In [13]:
# Tag every 2022 split with its usage and year.
train_data_22['usage'] = 'train'
train_data_22['year'] = 2022

val_data_22['usage'] = 'val'
val_data_22['year'] = 2022

test_data_22['usage'] = 'test'
test_data_22['year'] = 2022

# The test split is exported with empty bbox and class values.
# NOTE(review): the test filenames are not renamed to '*_2022.jpg' like the
# train/val ones, so they keep their original form in the combined table.
test_data_22['bbox'] = np.nan
test_data_22['class'] = np.nan

# Column layout shared by the three splits
desired_order = ['filename', 'bbox', 'year', 'usage', 'class']

train_data_22 = train_data_22[desired_order]
val_data_22 = val_data_22[desired_order]
test_data_22 = test_data_22[desired_order]

# Stack the splits into one table with a fresh 0..n-1 index.
data_label_2022 = pd.concat(
    [train_data_22, val_data_22, test_data_22], ignore_index=True
)

data_label_2022


Unnamed: 0,filename,bbox,year,usage,class
0,img020297_2022.jpg,"[480, 643, 650, 771]",2022,train,debris
1,img025277_2022.jpg,"[663, 218, 756, 283]",2022,train,debris
2,img025098_2022.jpg,"[770, 605, 843, 674]",2022,train,debris
3,img025621_2022.jpg,"[805, 309, 870, 380]",2022,train,debris
4,img020295_2022.jpg,"[663, 534, 857, 719]",2022,train,debris
...,...,...,...,...,...
37995,img022385.png,,2022,test,
37996,img022129.png,,2022,test,
37997,img025998.png,,2022,test,
37998,img023907.png,,2022,test,


## Exporting the CSV files

In [15]:
# Export the per-year label tables and the combined table as CSV files.
# The target folder is created first so the export does not fail when
# master_data/label is missing.
os.makedirs('master_data/label', exist_ok=True)

data_label_2021.to_csv('master_data/label/data_label_2021.csv', index=False)
data_label_2022.to_csv('master_data/label/data_label_2022.csv', index=False)

# Combined 2021 + 2022 labels with a fresh 0..n-1 index.
final_data_label = pd.concat([data_label_2021, data_label_2022], ignore_index=True)

final_data_label.to_csv('master_data/label/final_data_label.csv', index=False)
