# Combining Data

The purpose of this notebook is to make one giant CSV file out of the many image files in the dataset, which are scattered across different folders.

## Imports

In [1]:
import pandas as pd
import glob
import rarfile

## Creating a DataFrame for the Images in Each Folder

This is a **binary classification** problem. Here is what the labels I'm assigning mean:

- 0 - there is NO FIRE detected in the image
- 1 - there is a FIRE detected in the image

In [2]:
def make_df(folder, label):
    """Returns a pandas.DataFrame object from the RAR folder.

       Parameters:
       folder(RarFile): opened RAR archive; it is entered as a
                        context manager and its namelist() is read
       label(int): binary classification for all images in this
                   directory

       Returns: pandas.DataFrame object of the data
                Columns: ['Folder', 'filename', 'label']
                One row per archive entry that has both a folder
                part and a file name; every other entry is skipped.

    """
    columns = ['Folder', 'filename', 'label']
    rows = []
    # read data from the folder
    with folder as archive:
        for path in archive.namelist():
            # split on the LAST slash so nested paths keep their full
            # parent folder instead of failing to unpack
            parent, _, name = path.rpartition('/')
            # an entry with no folder part or no file name (e.g. the
            # directory entry itself) is not an image; skip it wherever
            # it appears rather than assuming it is the last item
            if not parent or not name:
                continue
            rows.append((parent, name, label))
    # explicit columns keep the schema stable even when there are no rows
    return pd.DataFrame(rows, columns=columns)

### Collect DataFrames for All Folders Together

In [3]:
def add_df_to_list(is_normal):
    """Add DataFrame objects to an overall collection.

       Parameter:
       is_normal(bool): determines kind of paths to search,
                        and label to put in DataFrames created
                        (truthy -> 'Normal Images*' with label 0,
                         falsy  -> 'Fire Images*' with label 1)

       Returns:
       list: list of the DataFrame objects created, one per
             matching path, ordered by path

    """
    # define variables for type of images being added
    data_path = 'Fire-Detection-Image-Dataset/'
    # test truthiness rather than `is True`, so bool-like flags
    # (e.g. numpy.bool_) select the intended branch
    if is_normal:
        pattern, label = data_path + 'Normal Images*', 0
    else:
        pattern, label = data_path + 'Fire Images*', 1
    # glob yields matches in arbitrary order; sort them so the
    # resulting list is the same on every run
    return [
        make_df(rarfile.RarFile(path), label)
        for path in sorted(glob.glob(pattern))
    ]


# add all the images with no fire
all_df = add_df_to_list(True)
# add the images with fire
fire_df = add_df_to_list(False)
# extend (not append): append would nest the whole fire list as a single
# element, leaving all_df a mix of DataFrames and one inner list
all_df.extend(fire_df)