# 0 - Information

# 1 - Packages

In [1]:
# Maths packages
import numpy as np

# Dataset packages
import pandas as pd

# Import os packages
from os import listdir
from os.path import isfile, join
import re

# Import progress bar
from tqdm import tqdm

# 2 - Read Data

## 2.1 - Load the Data

In [2]:
def readData(filename):
    """Return the data of the file filename and save them as a numpy array."""
    
    # Resulting dct
    dct = {}
    
    with open(filename) as f:
        
        # Read the first line
        N = f.readline().split()
        
        # Append dct
        dct["N"] = N
        
        # Resulting dataFrame
        df = pd.DataFrame(columns=["ID", "H_V", "M", "Tags"])
        
        # Resulting convertion table
        table = {}
        
        # Counter
        counter = 0
        
        # Read the other lines and save them in 
        for i, line in enumerate(f):
            
            # Extract info
            string = line.strip().split()
            infos = [i, string[0], int(string[1]), " ".join(string[2:])]
            
            # Extract tags
            tags = string[2:]
            for tag in tags:
                
                # Test if present in table
                if not(tag in table):
                    table[tag] = counter
                    counter += 1

            # Add the infos to df
            df.loc[i] = infos
        
        # Add df to dct and table
        dct["Photos"] = df
        dct["Table"] = table

    return dct

In [3]:
def readFiles(folder):
    """Read the data in folder and save them in a dict."""
    
    # Resulting dict
    result_dct = {}
    
    # List of files in folder
    files = [f for f in listdir(folder) if isfile(join(folder, f))]
    
    # Loop over all the files in folder
    for file in tqdm(files):
        
        # Read the data and save them in dct
        file_name = re.sub("\...", "", file)
        result_dct[file_name] = readData(join(folder, file))
        
    return result_dct

In [4]:
# Read the data
data_dct = readFiles("Data")

100%|██████████| 5/5 [33:17<00:00, 447.61s/it]


## 2.2 - Convert String as integer

In [5]:
def convertTags(tags_str, table):
    """Convert a string of tags into a string of integer trough table."""
    
    # Split str
    tags_l = tags_str.split()
    
    # Loop of convertion
    tags_int_l = [str(table[tag]) for tag in tags_l]
    
    # Convert as a string
    result = " ".join(tags_int_l)
    
    return result

In [15]:
def convertAsInt(dct):
    """Convert tag as int."""
    
    for key in tqdm(dct.keys()):
        
        # Extract df and table of dct
        df = dct[key]["Photos"]
        table = dct[key]["Table"]
        
        # Definition of the lambda function
        convert = lambda tags_str : convertTags(tags_str, table)
        
        # Add a colum with integers
        df["Tags_Int"] = df["Tags"].apply(lambda x: convert(x))
        
        # Update df of dct
        dct[key] = df
        
    return dct

In [19]:
# Read the data
# data_dct = convertAsInt(data_dct)
# test_dct = convertAsInt({"a": data_dct["c_memorable_momentst"]})

# # Extract one example
# test_dct = data_dct["c_memorable_momentst"]

# Display test_dct
# print(data_dct.keys())

100%|██████████| 5/5 [00:01<00:00,  2.21it/s]

dict_keys(['a_examplet', 'b_lovely_landscapest', 'c_memorable_momentst', 'd_pet_picturest', 'e_shiny_selfiest'])





In [20]:
# Save the dictionnary
# np.save('./Save/data_dct.npy', data_dct) 
# np.save('./Save/test_dct.npy', test_dct) 

In [23]:
# Load
data_dct = np.load('./Save/data_dct.npy').item()
test_dct = np.load('./Save/test_dct.npy').item()

# Display test_dct
data_dct["a_examplet"]

Unnamed: 0,ID,H_V,M,Tags,Tags_Int
0,0,H,3,cat beach sun,0 1 2
1,1,V,2,selfie smile,3 4
2,2,V,2,garden selfie,5 3
3,3,H,2,garden cat,5 0


# 3 - Optimisation

In [None]:
def prediction(dataset_dct):
    """Make prediction for the current dataset_dct."""
    
    # Resulting array
    results = []
    
    # Extract N
    N = dct["N"]
    
    # Extract Photos
    photos_df = dct["Photos"]
    
    # Predictions
    for i in range(len(photos_df)):
        results.append(photos_df.iloc[i,:]["ID"])
                
    return results

In [None]:
def predictionsDct(data_dct):
    """Extract the data of each file in dct and compute the predictions."""
    
    # Resulting dct
    predictions_dct = {}
    
    # Loop over the different datasets
    for key in data_dct.keys():
        
        # Compute the predictions for the current datasets
        predictions_dct[key] = prediction(data_dct[key])
        
    # Return the predictions made
    return predictions_dct

In [None]:
# Compute the predictions
predictions_dct = predictionsDct(test_dct)

# 4 - Save Predictions

In [None]:
def writePredictions(predictions_dct, folder="Results/"):
    """Save the predictions."""
    
    # Loop over all the predictions save in array_dct
    for key in predictions_dct.keys():
        
        # Extract predictions for the given files
        photos = predictions_dct[key]

        # Count the number of photos in the slideshow
        nb_photos = len(photo)
        
        # Writes the result in a txt file
        f = open(folder + key + ".txt", "w")
        
        # Write number of slices
        f.write(str(nb_photos) + "\n")
        
        # Loop over each slices
        for ID in photos:
            f.write(str(ID) + "\n")
            
        # Closing the file
        f.close()

In [None]:
writePredictions(predictions_dct)