# Basic Libraries II

### Exercise


Reusing the same annotations we work with in the previous session, answer the following items using the libraries we saw today: 

1. How many annotations you have per month and year. Which month has more annotation files.

In [1]:
import glob
import os
import re
from datetime import datetime
import pandas as pd

In [2]:
annotations_path = 'annotations/*.txt'
pattern = re.compile(r'(\d{8})_(\d{6})_SN(\d+)_QUICKVIEW_VISUAL_([\d_]+)_([A-Za-z0-9\-_.]+)\.txt')

# We create a list to store annotation dates
annotation_dates = []

annotations = glob.glob(annotations_path)

# We extract dates from each filename
for annotation in annotations:
    filename = os.path.basename(annotation)
    match = re.match(pattern, filename)
    if match:
        date, time, _, _, _ = match.groups()
        date_obj = datetime.strptime(date, "%Y%m%d")
        annotation_dates.append(date_obj)

# We convert the list of dates into a DataFrame
df = pd.DataFrame(annotation_dates, columns=['date'])

# We extract year and month from date and add them to the DataFrame
df['Year'] = df['date'].dt.year
df['Month'] = df['date'].dt.month

# We group by year and month to count  the annotations
annotations_per_month = df.groupby(['Year', 'Month']).size().reset_index(name='Amount')

max_annotations = annotations_per_month.loc[annotations_per_month['Amount'].idxmax()]

print("Annotations per month and year:")
print(annotations_per_month)
print(f"\nThe month with the most files is 0{max_annotations['Month']}-{max_annotations['Year']} with {max_annotations['Amount']} annotations.")


Annotations per month and year:
   Year  Month  Amount
0  2024      1      27
1  2024      2      45
2  2024      3      17
3  2024      4      25
4  2024      5      28
5  2024      6      52

The month with the most files is 06-2024 with 52 annotations.


2. Create a dictionary where each **key** is a month, and the corresponding **value** is a list containing all the annotation names with where their date corresponds to the month. 

    a. Save it following the json format, and load it again to check that everything is ok.

In [3]:
import json

# We initialize a dictionary to store annotations by month
annotations_by_month = {}

pattern = re.compile(r'(\d{8})_(\d{6})_SN(\d+)_QUICKVIEW_VISUAL_([\d_]+)_([A-Za-z0-9\-_.]+)\.txt')
annotations = glob.glob(annotations_path)

# We add the annotations to the dictionary
for annotation in annotations:
    filename = os.path.basename(annotation)
    match = re.match(pattern, filename)
    if match:
        date = match.group(1)
        date_obj = datetime.strptime(date, '%Y%m%d')
        month_year_str = date_obj.strftime('%B %Y')    # As a key, we create a string of month and year in case we had more than one year.

        if month_year_str not in annotations_by_month:
            annotations_by_month[month_year_str] = []
        
        annotations_by_month[month_year_str].append(filename)

# We save the dictionary as a JSON file
with open('annotations_by_month.json', 'w') as f:
    json.dump(annotations_by_month, f)

with open('annotations_by_month.json', 'r') as f:
    loaded_annotations = json.load(f)

print("Loaded annotations:") # We check the loaded annotations
loaded_annotations


Loaded annotations:


{'January 2024': ['20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3770.txt',
  '20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3772.txt',
  '20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_552_4162.txt',
  '20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_552_4164.txt',
  '20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_554_4162.txt',
  '20240101_213601_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_392_3740.txt',
  '20240101_213601_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_392_3742.txt',
  '20240101_213601_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_396_3752.txt',
  '20240102_185527_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_740_3850.txt',
  '20240102_185605_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_690_3572.txt',
  '20240102_185954_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_414_3786.txt',
  '20240104_220339_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_556_4178.txt',
  '20240110_192002_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_380_3728.tx

b. Save it this time using Pickle.

In [4]:
import pickle

with open('annotations_by_month.pkl', 'wb') as f:
    pickle.dump(annotations_by_month, f)

with open('annotations_by_month.pkl', 'rb') as f:
    loaded_annotations = pickle.load(f)

print("Loaded annotations from pickle:")   # We check the loaded annotations
loaded_annotations


Loaded annotations from pickle:


{'January 2024': ['20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3770.txt',
  '20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3772.txt',
  '20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_552_4162.txt',
  '20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_552_4164.txt',
  '20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_554_4162.txt',
  '20240101_213601_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_392_3740.txt',
  '20240101_213601_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_392_3742.txt',
  '20240101_213601_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_396_3752.txt',
  '20240102_185527_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_740_3850.txt',
  '20240102_185605_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_690_3572.txt',
  '20240102_185954_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_414_3786.txt',
  '20240104_220339_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_556_4178.txt',
  '20240110_192002_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_380_3728.tx

c. Instead of storing a list of all the annotation names happening that month, let's create for each annotation a dictionary with keys: name and date (using a datetime object).

In [5]:
annotations_month = {}

# Extract dates and filenames
for annotation in annotations:
    filename = os.path.basename(annotation)
    match = re.match(pattern, filename)
    if match:
        date, time, _, _, _ = match.groups()
        datetime_str = date + time 
        datetime_obj = datetime.strptime(datetime_str, "%Y%m%d%H%M%S")
        annotation_dict = {'name': filename, 'date': datetime_obj}

        if month_year_str not in annotations_month:
            annotations_month[month_year_str] = []
        
        annotations_month[month_year_str].append(annotation_dict)


In [6]:
# Now we can try saving the dictionary to a JSON file
with open('annotations_month.json', 'w') as json_file:
    json.dump(annotations_month, json_file, default=str)

# Load the dictionary from the JSON file to check
with open('annotations_month.json', 'r') as json_file:
    loaded_annotations = json.load(json_file)

print("Loaded annotations from JSON:")
loaded_annotations

Loaded annotations from JSON:


{'June 2024': [{'name': '20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3770.txt',
   'date': '2024-01-01 17:43:01'},
  {'name': '20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3772.txt',
   'date': '2024-01-01 17:43:01'},
  {'name': '20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_552_4162.txt',
   'date': '2024-01-01 19:28:56'},
  {'name': '20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_552_4164.txt',
   'date': '2024-01-01 19:28:56'},
  {'name': '20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_554_4162.txt',
   'date': '2024-01-01 19:28:56'},
  {'name': '20240101_213601_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_392_3740.txt',
   'date': '2024-01-01 21:36:01'},
  {'name': '20240101_213601_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_392_3742.txt',
   'date': '2024-01-01 21:36:01'},
  {'name': '20240101_213601_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_396_3752.txt',
   'date': '2024-01-01 21:36:01'},
  {'name': '20240102_185527

3. Print all the annotations from the oldest ones to the newest one during the seconf half of the 2024. 

In [7]:
second_half_2024 = []

for annotation in annotations:
    filename = os.path.basename(annotation)
    match = re.match(pattern, filename)
    if match:
        date_str = match.group(1)
        date_obj = datetime.strptime(date_str, '%Y%m%d')
        if datetime(2024, 7, 1) <= date_obj <= datetime(2024, 12, 31):  # We filter the dates for only the second part of 2024
            second_half_2024.append((filename, date_obj))

df = pd.DataFrame(second_half_2024, columns=['filename', 'date'])
df_sorted = df.sort_values(by='date')

if df_sorted.empty:
    print("No annotations from the second half of 2024.")
else:
    print("Annotations from the second half of 2024, sorted from oldest to newest:")
    for index, row in df_sorted.iterrows():
        print(row['filename'])

No annotations from the second half of 2024.


In [8]:
#We run the code with the first half of 2024 just to check that it works.
first_half_2024 = []

for annotation in annotations:
    filename = os.path.basename(annotation)
    match = re.match(pattern, filename)
    if match:
        date_str = match.group(1)
        date_obj = datetime.strptime(date_str, '%Y%m%d')
        if datetime(2024, 1, 1) <= date_obj <= datetime(2024, 6, 30):
            first_half_2024.append((filename, date_obj))

df = pd.DataFrame(first_half_2024, columns=['filename', 'date'])
df_sorted = df.sort_values(by='date')

if df_sorted.empty:
    print("No annotations from the second half of 2024.")
else:
    print("Annotations from the second half of 2024, sorted from oldest to newest:")
    for index, row in df_sorted.iterrows():
        print(row['filename'])

Annotations from the second half of 2024, sorted from oldest to newest:
20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3770.txt
20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3772.txt
20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_552_4162.txt
20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_552_4164.txt
20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_554_4162.txt
20240101_213601_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_392_3740.txt
20240101_213601_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_392_3742.txt
20240101_213601_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_396_3752.txt
20240102_185527_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_740_3850.txt
20240102_185605_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_690_3572.txt
20240102_185954_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_414_3786.txt
20240104_220339_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_556_4178.txt
20240110_192002_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_380_3728.txt
20240