In [None]:
from google.colab import drive

# # Install necessary libraries
!pip install jcamp -q
!pip install rdkit -q

# Import libraries

import os
import re
from time import sleep
import pandas as pd
from rdkit import Chem
from jcamp import jcamp_read
import requests
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

# Constants
# Seconds to pause after each HTTP request made by download_nist_spectrum.
REQUEST_SLEEP = 1
# Directory where downloaded '<CAS>-IR.jdx' files are written and looked up.
# NOTE(review): this is a relative path and is not under the '/content/drive'
# mount point used below -- confirm it resolves to the intended folder.
DATASET_PATH = '../DATASET_PATH/'

# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')


def download_nist_spectrum(cas_number, timeout=30):
    """Download the IR spectrum for a CAS number from the NIST WebBook.

    The spectrum is requested in JCAMP-DX format and saved as
    ``<cas_number>-IR.jdx`` inside ``DATASET_PATH``. If that file already
    exists, no request is made.

    Args:
        cas_number: CAS registry number as a dash-separated string,
            e.g. ``'122-78-1'``. The dashes are stripped for the query URL
            but kept in the file name.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block a bulk
            download forever.

    Returns:
        1 if the spectrum file was already present on disk, 0 if a request
        was made (the file was saved on HTTP 200, or reported as not found
        on HTTP 404).

    Raises:
        Exception: If the server answers with a status other than 200/404.
        requests.exceptions.RequestException: On network errors or timeout.
    """
    cas_file = f"{cas_number}-IR.jdx"
    # os.path.join tolerates DATASET_PATH with or without a trailing slash.
    target_path = os.path.join(DATASET_PATH, cas_file)

    # Direct existence check instead of listing the whole directory per call.
    if os.path.isfile(target_path):
        print(f'{cas_number}: IR Spectrum already downloaded')
        return 1

    url_target = (
        "https://webbook.nist.gov/cgi/cbook.cgi"
        f"?JCAMP=C{cas_number.replace('-', '')}&Index=0&Type=IR"
    )

    # The context manager releases the connection even when we raise below.
    with requests.get(url_target, timeout=timeout) as response:
        if response.status_code == 200:
            with open(target_path, 'wb') as data:
                data.write(response.content)
            print(f'{cas_number}: IR Spectrum downloaded')
        elif response.status_code == 404:
            print(f'{cas_number}: IR Spectrum not found')
        else:
            raise Exception(
                f'Error downloading IR spectrum: {response.status_code} - CAS:{cas_number}'
            )

    # Throttle consecutive requests to the server.
    sleep(REQUEST_SLEEP)

    return 0


def generate_cas_combinations(start1: int, end1: int, start2: int, end2: int, start3: int, end3: int) -> list:
    """Build every 'a-b-c' string from three half-open integer ranges.

    Each ``startN``/``endN`` pair is passed to ``range``, so the end value is
    excluded. The first segment varies slowest and the third fastest.
    """
    segments = itertools.product(
        range(start1, end1),
        range(start2, end2),
        range(start3, end3),
    )
    return [f"{first}-{second}-{third}" for first, second, third in segments]


# # Example usage
# cas_numbers = generate_cas_combinations(161, 513, 10, 99, 0, 10)
# for i, cas in enumerate(cas_numbers):
#     print(f"{i} - {i/len(cas_numbers)*100:.2f}")
#     download_nist_spectrum(cas)



# # Example usage
# cas_numbers = generate_cas_combinations(161, 513, 10, 99, 0, 10)
# for i, cas in enumerate(cas_numbers):
#     print(f"{i} - {i/len(cas_numbers)*100:.2f}")
#     download_nist_spectrum(cas)


# lista = ['15872-42-1',
#  '3222-47-7',
#  '10498-35-8',
#  '456-22-4',
#  '7435-83-8',
#  '122-78-1',
#  '76-12-0',
#  '55406-13-8',
#  '2525-16-8',
#  '93409-03-1',
#  '5370-25-2',
#  '95166-04-4']
# for i, cas in enumerate(lista):
#   print(f"{i} - {i/len(lista)*100:.2f}")
#   download_nist_spectrum(lista[-i])



# Report how many entries the dataset directory currently holds.
dataset_entry_count = len(os.listdir(DATASET_PATH))
print(f"Number of files in directory: {dataset_entry_count}")




  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m767.1 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.6/52.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.8/259.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for jcamp (setup.py) ... [?25l[?25hdone
[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m34.3/34.3 MB[0m [31m86.7 MB/s[0m eta [36m0:00:01[0m