# This notebook is mainly for downloading and viewing the images and creating the dataset



In [None]:
#importing important libraries
import requests
import os
from tqdm import tqdm
import pandas as pd
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse
import matplotlib.pyplot as plt
import numpy as np
from google.colab import files
import cv2

In [None]:
# checks the validity of the url
# url should have netloc(domain name) and scheme(protocol)
def is_valid(url):
    """Return True when *url* has both a scheme (protocol) and a netloc (domain name)."""
    parts = urlparse(url)
    if not parts.scheme:
        return False
    return bool(parts.netloc)

In [None]:
# makes the url of a particular year and day
def get_url_from_day_year(year, day):
    """Build the archive directory URL for the given *year* and *day*.

    The directory name appended to the archive base is "<year>_<day>/".
    """
    base = "https://pdsimage2.wr.usgs.gov/archive/mess-e_v_h-mdis-2-edr-rawdata-v1.0/MSGRMDS_1001/DATA/"
    return f"{base}{year}_{day}/"

In [None]:
# Returns a list of all the IMG files that are listed at the URL of a particular year and day.
# The input is the URL for that particular year and day.
def get_all_images(url):
    """Return the absolute URLs of all links on the page at *url* that contain "IMG".

    The page is fetched with requests and parsed with the html parser. Every
    <a> tag whose href contains the substring "IMG" is made absolute against
    *url* and kept when it passes is_valid().

    Args:
        url: address of the directory listing page to scan.

    Returns:
        A list of absolute URL strings, in page order.
    """
    soup = bs(requests.get(url).content, "html.parser")
    # substring used to filter href links that point to .IMG format data
    img_checker = "IMG"
    urls = []

    # tqdm is used to see the progress of the loop
    for anchor in tqdm(soup.find_all("a"), "Extracting images"):
        href = anchor.attrs.get("href")
        # <a> tags without an href give None here; testing "IMG" in None
        # would raise a TypeError, so such anchors are skipped
        if not href or img_checker not in href:
            continue
        # make the URL absolute by joining the page URL with the extracted href
        absolute_url = urljoin(url, href)
        if is_valid(absolute_url):
            urls.append(absolute_url)
    return urls

In [None]:
# return a list of urls of all .IMG format image data files of a particular year and day.
# here input is year,day

def images_list(year, day):
    """Return the URLs of all .IMG image data files listed for *year* and *day*."""
    return get_all_images(get_url_from_day_year(year, day))

In [None]:
# download function download an image and save it at the pathname
# url of a particular image file should be given
def download(url, pathname):
    """Download the file at *url* and save it inside the directory *pathname*.

    The directory is created when it does not exist. The file is stored under
    the last path component of *url* and is streamed in 1024-byte chunks
    while a tqdm progress bar counts the bytes written.

    Args:
        url: address of a single file to download.
        pathname: directory in which the file is saved.
    """
    # exist_ok avoids the check-then-create race of isdir() followed by makedirs()
    os.makedirs(pathname, exist_ok=True)
    # the file keeps the name it has in the url
    filename = os.path.join(pathname, url.split("/")[-1])

    # stream=True downloads the body by chunk, not immediately; the context
    # manager releases the connection once the body has been read
    with requests.get(url, stream=True) as response:
        # total file size in bytes, 0 when the server does not report it
        file_size = int(response.headers.get("Content-Length", 0))
        # The bar is driven only by the manual update() calls below. Wrapping
        # the chunk iterator in tqdm as well would add one extra unit per
        # chunk on top of the byte count.
        with open(filename, "wb") as f, tqdm(
            desc=f"Downloading {filename}",
            total=file_size or None,  # unknown size: show a bar without a total
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
        ) as progress:
            for data in response.iter_content(1024):
                # write data read to the file
                f.write(data)
                # advance the bar by the number of bytes written
                progress.update(len(data))

In [None]:
# This function downloads all the image data files of a particular year and day
# and stores all of them at pathname.
def retrive_all_images(year, day, pathname):
    """Download every image data file listed for *year* and *day* into *pathname*."""
    for image_url in images_list(year, day):
        download(image_url, pathname)

In [None]:
# this function gives url of a particular image data file of a particular year and day
# particular image data file should be given with .IMG format.
def get_specific_url(year, day, file_name):
    """Return the first image URL of *year* and *day* that contains *file_name*.

    When no listed URL contains *file_name*, the message string
    " This file does not exist." is returned instead.
    """
    matching_urls = (
        candidate
        for candidate in images_list(year, day)
        if file_name in candidate
    )
    return next(matching_urls, " This file does not exist.")

In [None]:
# removes the .IMG from the file name 
def file_num(string):
    """Return the part of *string* that precedes its first ".".

    Used to strip the extension (e.g. ".IMG") from a file name. A name
    without any "." is returned unchanged instead of falling through and
    returning None.

    Args:
        string: file name, with or without an extension.

    Returns:
        The text before the first "." of *string*.
    """
    stem, _, _ = string.partition(".")
    return stem

In [None]:
# shape_function returns the shape needed for reshaping the array; only four data lengths are handled.
def shape_function(data):
    """Return the (rows, columns) shape matching the length of *data*.

    Only four lengths are known; any other length gives None.
    """
    # number of elements -> shape whose product equals that number
    known_shapes = {
        134656: (526, 256),
        527872: (1031, 512),
        1052672: (1028, 1024),
        528384: (688, 768),
    }
    return known_shapes.get(len(data))

In [None]:
# This function is almost the same as an earlier function, open_specific_image; the only change is that this function directly takes the URL of the image data file as input.
# The image can also be saved by uncommenting output_filename and plt.savefig().
def specific_image(file_path, url):
    """Download the image data file at *url* into *file_path*, plot it and return it.

    The file is read as big-endian unsigned 16-bit integers and reshaped to
    the shape that shape_function() gives for its length.

    Args:
        file_path: directory in which the downloaded file is stored.
        url: address of a single image data file.

    Returns:
        A tuple (data, image): the flat array read from the file and the
        same values reshaped to two dimensions.
    """
    download(url, file_path)

    # Parameters.
    input_filename = file_path + '/' + url.split("/")[-1]
    print(input_filename)
    dtype = np.dtype('>u2')  # big-endian unsigned integer (16bit)
    #output_filename = jpeg_path + '/' + str(file_num(url.split("/")[-1])) + '.JPG'
    #print(output_filename)

    # Reading. The with-block closes the file handle once the data is loaded,
    # so looping over many images does not leave open files behind.
    with open(input_filename, 'rb') as fid:
        data = np.fromfile(fid, dtype)
    shape = shape_function(data)  # matrix size
    image = data.reshape(shape)

    # Display.
    plt.figure(figsize=(8, 5), dpi=250)
    plt.imshow(image, cmap="gray")
    #plt.savefig(output_filename)
    plt.show()
    return data, image

In [None]:
# This function prints the maximum value in the image data, its location(s) and the mean of all the data in the image.
def maximum_index(image):
    """Print the maximum of *image*, every (row, column) where it occurs and the mean.

    Nothing is returned; the three values are printed on one line.
    """
    # overall maximum, taken from the per-row maxima
    peak = np.max([row.max() for row in image])
    # rows that do not contain the maximum are skipped without scanning them
    positions = [
        (row_index, column_index)
        for row_index, row in enumerate(image)
        if peak in row
        for column_index, value in enumerate(row)
        if peak == value
    ]
    print(peak, positions, np.mean(image))

In [None]:
# list of all urls of image data is created for year 2011 and day 155
images_2011_155 = images_list(2011,155)
# number of image urls found for that day
print(len(images_2011_155))

# each url is passed to specific_image, which downloads the file into
# /content/images and plots it; the reshaped image it returns is then passed
# to maximum_index, which prints the maximum, its positions and the mean
for urls in images_2011_155:
  data,image = specific_image("/content/images",urls)
  maximum_index(image)

# the list of urls is exported to /content/url.csv, written without index and without header row
url_dataframe = pd.DataFrame(images_2011_155)
url_dataframe.to_csv('/content/url.csv',index = False,header= False)

# the same is done for year 2011, days 156,157,158

In [None]:
# the four days images data are stored in four different folders images, images1, images2, images3
# and exported in 4 different .csv files named url,url1,url2,url3

In [None]:
# list of all urls of image data is created for year 2011 and day 156
images_2011_156 = images_list(2011,156)
# print() is required here: a bare len(...) expression that is not the last
# statement of the cell is evaluated and discarded without being shown
print(len(images_2011_156))
# every image of the day is downloaded into /content/images1, plotted,
# and its maximum, the maximum's positions and the mean are printed
for urls in images_2011_156:
  data,image = specific_image("/content/images1",urls)
  maximum_index(image)
# the list of urls is exported to /content/url1.csv, written without index and without header row
url_dataframe = pd.DataFrame(images_2011_156)
url_dataframe.to_csv('/content/url1.csv',index = False,header= False)

In [None]:
# list of all urls of image data is created for year 2011 and day 157
images_2011_157 = images_list(2011,157)
# number of image urls found for that day
print(len(images_2011_157))
# every image of the day is downloaded into /content/images2, plotted,
# and its maximum, the maximum's positions and the mean are printed
for urls in images_2011_157:
  data,image = specific_image("/content/images2",urls)
  maximum_index(image)
# the list of urls is exported to /content/url2.csv, written without index and without header row
url_dataframe = pd.DataFrame(images_2011_157)
url_dataframe.to_csv('/content/url2.csv',index = False,header= False)

In [None]:
# list of all urls of image data is created for year 2011 and day 158
images_2011_158 = images_list(2011,158)
# every image of the day is downloaded into /content/images3, plotted,
# and its maximum, the maximum's positions and the mean are printed
for urls in images_2011_158:
  data,image = specific_image("/content/images3",urls)
  maximum_index(image)

# the list of urls is exported to /content/url3.csv, written without index and without header row
url_dataframe = pd.DataFrame(images_2011_158)
url_dataframe.to_csv('/content/url3.csv',index = False,header= False)

In [None]:
# Based on the output above, all the data from 4 to 7 June 2011 have been classified by manually checking the images,
# using the highest value in each image's data and the location of that value.
# These classifications are saved in the previously exported files,
# which keep the same names as the exported ones: url, url1, url2, url3.

In [None]:
#the whole train and test data set has been made through data in these four .csv files

### **COSMIC RAY ARTIFACT - 1**
### **NOT AN ARTIFACT -  0**

### The .csv files with classifications can be found here: https://github.com/VIDIT-OSTWAL/CRISMIS

In [None]:
# The four classified .csv files are loaded, one DataFrame per day.
# header=None: the files have no header row, so the two columns are named
# here as 'File Name' and 'Classification'.
# index_col=False: the first column is not used as the index.
# NOTE(review): the name `csv` would shadow the standard-library csv module
# if that module were ever imported in this notebook.
csv = pd.read_csv("/content/url.csv",header = None,index_col=  False,names = ['File Name','Classification'])
csv1 = pd.read_csv("/content/url1.csv",header = None,index_col=  False,names = ['File Name','Classification'])
csv2= pd.read_csv("/content/url2.csv",header = None,index_col=  False,names = ['File Name','Classification'])
csv3 = pd.read_csv("/content/url3.csv",header = None,index_col=  False,names = ['File Name','Classification'])