In [None]:
#importing important libraries
import requests
import os
from tqdm import tqdm
import pandas as pd
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse
import matplotlib.pyplot as plt
import numpy as np
from google.colab import files
import cv2

In [None]:
def is_valid(url):
    """Tell whether `url` is absolute.

    A URL is accepted only when parsing it yields both a netloc (domain name)
    and a scheme (protocol).

    Args:
        url: the URL string to check.

    Returns:
        bool: True when both components are non-empty, False otherwise.
    """
    parts = urlparse(url)
    if not parts.netloc:
        return False
    return bool(parts.scheme)

In [None]:
def get_url_from_day_year(year, day):
    """Build the archive directory URL for one year and day.

    Args:
        year: year component; converted with str().
        day: day component; converted with str() as given (no zero padding is applied).

    Returns:
        str: the DATA directory URL ending in "<year>_<day>/".
    """
    archive_root = "https://pdsimage2.wr.usgs.gov/archive/mess-e_v_h-mdis-2-edr-rawdata-v1.0/MSGRMDS_1001/DATA/"
    return f"{archive_root}{year}_{day}/"

In [None]:
def get_all_images(url):
    """Collect the absolute URLs of all links on a page whose href contains "IMG".

    The page at `url` is downloaded and parsed with the "html.parser" backend.
    Every <a> tag is inspected; hrefs containing the substring "IMG" are made
    absolute against `url` and kept when they pass is_valid.

    Args:
        url: address of the page (a year/day directory listing) to scrape.

    Returns:
        list of str: matching absolute URLs, in the order they appear on the page.

    Raises:
        requests.exceptions.RequestException: if the download fails or times out.
    """
    # A timeout keeps the notebook from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=30)
    soup = bs(response.content, "html.parser")

    img_marker = "IMG"  # substring that identifies links to .IMG data files
    urls = []

    # tqdm is used to see the progress of the loop
    for anchor in tqdm(soup.find_all("a"), "Extracting images"):
        href = anchor.attrs.get("href")
        # <a> tags without an href give None; testing `"IMG" in None` would raise TypeError.
        if not href or img_marker not in href:
            continue
        # make the URL absolute by joining it with the page URL, then validate it
        absolute_url = urljoin(url, href)
        if is_valid(absolute_url):
            urls.append(absolute_url)
    return urls

In [None]:
def images_list(year, day):
    """Return the URLs of all .IMG links found on the page for one year and day.

    Args:
        year: year passed on to get_url_from_day_year.
        day: day passed on to get_url_from_day_year.

    Returns:
        list: whatever get_all_images returns for the generated URL.
    """
    return get_all_images(get_url_from_day_year(year, day))

In [None]:
def shape_function(data):
    """Return the 2-D shape to use when reshaping a flat array of `data`'s length.

    Only four lengths are supported; each maps to a fixed shape whose product
    equals that length.

    Args:
        data: any sized object (only len(data) is used).

    Returns:
        tuple of (int, int): the shape associated with len(data).

    Raises:
        ValueError: if len(data) is not one of the four supported lengths.
            (Previously the function fell through and returned None, which
            only surfaced later as a confusing failure or a silently wrong array.)
    """
    known_shapes = {
        134656: (526, 256),
        527872: (1031, 512),
        1052672: (1028, 1024),
        528384: (688, 768),
    }
    n_values = len(data)
    try:
        return known_shapes[n_values]
    except KeyError:
        raise ValueError(
            "unsupported data length %d; supported lengths are %s"
            % (n_values, sorted(known_shapes))
        ) from None


### **The data from the previous notebook will be used to make models and predict their efficiency**
### **the dataset consists of only 4 days data of year 2011, from 4th June to 7th June.**

The classification .csv files can be found here: https://github.com/VIDIT-OSTWAL/CRISMIS

In [None]:
# Load the four classification tables. The files have no header row, so the two
# column names are supplied explicitly.
csv, csv1, csv2, csv3 = (
    pd.read_csv(
        csv_path,
        header=None,
        index_col=False,
        names=['File Name', 'Classification'],
    )
    for csv_path in (
        "/content/url.csv",
        "/content/url1.csv",
        "/content/url2.csv",
        "/content/url3.csv",
    )
)

In [None]:
# Build the list of .IMG URLs for each of the four days of 2011 (days 155 to 158).
images_2011_155, images_2011_156, images_2011_157, images_2011_158 = (
    images_list(2011, day_of_year) for day_of_year in (155, 156, 157, 158)
)

# Element type used when reading the raw files: big-endian unsigned 16-bit integer.
dtype = np.dtype('>u2')

Extracting images: 100%|██████████| 163/163 [00:00<00:00, 32398.42it/s]
Extracting images: 100%|██████████| 80/80 [00:00<00:00, 26942.69it/s]
Extracting images: 100%|██████████| 123/123 [00:00<00:00, 33644.15it/s]
Extracting images: 100%|██████████| 184/184 [00:00<00:00, 23686.45it/s]


In [None]:
def making_list_of_data(path_name, list_of_images, list_of_data, list_of_classification, csv):
    """Load each .IMG file, resize it to 128x128 and record its classification.

    For every URL in `list_of_images`, the file with the same base name is read
    from under `path_name`, reshaped with shape_function, resized to (128, 128)
    using cv2.INTER_CUBIC and appended to `list_of_data`. The 'Classification'
    value of the row in `csv` whose 'File Name' equals that base name is
    appended to `list_of_classification`.

    Args:
        path_name: directory prefix (including the trailing separator) that is
            concatenated with the file name.
        list_of_images: iterable of URLs; only the last path component is used.
        list_of_data: list extended in place with the resized arrays.
        list_of_classification: list extended in place with integer labels.
        csv: DataFrame with 'File Name' and 'Classification' columns.

    Returns:
        None. Both output lists are modified in place.

    Raises:
        ValueError: if `csv` does not contain exactly one row for a file name.
    """
    for url in list_of_images:
        base_name = url.split("/")[-1]

        # `dtype` is the module-level numpy dtype defined alongside the URL lists.
        # The context manager guarantees the file handle is closed after reading.
        with open(path_name + base_name, 'rb') as fid:
            data = np.fromfile(fid, dtype)

        image = data.reshape(shape_function(data))  # flat samples -> 2-D matrix
        res = cv2.resize(image, dsize=(128, 128), interpolation=cv2.INTER_CUBIC)

        # Select the label explicitly: int() on a filtered Series is deprecated and
        # fails with an unclear error when there are zero or several matches.
        matches = csv.loc[csv['File Name'] == base_name, 'Classification']
        if len(matches) != 1:
            raise ValueError(
                "expected exactly one classification for %r, found %d"
                % (base_name, len(matches))
            )
        value = int(matches.iloc[0])

        # Append both together so the two lists can never get out of step.
        list_of_data.append(res)
        list_of_classification.append(value)


In [None]:
# Accumulators shared by all four calls below: every resized image array goes into
# list_of_resized_data and its classification into list_of_classification.
list_of_resized_data, list_of_classification = [], []

# Each call pairs one local image directory with one day's URL list and one label table.
making_list_of_data(
    path_name="/content/images/content/images/",
    list_of_images=images_2011_157,
    list_of_data=list_of_resized_data,
    list_of_classification=list_of_classification,
    csv=csv,
)
making_list_of_data(
    path_name="/content/images1/content/images1/",
    list_of_images=images_2011_156,
    list_of_data=list_of_resized_data,
    list_of_classification=list_of_classification,
    csv=csv1,
)
making_list_of_data(
    path_name="/content/images2/content/images2/",
    list_of_images=images_2011_155,
    list_of_data=list_of_resized_data,
    list_of_classification=list_of_classification,
    csv=csv2,
)
making_list_of_data(
    path_name="/content/images3/",
    list_of_images=images_2011_158,
    list_of_data=list_of_resized_data,
    list_of_classification=list_of_classification,
    csv=csv3,
)


In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPool2D,Flatten,InputLayer, BatchNormalization
from keras.utils import np_utils
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Convert the two accumulated Python lists into numpy ndarrays
# (image arrays first, then their classifications).
new_array, new_classification = (
    np.array(list_of_resized_data),
    np.array(list_of_classification),
)

In [None]:
# Flatten every 128x128 image into a 1-D vector of 128*128 = 16384 values.
# The number of samples is taken from the array itself instead of being
# hard-coded (it was 530), so the cell keeps working when the dataset changes.
new_array = new_array.reshape(new_array.shape[0], 128 * 128)
new_classification = new_classification.astype('float32')

# Train and test datasets are formed with a test_size of 0.3; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    new_array, new_classification, test_size=0.3, random_state=42
)

# *A SIMPLE NEURAL NETWORK*

In [None]:
# Building a simple fully connected neural network for prediction.
model = Sequential([
    # hidden layer, fed with the flattened 128*128 image vector
    Dense(100, input_shape=(16384,), activation='relu'),
    # output layer
    Dense(10, activation='softmax'),
])
model.summary()

# Compile the sequential model.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 30 epochs, reporting metrics on the held-out split after each epoch.
# NOTE(review): the inputs are passed in unscaled; confirm whether normalisation was intended.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=30,
    validation_data=(X_test, y_test),
)

Model: "sequential_27"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_61 (Dense)             (None, 100)               1638500   
_________________________________________________________________
dense_62 (Dense)             (None, 10)                1010      
Total params: 1,639,510
Trainable params: 1,639,510
Non-trainable params: 0
_________________________________________________________________
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fbe4e615510>

# The NN model : the accuracy of this model (at the time of running)

# acc = 0.7505

# val_acc = 0.8365
# However, the loss function did not converge (the code was run multiple times); therefore this model's accuracy and predictions cannot be trusted.

## **A SIMPLE CONVOLUTIONAL NEURAL NETWORK**

In [None]:
# Rebuild the arrays from the original lists, because the earlier cell flattened new_array.
new_classification = np.array(list_of_classification).astype('float32')

# Reshape to (number_of_examples, 128, 128, 1): a trailing single-channel axis for Conv2D.
new_array = np.array(list_of_resized_data)
new_array = new_array.reshape((new_array.shape[0], 128, 128, 1))

# Same 70/30 split and seed as used for the dense network.
X_train, X_test, y_train, y_test = train_test_split(
    new_array,
    new_classification,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Building a linear stack of layers with the sequential model.
model = Sequential()
# convolutional layer: 25 filters of 3x3 over the single-channel 128x128 input
model.add(Conv2D(25, kernel_size=(3,3), strides=(1,1), padding='valid', activation='relu', input_shape=(128,128,1)))
# NOTE(review): a 1x1 pool leaves the feature map unchanged; kept to preserve the architecture.
model.add(MaxPool2D(pool_size=(1,1)))
# flatten output of conv
model.add(Flatten())
# hidden layer
model.add(Dense(100, activation='relu'))
# output layer: softmax (was sigmoid) so the 10 outputs form a probability
# distribution, as sparse_categorical_crossentropy expects, and so this model
# matches the output layer of the simple dense network above.
model.add(Dense(10, activation='softmax'))

In [None]:
# Compile the sequential model: Adam optimizer, sparse categorical
# cross-entropy loss, accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the model for 30 epochs in batches of 32, validating on the held-out split.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=30,
    validation_data=(X_test, y_test),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fbe4812f810>

# A simple CNN: the training loss has converged to a great extent; the accuracy on the training dataset is 0.9986 and on the test dataset is around 0.8365
The loss function on the test dataset has not converged to the same extent, so some improvement is required.




# *A DEEP CONVOLUTIONAL NEURAL NETWORK*

In [None]:
# Build a sequential model taking single-channel 128x128 images.
model = Sequential()
model.add(InputLayer(input_shape=(128, 128, 1)))

# 1st conv block
model.add(Conv2D(25, (5, 5), activation='relu', strides=(1, 1), padding='same'))
model.add(MaxPool2D(pool_size=(2, 2), padding='same'))
# 2nd conv block
model.add(Conv2D(50, (5, 5), activation='relu', strides=(2, 2), padding='same'))
model.add(MaxPool2D(pool_size=(2, 2), padding='same'))
model.add(BatchNormalization())
# 3rd conv block
model.add(Conv2D(70, (3, 3), activation='relu', strides=(2, 2), padding='same'))
model.add(MaxPool2D(pool_size=(2, 2), padding='valid'))
model.add(BatchNormalization())
# ANN block: two dense layers followed by dropout
model.add(Flatten())
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=100, activation='relu'))
model.add(Dropout(0.25))
# output layer: softmax (was sigmoid) so the 10 outputs form a probability
# distribution, as sparse_categorical_crossentropy expects for integer labels.
model.add(Dense(units=10, activation='softmax'))

# compile model
model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
# fit on data for 30 epochs (Keras' default batch size applies since none is given)
model.fit(X_train,y_train, epochs=30, validation_data=(X_test,y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fbe45f344d0>

# The deep CNN has performed very well: the loss functions of both training and validation have converged to a great extent, and the accuracies are:
# train_acc = 0.9641
# val_acc =  0.9623

