# #4 Emissions Datathon - Permian, Mean Pixel values and CNN

### Prepare Dataframe

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os.path

# Data visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams.update({'figure.max_open_warning': 0})
from matplotlib.pyplot import *
import imageio
from skimage.io import imread

from tqdm import tqdm

import cv2   #For image processing

import tensorflow as tf
#All tensorflow utilities for creating, training and working with a CNN
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, BatchNormalization
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model



from sklearn.preprocessing import LabelEncoder    #For encoding categorical variables
from sklearn.model_selection import train_test_split #For splitting of data
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from skimage.io import imread, imshow
from skimage.color import rgb2gray, rgb2hsv, rgba2rgb
from skimage.morphology import area_opening
from skimage.exposure import histogram
from skimage.filters import threshold_otsu
from skimage import io, color

In [2]:
# Folder that holds the training .tif images (searched recursively by the glob in the next cell).
# NOTE(review): absolute, machine-specific path - it has to be edited before the notebook can run elsewhere.
image_train = Path('C:/Users/vasan/OneDrive/EmissionsDatathon/Dataset/Data/Train2a')

# NOTE(review): dir_name and filename_suffix do not appear to be referenced anywhere else in this notebook - confirm before relying on them.
dir_name = r'C:\Users\vasan\OneDrive\EmissionsDatathon\Zeroing Methane Emissions - Dataset'
filename_suffix = 'csv'

In [3]:
# Recursively collect every .tif file below the training folder and keep the
# paths as plain strings in a Series named 'Filepath'.
tif_files = list(image_train.glob(r'**/*.tif'))
filepaths = pd.Series(tif_files, name='Filepath').astype(str)

In [4]:
filepaths.head()

0    C:\Users\vasan\OneDrive\EmissionsDatathon\Data...
1    C:\Users\vasan\OneDrive\EmissionsDatathon\Data...
2    C:\Users\vasan\OneDrive\EmissionsDatathon\Data...
3    C:\Users\vasan\OneDrive\EmissionsDatathon\Data...
4    C:\Users\vasan\OneDrive\EmissionsDatathon\Data...
Name: Filepath, dtype: object

In [5]:
df_filepaths = filepaths.to_frame()

In [6]:
# Derive each file name from the path stored on the same row, so 'Name' is
# guaranteed to describe that row's file. Pairing the recursive glob result
# with a separate os.listdir() call by position only lines up when both
# enumerate exactly the same files in exactly the same order, which breaks as
# soon as the folder holds sub-folders, non-.tif files or is listed differently.
df_filepaths['Name'] = df_filepaths['Filepath'].map(os.path.basename)

In [7]:
df_filepaths

Unnamed: 0,Filepath,Name
0,C:\Users\vasan\OneDrive\EmissionsDatathon\Data...,ang20200708t192518-3_r1823_c108_ctr.tif
1,C:\Users\vasan\OneDrive\EmissionsDatathon\Data...,ang20200708t192518-3_r1823_c108_rgb.tif
2,C:\Users\vasan\OneDrive\EmissionsDatathon\Data...,ang20200708t200101-1_r4784_c237_ctr.tif
3,C:\Users\vasan\OneDrive\EmissionsDatathon\Data...,ang20200708t200101-1_r4784_c237_rgb.tif
4,C:\Users\vasan\OneDrive\EmissionsDatathon\Data...,ang20200708t200101-A_r502_c-24_ctr.tif
...,...,...
5047,C:\Users\vasan\OneDrive\EmissionsDatathon\Data...,GAO20210810t190554p0000-A_r3689_c612_rgb.tif
5048,C:\Users\vasan\OneDrive\EmissionsDatathon\Data...,GAO20210810t191216p0000-A_r234_c287_ctr.tif
5049,C:\Users\vasan\OneDrive\EmissionsDatathon\Data...,GAO20210810t191216p0000-A_r234_c287_rgb.tif
5050,C:\Users\vasan\OneDrive\EmissionsDatathon\Data...,GAO20210810t192906p0000-A_r4031_c958_ctr.tif


In [8]:
# The candidate id is the part of the file name in front of the first
# underscore, e.g. 'ang20200708t192518-3_r1823_c108_ctr.tif' ->
# 'ang20200708t192518-3'. The vectorised .str accessor also passes missing
# names through as NaN instead of raising.
df_filepaths['candidate_id'] = df_filepaths['Name'].str.split('_', n=1).str[0]

In [9]:
# Discard every row whose file name contains 'rgb'; all other rows are kept
# with their original index labels.
is_rgb = df_filepaths['Name'].str.contains('rgb')
df_filepaths = df_filepaths.loc[~is_rgb]

In [10]:
df_filepaths.to_excel("all.xlsx")  

In [11]:
df_filepaths.shape

(2526, 3)

In [12]:
# Load the plume attribution table, parsing ',' as the thousands separator.
# The 'Unnamed' columns are kept: the filter that would drop them is disabled below.
skiprows = 0
df = pd.read_csv(
    "plume_attribution_2019-2022 Jeremy Zhao.csv",
    skiprows=skiprows,
    thousands=',',
)
#df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.head()

Unnamed: 0.1,Unnamed: 0,source_id,candidate_id,plume_lat,plume_lon,date,source_type,ipcc,qplume,sigma_qplume,time_of_detection,plume_tif,rgb_tif
0,0,A0001,GAO20210712t153850p0000-A,40.586588,-104.8233,2021-07-12 00:00:00,landfill,6A Solid Waste Disposal On Land,198.690714,56.089373,,,
1,1,A0002,GAO20210720t152559p0000-A,40.574605,-104.68813,2021-07-20 00:00:00,livestock,3A2 Manure Management,659.384534,215.976289,,,
2,2,A0002,GAO20210712t153850p0000-B,40.574387,-104.68855,2021-07-12 00:00:00,livestock,3A2 Manure Management,458.494606,167.196121,,,
3,3,A0002,GAO20210718t152410p0000-B,40.573827,-104.68812,2021-07-18 00:00:00,livestock,3A2 Manure Management,242.145473,140.004235,,,
4,4,A0003,GAO20210712t153850p0000-C,40.572121,-104.68871,2021-07-12 00:00:00,livestock,3A2 Manure Management,425.330806,171.220361,,,


In [13]:
df['candidate_id'].nunique()

7292

In [14]:
df.shape

(8752, 13)

In [15]:
# Rows whose candidate_id has already appeared further up in the table
# (the first occurrence of each id is not included).
repeated_id = df['candidate_id'].duplicated()
dup = df.loc[repeated_id]

In [16]:
#dup.to_excel("dup.xlsx")  

In [17]:
#df.to_excel("df.xlsx")  

In [18]:
df = df.drop_duplicates(subset='candidate_id', keep="first")

In [19]:
print(df.shape)
print(df_filepaths.shape)

(7292, 13)
(2526, 3)


In [20]:
# Attach the image path and file name to each plume record; the inner join
# keeps only the candidate ids present in both tables.
df2 = df.merge(df_filepaths, how='inner', on='candidate_id')

In [21]:
print(df2.shape)

(2525, 15)


In [22]:
df2.head()

Unnamed: 0.1,Unnamed: 0,source_id,candidate_id,plume_lat,plume_lon,date,source_type,ipcc,qplume,sigma_qplume,time_of_detection,plume_tif,rgb_tif,Filepath,Name
0,0,A0001,GAO20210712t153850p0000-A,40.586588,-104.8233,2021-07-12 00:00:00,landfill,6A Solid Waste Disposal On Land,198.690714,56.089373,,,,C:\Users\vasan\OneDrive\EmissionsDatathon\Data...,GAO20210712t153850p0000-A_r174_c3465_ctr.tif
1,1,A0002,GAO20210720t152559p0000-A,40.574605,-104.68813,2021-07-20 00:00:00,livestock,3A2 Manure Management,659.384534,215.976289,,,,C:\Users\vasan\OneDrive\EmissionsDatathon\Data...,GAO20210720t152559p0000-A_r380_c5956_ctr.tif
2,2,A0002,GAO20210712t153850p0000-B,40.574387,-104.68855,2021-07-12 00:00:00,livestock,3A2 Manure Management,458.494606,167.196121,,,,C:\Users\vasan\OneDrive\EmissionsDatathon\Data...,GAO20210712t153850p0000-B_r468_c6000_ctr.tif
3,3,A0002,GAO20210718t152410p0000-B,40.573827,-104.68812,2021-07-18 00:00:00,livestock,3A2 Manure Management,242.145473,140.004235,,,,C:\Users\vasan\OneDrive\EmissionsDatathon\Data...,GAO20210718t152410p0000-B_r406_c6016_ctr.tif
4,4,A0003,GAO20210712t153850p0000-C,40.572121,-104.68871,2021-07-12 00:00:00,livestock,3A2 Manure Management,425.330806,171.220361,,,,C:\Users\vasan\OneDrive\EmissionsDatathon\Data...,GAO20210712t153850p0000-C_r524_c5997_ctr.tif


In [23]:
df2['candidate_id'].nunique()

2525

In [None]:
df2=df2.dropna(subset=["qplume"])

In [None]:
df2.shape

In [None]:
# Three-standard-deviation band around the mean qplume value.
qplume_mean = df2['qplume'].mean()
qplume_std = df2['qplume'].std()
print("Upper limit", qplume_mean + 3*qplume_std)
print("Lower limit", qplume_mean - 3*qplume_std)

In [None]:
# Keep only the plumes whose qplume is below a fixed upper cut-off.
# NOTE(review): the cut-off is hard-coded; it presumably is the "Upper limit"
# (mean + 3*std) printed by the previous cell on an earlier run - confirm, and
# update it whenever the input data changes.
qplume_upper_limit = 2805.5444311201527
df2 = df2.loc[df2['qplume'] < qplume_upper_limit]
df2.shape

In [None]:
df2.to_excel("allfinal.xlsx")  

### Image Processing

In [None]:
def image_processing(fil, size=217):
    """Load an image and return the per-pixel mean of its first three channels.

    Channel values are truncated toward zero to whole numbers before they are
    averaged. The averaged image is written into the top-left corner of a
    zero-filled ``size`` x ``size`` matrix (so smaller images are zero-padded)
    and that matrix is returned flattened.

    Parameters
    ----------
    fil : str or path-like
        Image file handed to ``imread``.
    size : int, default 217
        Side length of the square feature matrix.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``size * size``.

    Raises
    ------
    IndexError
        If the image is not a (rows, cols, channels) array with at least three
        channels, or if it is larger than ``size`` x ``size``.
    """
    # Open the image and make sure we are working with a numpy array.
    image2 = np.asarray(imread(fil))

    if image2.ndim != 3 or image2.shape[2] < 3:
        raise IndexError(
            "expected a (rows, cols, channels) image with at least 3 channels, "
            "got shape {}".format(image2.shape)
        )

    rows, cols = image2.shape[:2]
    if rows > size or cols > size:
        raise IndexError(
            "image of {}x{} pixels does not fit into the {}x{} feature matrix".format(
                rows, cols, size, size
            )
        )

    # Only the first three channels take part in the mean (a 4th, e.g. alpha, is ignored).
    channels = image2[:, :, :3]
    if channels.dtype.kind == 'f':
        # Floating point pixels are truncated so the result matches an int() cast per value.
        channels = np.trunc(channels)

    # Convert to float64 before summing so narrow integer types (e.g. uint8) cannot overflow.
    feature_matrix = np.zeros((size, size))
    feature_matrix[:rows, :cols] = channels.astype(np.float64).sum(axis=2) / 3

    features = np.reshape(feature_matrix, (size * size))

    return features

In [None]:
#Test Function
#fil = "C:/Users/vasan/OneDrive/EmissionsDatathon/Dataset/Data/Train/ang20190922t192642_ang20190922t192642-2_r4578_c217-plume.png"
#image_processing(fil)

In [None]:
image_list = []

In [None]:
# Build the list from scratch inside this cell: appending to a list created in
# another cell adds a second copy of every image each time the cell is re-run.
image_list = [image_processing(file) for file in tqdm(df2['Filepath'].tolist())]

In [None]:
X = np.array(image_list)

In [None]:
X

In [None]:
np.save('processed_217x217_image.npy',X,allow_pickle=True)

In [None]:
mean = np.mean(X)
mean

### Preparing dataset for Model

In [None]:
Y = np.array(df2.qplume.tolist())

In [None]:
Y

In [None]:
print(X.shape)
print(Y.shape)

In [None]:
# Split the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Split the train set into train and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.25, random_state=42)

In [None]:
# Image geometry fed to the network: 217 x 217 pixels with one channel.
img_rows, img_cols = 217, 217
input_shape = (img_rows, img_cols, 1)

# Turn every flat feature vector back into an (img_rows, img_cols, 1) image.
X_train, X_test, X_val = (
    split.reshape(split.shape[0], img_rows, img_cols, 1)
    for split in (X_train, X_test, X_val)
)

X_val.shape

In [None]:
# NOTE(review): these three reshapes repeat the ones in the previous cell with
# the same target shape, so they leave the arrays unchanged once that cell has run.
X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 1)
X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 1)
X_val = X_val.reshape(X_val.shape[0], img_rows, img_cols, 1)

In [None]:
print(X_train.shape)
print(Y_train.shape)
print(X_val.shape)
print(Y_val.shape)
print(X_test.shape)
print(Y_test.shape)

### Model

In [None]:
# Convolutional neural network that maps one input_shape image to a single value.
learning_rate = 0.001

model = Sequential([
    # Block 1: two 3x3 convolutions with 16 filters, each followed by batch
    # normalisation, then max-pooling with stride 2 and dropout.
    Conv2D(filters=16, kernel_size=(3, 3), activation='relu',
           input_shape=input_shape),
    BatchNormalization(),
    Conv2D(filters=16, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    MaxPool2D(strides=(2, 2)),
    Dropout(0.25),

    # Block 2: same layout with 32 filters.
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    MaxPool2D(strides=(2, 2)),
    Dropout(0.25),

    # Fully connected head.
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.25),
    Dense(1024, activation='relu'),
    Dropout(0.4),

    # Single output unit for the regression target.
    # NOTE(review): a ReLU here clamps predictions at 0 and passes no gradient
    # while the pre-activation is negative - confirm this is preferred over a
    # linear output.
    Dense(1, activation='relu'),
])

# Mean squared error loss, Adam optimiser with the learning rate set above.
model.compile(loss='mse',
              optimizer=Adam(learning_rate))

model.summary()

In [None]:
save_at = "model_regression2.hdf5"
# The model is compiled with an MSE loss and no extra metrics, so Keras never
# logs 'val_accuracy'; monitoring it means the callback cannot find the value
# and the "best" model is never written. Track the validation loss instead,
# where lower is better.
save_best2 = ModelCheckpoint(
    save_at,
    monitor='val_loss',
    mode='min',
    verbose=0,
    save_best_only=True,
    save_weights_only=False,
)

In [None]:
np.isnan(X).any()

In [None]:
np.isnan(Y).any()

In [None]:
#Train the CNN

history = model.fit( X_train, Y_train, 
                    epochs = 15, batch_size = 100, 
                    callbacks=[save_best2], verbose=1, 
                   validation_data = (X_val, Y_val))

In [None]:
# Training vs. validation loss per epoch; labelled so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(history.history['loss'], color='r', label='Training loss')
ax.plot(history.history['val_loss'], color='g', label='Validation loss')
ax.set(title='Training history', xlabel='Epoch', ylabel='Loss (MSE)')
ax.legend()
plt.show()

### Prediction

In [None]:
Y_pred = np.round(model.predict(X_test))

In [None]:
# Mean squared error on the test split and its square root (RMSE).
mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

# R^2 score of the predictions on the test split.
r2 = r2_score(Y_test, Y_pred)
print(f"Test R^2 Score: {r2:.5f}")

### Submission

In [None]:
image_sub = []

In [None]:
image_test = Path('C:/Users/vasan/OneDrive/EmissionsDatathon/Dataset/Data/Test')

In [None]:
testpaths = pd.Series(list(image_test.glob(r'**/*.png')), name='Testpath').astype(str)

In [None]:
df_testpaths = testpaths.to_frame()

In [None]:
# Take each file name from the path on the same row. A separate os.listdir()
# call is only aligned with the recursive glob when both list exactly the same
# files in the same order, and a mismatch would silently attach predictions to
# the wrong file names.
df_testpaths['Name'] = df_testpaths['Testpath'].map(os.path.basename)

In [None]:
df_testpaths

In [None]:
# Build the list from scratch inside this cell: appending to a list created in
# another cell adds a second copy of every image each time the cell is re-run.
image_sub = [image_processing(file) for file in tqdm(df_testpaths['Testpath'].tolist())]

In [None]:
X_sub = np.array(image_sub)

In [None]:
np.save('sub_image.npy',X_sub,allow_pickle=True)

In [None]:
X_test

In [None]:
# Give the flat feature vectors the (samples, img_rows, img_cols, 1) layout
# used for the training data; one reshape is sufficient.
X_sub = X_sub.reshape(X_sub.shape[0], img_rows, img_cols, 1)
X_sub

In [None]:
Y_sub = np.round(model.predict(X_sub))

In [None]:
Y_sub

In [None]:
np.savetxt('07_Tifs_CNN_Outliers2.csv', Y_sub, delimiter=',')

In [None]:
X_sub.shape