### Table of contents
[Imports](#funcs)

[Parameters, filenames and directories](#params)

[Reading datasets](#readCatalogs)

[Training SOMs](#SOM_Train)

[Calculating SOM_info for dataset](#SOM_info)

[Building and saving maps](#SOM_maps)

[Adding outlier coefficients for specZ, ML and SED photoZ and quantErr](#outlCoeff)

[Adding occupation info](#addOccupation)

[Saving datasets with SOM_info](#save)

In [29]:
### import libraries ###
import time
import numpy as np
import minisom
import os
import pandas as pd
from func_SOM_read_write import createSOM, writeSOM, loadSOM
from func_add_SOM_info import addBMUWeights, addQuantErr, addCellAddress, calcOutlCoeff, addOccupation
from func_maps import buildMaps, writeMaps,readMaps

### Parameters, filenames and directories
<a id='params'>#params</a>

In [30]:
### Parameters of SOM experiment ###
# Name of the sub-folder (inside dirSOM) that collects all outputs of this run
nameOfExperiment = 'ex023_Euclid'

# Grid dimensions of the regular SOMs and of the "big" SOM
width, height = 25, 28
widthBig, heightBig = 64, 67

# Number of input features handed to createSOM
num_features = 10

# Training settings forwarded to createSOM / loadSOM
epochs = 6000
sigma = 5
learning_rate = 0.5
neighborhood_function = 'bubble'

In [43]:
### Files and directories ###
# Input catalogs, SOM output root and MLPQNA experiment folder
dirCatalogs = r'D:\Sources\COSMOS_photoZ\COSMOS_v3.0\catalogs\forPaper\v05\ForEuclid\SOMprepared'
dirSOM = r'D:\Sources\COSMOS_photoZ\COSMOS_v3.0\SOM\forArticle'
dirMLPQNA = r'D:\Sources\COSMOS_photoZ\COSMOS_v3.0\experimentsArticle\exp022'

# Catalog file names, all read from dirCatalogs
nameTrain = r'04_COSMOS_SpectrZ_QfFilter.csv'
nameTest = r'04_COSMOS_SpectrZ_QfFilter_Test.csv'
nameRun = r'03_COSMOS2015_QFnoKB_MagLimHalf.csv'
nameDeimos = r'05_DEIMOS_QFnoKB_MagLimHalf.csv'

In [39]:
### Create experiment folder ###
# os.makedirs creates any missing parent directories and, with exist_ok=True,
# does not raise when the experiment folder is already there (e.g. on a re-run).
os.makedirs(os.path.join(dirSOM,nameOfExperiment),exist_ok=True)
# UTC timestamp used as prefix for the files written by this run
timeName=time.strftime("%y%m%d_%H%M%S_", time.gmtime())

In [40]:
### Write parameters on log file ###
logName=timeName+'_log.txt'
# (label, value) pairs, written one per line as "label=value" in this order
logEntries=[
    ('Catalog',nameTrain),
    ('width',width),
    ('height',height),
    ('widthBig',widthBig),
    ('heightBig',heightBig),
    ('num_features',num_features),
    ('epochs',epochs),
    ('sigma',sigma),
    ('learning rate',learning_rate),
    ('neighborhood_function',neighborhood_function),
]
# 'a+' appends, so an existing log with the same name is extended rather than overwritten
with open(os.path.join(dirSOM,nameOfExperiment,logName), 'a+') as logFile:
    for label,value in logEntries:
        logFile.write(label+'='+str(value)+'\n')

In [41]:
### Column names for training and filtering ###
# Magnitude columns and their scaled counterparts (prefix 'sc_'), used as SOM inputs
mags=[
    'Ksmagap3','Ymagap3','Hmagap3','Jmagap3','Bmagap3',
    'Vmagap3','ipmagap3','rmagap3','umagap3','zppmagap3',
]
magsScaled=[f'sc_{band}' for band in mags]

# Identifier column
idCol='Seq'

# Redshift, residual and quantization-error column names
specZ,photoZ_ML,photoZ_SED='specZ','photoZ_ML','photoZ_SED'
residML,residSED,residML_SED='resid_ML','resid_SED','residML_SED'
quantErr='quantErr'

### Reading datasets
<a id='readCatalogs'>#readCatalogs</a>

In [46]:
### Read catalogs ###
# Each catalog is a CSV file inside dirCatalogs; they are read in the order Train, Test, Run, Deimos
dataTrain,dataTest,dataRun,dataDeimos=(
    pd.read_csv(os.path.join(dirCatalogs,catalogName))
    for catalogName in (nameTrain,nameTest,nameRun,nameDeimos)
)

In [47]:
### Display the run catalog as the cell output ###
dataRun

Unnamed: 0,Seq,photoZ_ML,RAJ2000,DEJ2000,Ksmagap3,Ymagap3,Hmagap3,Jmagap3,Bmagap3,Vmagap3,...,sc_Ymagap3,sc_Hmagap3,sc_Jmagap3,sc_Bmagap3,sc_Vmagap3,sc_ipmagap3,sc_rmagap3,sc_umagap3,sc_zppmagap3,residML_SED
0,220670,0.396449,149.971258,1.615312,22.3456,23.5101,22.8016,23.2833,25.1657,24.0523,...,0.651865,0.638066,0.538012,0.701438,0.648406,0.676219,0.647998,0.872315,0.687985,0.678682
1,221608,1.014050,149.970113,1.616419,21.3686,21.9566,21.7029,21.7964,23.8352,23.2809,...,0.486358,0.518552,0.408260,0.542747,0.559347,0.562259,0.573620,0.490329,0.521703,0.003932
2,220921,0.336488,149.967881,1.616578,23.7668,23.3297,23.8549,23.4956,24.1986,23.6693,...,0.632645,0.752641,0.556538,0.586091,0.604189,0.660277,0.620353,0.547127,0.672768,0.138861
3,221761,0.473613,149.968147,1.616807,21.9670,22.0555,22.1155,22.1044,23.3971,22.8843,...,0.496894,0.563433,0.435137,0.490494,0.513560,0.547096,0.517356,0.428564,0.558668,0.051729
4,221071,0.230568,149.965476,1.615919,22.3136,22.4689,22.4339,22.4388,23.6958,22.8696,...,0.540937,0.598068,0.464318,0.526121,0.511863,0.564790,0.535716,0.531528,0.565960,-0.007836
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260677,989983,,149.452796,2.809922,24.7207,25.5497,25.2351,25.5254,26.4317,26.2508,...,0.869160,0.902775,0.733664,0.852437,0.902225,0.995897,0.926198,0.868173,0.915839,
260678,991002,,149.468440,2.810375,21.7339,24.9744,22.5880,23.4884,26.5244,26.1060,...,0.807869,0.614831,0.555909,0.863493,0.885507,0.905724,0.901875,0.851941,0.884172,
260679,990659,,149.471347,2.811030,24.8017,25.4291,24.1432,24.5163,26.2217,25.5876,...,0.856312,0.784001,0.645607,0.827390,0.825658,0.928685,0.887952,0.878830,0.926875,
260680,991869,,149.472103,2.812693,24.1457,24.9160,25.0463,25.1223,25.4838,25.3493,...,0.801647,0.882238,0.698489,0.739379,0.798146,0.870768,0.837874,0.713727,0.834604,


### Training SOMs
<a id='SOM_Train'>#SOM_Train</a>

In [48]:
### Train a SOM on the scaled magnitudes of the training catalog ###
data=np.array(dataTrain[magsScaled])
# createSOM is also given the file name under which the trained weights are saved
somTrainWeightsFile=os.path.join(dirSOM,nameOfExperiment,'somTrainWeights.txt')
somTrain=createSOM(
    data, epochs, height, width, num_features, sigma, learning_rate,
    neighborhood_function, saveWeightsName=somTrainWeightsFile,
)

In [49]:
### Train a SOM on the scaled magnitudes of the run catalog ###
data=np.array(dataRun[magsScaled])
# createSOM is also given the file name under which the trained weights are saved
somRunWeightsFile=os.path.join(dirSOM,nameOfExperiment,'somRunWeights.txt')
somRun=createSOM(
    data, epochs, height, width, num_features, sigma, learning_rate,
    neighborhood_function, saveWeightsName=somRunWeightsFile,
)

In [50]:
### Train the big SOM on the scaled magnitudes of the run catalog ###
# Build the input explicitly so this cell does not depend on `data` left over from another cell
data=np.array(dataRun[magsScaled])
# The other createSOM calls pass (height, width); the big map therefore uses
# (heightBig, widthBig). Previously widthBig was passed twice and heightBig was never used,
# although it is written to the log file.
somRunBig=createSOM(data, 10*epochs, heightBig, widthBig, num_features, sigma, learning_rate, 
                neighborhood_function, saveWeightsName=os.path.join(dirSOM,nameOfExperiment,'somRunBigWeights.txt'))

In [51]:
### Checking that after writing/loading cycle SOM weights were the same ###
soms=[somTrain,somRun,somRunBig]
somNames=['somTrainWeights.txt','somRunWeights.txt','somRunBigWeights.txt']
for som,somName in zip(soms,somNames):
    # Reload the weights saved for this SOM and compare them with the in-memory ones
    savedWeightsFile=os.path.join(dirSOM,nameOfExperiment,somName)
    somCheck=loadSOM(
        weightsFile=savedWeightsFile,
        sigma=sigma,
        learning_rate=learning_rate,
        neighborhood_function=neighborhood_function,
        random_seed=10,
    )
    # Largest absolute difference between original and reloaded weights
    weightsDiff=som.get_weights()-somCheck.get_weights()
    print(abs(weightsDiff).max())

4.99711383383783e-13
4.997668945350142e-13
4.999889391399392e-13


### Calculating SOM_info for dataset
<a id='SOM_info'>#SOM_info</a>

In [52]:
### Adding weights of BMU for each galaxy in a dataset ###
# All four catalogs are processed with somTrain, in the order Train, Test, Deimos, Run
dataTrain,dataTest,dataDeimos,dataRun=(
    addBMUWeights(somTrain,catalog,magsScaled)
    for catalog in (dataTrain,dataTest,dataDeimos,dataRun)
)

In [53]:
### Adding quantization errors for each galaxy ###
# Same call for every catalog, in the order Train, Test, Deimos, Run; colWeights is left as None
dataTrain,dataTest,dataDeimos,dataRun=(
    addQuantErr(catalog,magsScaled,colWeights=None)
    for catalog in (dataTrain,dataTest,dataDeimos,dataRun)
)

In [54]:
### Adding cellIDs for each galaxy ###
soms=[somTrain,somRun,somRunBig]
prefixes=['_TrainSOM','_RunSOM','_RunBigSOM']
for som,prefix in zip(soms,prefixes):
    # For each SOM, address every catalog (Train, Test, Deimos, Run) using that SOM's prefix
    dataTrain,dataTest,dataDeimos,dataRun=(
        addCellAddress(som, catalog, magsScaled, idCol,cellIDPrefix=prefix)
        for catalog in (dataTrain,dataTest,dataDeimos,dataRun)
    )

### Building and saving maps
<a id='SOM_maps'>#SOM_maps</a>

In [55]:
### Build maps of every dataset on every SOM ###
mapsAll={}

# (label used in the mapsAll key, SOM object, cellID column belonging to that SOM)
somSpecs=[
    ('Train',somTrain,'cellID_TrainSOM'),
    ('Run',somRun,'cellID_RunSOM'),
    ('RunBig',somRunBig,'cellID_RunBigSOM'),
]

# Train / Deimos / Test catalogs: maps of SED photoZ, specZ, SED residual and quantErr
#mapCols=[photoZ_SED,photoZ_ML,specZ,residSED,residML,residML_SED,'quantErr']
mapCols=[photoZ_SED,specZ,residSED,'quantErr']
for somLabel,som,cellID in somSpecs:
    for dataLabel,catalog in (('Train',dataTrain),('Deimos',dataDeimos),('Test',dataTest)):
        mapsAll['maps'+dataLabel+'On'+somLabel]=buildMaps(som,catalog,mapCols,idCol,cellID=cellID)

# Run catalog: maps of SED photoZ, ML photoZ, ML-SED residual and quantErr
mapCols=[photoZ_SED,photoZ_ML,residML_SED,'quantErr']
for somLabel,som,cellID in somSpecs:
    mapsAll['mapsRunOn'+somLabel]=buildMaps(som,dataRun,mapCols,idCol,cellID=cellID)

In [56]:
### Saving maps ###
dirMaps='maps'
# makedirs with exist_ok=True creates the folder (and missing parents) without
# raising when it already exists, so the cell can be re-run.
os.makedirs(os.path.join(dirSOM,nameOfExperiment,dirMaps),exist_ok=True)
# Each entry of mapsAll is written with its key as the file-name prefix
for key,val in mapsAll.items():
    writeMaps(val,prefixName=os.path.join(dirSOM,nameOfExperiment,dirMaps,key))

### Adding outlier coefficients for specZ, ML and SED photoZ and quantErr
<a id='outlCoeff'>#outlCoeff</a>

In [58]:
### Outlier coefficients for Train, Test and Deimos, based on the Train-on-Train maps ###
filterCols=[specZ,photoZ_SED,quantErr]
maps=mapsAll['mapsTrainOnTrain']
cellID='cellID_TrainSOM'
for col in filterCols:
    # Std and mean maps of this column are shared by the three catalogs
    stdMap=maps['stdMaps'][col]
    meanMap=maps['meanMaps'][col]
    dataTrain,dataTest,dataDeimos=(
        calcOutlCoeff(catalog,stdMap,meanMap,col,cellID,prefix='')
        for catalog in (dataTrain,dataTest,dataDeimos)
    )

In [59]:
### Outlier coefficients for the run catalog, based on the Train-on-Train maps ###
filterCols=[photoZ_SED,'quantErr']
maps=mapsAll['mapsTrainOnTrain']
cellID='cellID_TrainSOM'
for col in filterCols:
    stdMap=maps['stdMaps'][col]
    meanMap=maps['meanMaps'][col]
    dataRun=calcOutlCoeff(dataRun,stdMap,meanMap,col,cellID,prefix='')

### Adding occupation info
<a id='addOccupation'>#addOccupation</a>

In [60]:
### Adding occupation info from the 'activMap' of the Train-on-RunBig maps ###
cellID='cellID_RunBigSOM'
activMap=mapsAll['mapsTrainOnRunBig']['activMap']
# Assign every result back to its own name: rebinding a loop variable
# (`for dataset in datasets: dataset=...`) leaves dataTrain, dataTest, dataDeimos
# and dataRun pointing at the old objects, so the returned frames would be discarded.
dataTrain=addOccupation(dataTrain,cellID,activMap)
dataTest=addOccupation(dataTest,cellID,activMap)
dataDeimos=addOccupation(dataDeimos,cellID,activMap)
dataRun=addOccupation(dataRun,cellID,activMap)
# Kept for backward compatibility: the list now holds the updated frames
datasets=[dataTrain,dataTest,dataDeimos,dataRun]

### Saving datasets with SOM_info
<a id='save'>#save</a>

In [61]:
### Saving datasets with added data ###
dirDatasets='datasets'
# makedirs with exist_ok=True creates the folder (and missing parents) without
# raising when it already exists, so the cell can be re-run.
os.makedirs(os.path.join(dirSOM,nameOfExperiment,dirDatasets),exist_ok=True)
datasets=[dataTrain,dataTest,dataDeimos,dataRun]
names=[nameTrain,nameTest,nameDeimos,nameRun]
# Each catalog is written next to the SOM outputs under its input name with '_SOMinfo' appended
for dataset,name in zip(datasets,names):
    dataset.to_csv(os.path.join(dirSOM,nameOfExperiment,dirDatasets,name.replace('.csv','_SOMinfo.csv')),index=False)