In [1]:
from Swarm import Swarm
import Helper

import sys
import copy
import time
import argparse
import os

import numpy as np
from scipy import stats

from multiprocessing import Pool
from multiprocessing import cpu_count
from itertools import repeat
from tqdm import tqdm

from Helper import Write_Log

In [2]:
def lossFunction(tar,b,lossFunc = 0):
    """Compute a cost between candidate values ``b`` and target values ``tar``.

    Parameters
    ----------
    tar : array-like
        Target values; must broadcast against ``b``.
    b : array-like
        Candidate values.
    lossFunc : int, optional
        Selects the cost:
        1 -> per-row sum of squared residuals (axis 1) divided by ``len(b)``
             (labelled MSE);
        2 -> square root of the per-row sum of squared residuals (axis 1)
             (labelled RMSE, although no mean is taken);
        3 -> Huber loss with delta = 0.1, summed over every element;
        anything else -> sum of squared residuals over every element (SSE).

    Returns
    -------
    An array with one value per row for choices 1 and 2 (these reduce over
    axis 1, so the residual must have at least two dimensions), otherwise a
    scalar.
    """
    residual = b - tar

    if lossFunc == 1:
        # "MSE": the divisor is len(b), i.e. the size of b's first axis.
        return np.sum(residual**2, axis=1)/len(b)

    if lossFunc == 2:
        # "RMSE": root of the summed squares.
        return np.sqrt(np.sum(residual**2, axis=1))

    if lossFunc == 3:
        # Huber: quadratic below delta, linear at or above it.
        delta = 0.1
        magnitude = np.abs(residual)
        quadraticPart = .5*residual**2
        linearPart = delta*(magnitude-0.5*delta)
        return np.sum(np.where(magnitude < delta, quadraticPart, linearPart))

    # Default: SSE
    return np.sum(residual**2)

In [3]:
# Computes statistics of the current swarm (console printing is disabled)
def Print_Stats(swarm, contact, pointCount, i, outFilePtr, convFact):
    """Evaluate correlation and error figures for the swarm's global best.

    The values are computed and then discarded: the print statement that
    used to report them (id, iteration, cost, Pearson, Spearman, Spearman
    against column 2, error) is switched off, so the function has no visible
    output and returns None.

    Parameters
    ----------
    swarm : object exposing ``gBest``; ``gBest[2]`` is compared to ``contact``.
    contact : 2-D array; columns 2 and 3 are read.
    pointCount : number used as the divisor in the root-mean-square error.
    i : iteration index (only used by the disabled print).
    outFilePtr, convFact : concatenated into an output path string that is
        currently unused.
    """
    reconstructed = swarm.gBest[2]
    target = contact[:,3]

    pers = stats.pearsonr(reconstructed, target)
    spear = stats.spearmanr(reconstructed, target)
    # Correlation against column 2 of contact, the column the target is derived from.
    spearIF = stats.spearmanr(reconstructed, contact[:,2])

    squaredResiduals = (reconstructed-target)**2
    error = np.sqrt( (1/pointCount) * np.sum( squaredResiduals ) )

    # Built but never used while reporting is disabled.
    thisOutFilePtr = 'outputFolder/'+outFilePtr +str(convFact)
    

def Write_Stats(swarm, contact, outFilePtr):
    """Hand the first entry of the swarm's global best to Helper.Write_Output.

    Parameters
    ----------
    swarm : object exposing ``gBest``; only ``gBest[0]`` is written.
    contact : accepted for signature compatibility; not used here.
    outFilePtr : destination passed straight through to Helper.Write_Output.
    """
    bestEntry = swarm.gBest[0]
    Helper.Write_Output(outFilePtr, bestEntry)

# Runs the PSO loop until the cost stops changing or the iteration budget is spent
def One_Move(ittCount, swarm, contact, pointCount, threshold,  outFilePtr, convFact):
    """Iterate the swarm, checking for convergence every 1000 iterations.

    Every 1000th iteration (when the swarm already has a global best) the
    default cost of ``lossFunction`` between ``contact[:,3]`` and
    ``swarm.gBest[2]`` is compared with the value saved at the previous
    check. If the absolute change is below ``threshold`` the loop stops.

    Parameters
    ----------
    ittCount : maximum number of iterations.
    swarm : swarm object advanced by ``operation``.
    contact : 2-D array whose column 3 is the target.
    pointCount, outFilePtr, convFact : forwarded to ``Print_Stats``.
    threshold : minimum cost change between checks required to keep going.

    Returns
    -------
    int
        The iteration index at which the loop stopped: the index of the
        convergence check on early exit, ``ittCount - 1`` after a full run,
        or 0 when ``ittCount`` is 0. Both exit paths return a plain int;
        previously the early exit returned a ``(i, 0)`` tuple.
    """
    saveGBestCost = float('inf')
    # Bound before the loop so that ittCount == 0 returns instead of raising.
    i = 0

    for i in range(ittCount):
        if (i%1000 == 0) and (swarm.gBest is not None):
            error = lossFunction(contact[:,3],swarm.gBest[2])
            Print_Stats(swarm, contact, pointCount, i, outFilePtr, convFact)

            # Kept as ">= threshold / else" so a NaN cost still stops the loop.
            if (np.abs(saveGBestCost - error)) >= threshold:
                saveGBestCost = error
            else:
                return i

        operation(i, swarm)

    return i

# Performs a single PSO pass: Velocity calculation, update position, get new cost
def operation(i, swarm, maxItt=None):
    """Advance ``swarm`` by one iteration.

    Parameters
    ----------
    i : index of the current iteration.
    swarm : object providing ``Calc_Vel``, ``Update_Pos`` and ``Cost``.
    maxItt : total iteration budget passed to ``swarm.Calc_Vel``. When
        omitted, the module-level ``ittCount`` is used, which is what this
        function always did before the parameter existed.
    """
    if maxItt is None:
        # Fall back to the notebook-wide setting for existing callers.
        maxItt = ittCount
    swarm.Calc_Vel(maxItt, i)
    swarm.Update_Pos(i)
    swarm.Cost()

# Optimizes single swarm
def Optimize(inFilePtr, outFilePtr, convFact,constraint,points,zeroInd,lossFunc = 0):
    """Build and run one swarm for a single conversion factor.

    Column 2 of ``constraint`` is converted with ``1 / value**convFact`` and
    inserted as a new column 3, which becomes the target the swarm is
    compared against. Swarm settings come from the module-level
    ``randRange``, ``swarmSize``, ``ittCount`` and ``threshold``.

    Parameters
    ----------
    inFilePtr : not used inside this function.
    outFilePtr : forwarded to ``One_Move``.
    convFact : exponent applied to column 2 of ``constraint``.
    constraint : 2-D array with at least three columns.
    points : sized collection; only its length is used.
    zeroInd : forwarded to ``Swarm``.
    lossFunc : forwarded to ``Swarm``. The cost in the returned tuple is
        always computed with ``lossFunction``'s default choice.

    Returns
    -------
    tuple
        ``(pearson, spearman, cost, ittFin, swarm.id, swarm)``.
    """
    convertedColumn = 1.0 / (constraint[:,2]**convFact)
    target = np.insert(constraint,3, convertedColumn ,axis=1)
    nPoints = len(points)

    swarm = Swarm(target, nPoints, randVal=randRange, swarmSize=swarmSize,
                  zeroInd=zeroInd, lossFunc=lossFunc)

    ittFin = One_Move(ittCount, swarm, target, nPoints, threshold, outFilePtr, convFact)

    # Score the final global best against the target column.
    pearson = stats.pearsonr(swarm.gBest[2], target[:,3])[0]
    spearman = stats.spearmanr(swarm.gBest[2], target[:,3])[0]
    cost = lossFunction(target[:,3], swarm.gBest[2])

    return (pearson, spearman, cost, ittFin, swarm.id, swarm)

# Runs one optimization per alpha value in parallel and keeps the best result
def Par_Choice(inFilePtr, outFilePtr, alpha,lossFunc = 0):
    """Optimize a swarm for every alpha in a range and return the best one.

    Parameters
    ----------
    inFilePtr : input path handed to ``Helper.Read_Data``.
    outFilePtr : stem for output files, written under ``outputFolder/``.
    alpha : indexable ``(start, stop, step)``; each entry is truncated to int,
        expanded with ``range`` and divided by 100 to obtain the alphas.
    lossFunc : loss selector forwarded to ``Optimize``.

    Returns
    -------
    tuple
        The ``Optimize`` result with the highest entry 1 (Spearman
        correlation). The winning alpha is used for the output file but is
        not part of the returned tuple.

    Raises
    ------
    ValueError
        If the alpha range is empty, so there is nothing to optimize.
    """
    contact, points, zeroInd = Helper.Read_Data(inFilePtr, alpha)

    alphas = np.array(range(int(alpha[0]),int(alpha[1]),int(alpha[2])))/100
    if len(alphas) == 0:
        raise ValueError("alpha range " + str(list(alpha)) + " produced no values to optimize")

    pool = Pool(processes=PROC_COUNT)
    try:
        swarms = pool.starmap(Optimize,  zip(repeat(inFilePtr), repeat(outFilePtr),
                                             alphas,
                                             repeat(contact), repeat(points), repeat(zeroInd), repeat(lossFunc)))
    finally:
        # Release the worker processes even when a task raised.
        pool.close()
        pool.join()

    bestSwarm = None
    bestAlpha = None
    for thisAlpha, swarm in zip(alphas, swarms):
        print(str(swarm[-1]) + ' ' + str(swarm[1]))
        # Always derive from the unmodified contact array; re-inserting into an
        # already extended array would add one more column per alpha.
        thisContact = np.insert(contact,3, 1.0 / (contact[:,2]**thisAlpha) ,axis=1)
        thisOutFile = 'outputFolder/'+outFilePtr+"_alpha_"+str(thisAlpha)

        Write_Stats(swarm[-1], thisContact, thisOutFile)

        # A NaN score compares False against everything, so replace it explicitly;
        # otherwise a NaN first result could never be displaced.
        if (bestSwarm is None) or np.isnan(bestSwarm[1]) or (swarm[1] > bestSwarm[1]):
            bestSwarm = swarm
            bestAlpha = thisAlpha

    bestContact = np.insert(contact,3, 1.0 / (contact[:,2]**bestAlpha) ,axis=1)
    print(bestSwarm)
    Write_Stats(bestSwarm[-1], bestContact, 'outputFolder/'+outFilePtr+"_best")

    return bestSwarm

def Full_List( inputFilePtr, outFilePtr , alpha, lossFunc = 0):
    """Run ``Par_Choice`` once and report its headline numbers.

    Prints entries 0, 1 and 2 of the best result (labelled pearson, spearman
    and rmse) and returns a one-element list holding that result.

    Parameters
    ----------
    inputFilePtr, outFilePtr, alpha, lossFunc : forwarded unchanged to
        ``Par_Choice``.
    """
    best = Par_Choice( inputFilePtr, outFilePtr, alpha, lossFunc)

    pearson, spearman, cost = best[0], best[1], best[2]
    print("pearson:" + str(pearson) + " spearman:"+
          str(spearman) + " rmse:" + str(cost))

    return [best]

In [4]:
sys.setrecursionlimit(10000)
PROC_COUNT = cpu_count()  # size of the worker pool used by Par_Choice

# Max scaling factor. Needs to be optimized for each specific dataset.
# Use two values [one, two] to multithread through a range of those two values
# at an interval of 5000.
rangeSpace = []


# Arguments for running program
# python3 ParticleChromo3D.py <input_data> <other_parameter>

# Swarm settings, read as module-level globals by Optimize / operation.
randRange = 1.0       # passed to Swarm as randVal
swarmSize = 15        # passed to Swarm as swarmSize
ittCount = 30000      # iteration budget for One_Move
threshold = 0.000001  # minimum cost change between convergence checks

if len(rangeSpace) == 0:
    rangeSpace.append(20000)

# Two identical endpoints describe a single value, so collapse them.
# (The guard used to be "> 2", which skipped exactly the two-value case.)
if len(rangeSpace) >= 2 and (rangeSpace[0] == rangeSpace[1]):
    rangeSpace.pop()

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs('outputFolder', exist_ok=True)



# (start, stop, step) of the alpha sweep, scaled by 100 so range() can expand it.
theseAlphas = np.array([0.1, 2.0, 0.2])*100
theAlphas = np.array(range(int(theseAlphas[0]),int(theseAlphas[1]),int(theseAlphas[2])))/100



In [9]:
#Gm12878 tests
lossFunctionChoice = 0 #0 = sse 1 == MSE 2 = rmse 3 == Huber

nettimes = []   # wall-clock seconds per chromosome
allSpears = []  # best Spearman correlation per chromosome
for i in range(23):
    chromosomeRunning = i+1
    #1mb
    inFilePtr = '../input-and-models/Input/GM12878_input/KR_1mb/chr'+str(chromosomeRunning)+'_matrix.txt'
    #500kb
    #inFilePtr = '../input-and-models/Input/GM12878_input/KR_500kb/chr'+str(chromosomeRunning)+'_matrix.txt'
    outFilePtr = './lossFunc/chr'+str(chromosomeRunning)

    print(inFilePtr)

    # Normalise whitespace: collapse every run of blanks/tabs to a single space
    # and write the result next to the input as "<input>.stripped".
    fout = inFilePtr + ".stripped"
    with open(inFilePtr, "r") as f:
        clean_lines = [" ".join(l.split()) for l in f]

    with open(fout, "w") as f:
        f.write('\n'.join(clean_lines))


    start = time.time()
    print("Seconds since epoch =", start)
    outputOfSwarm = Full_List( fout, outFilePtr, theseAlphas, lossFunctionChoice)[0]
    print(outputOfSwarm)

    bestSpearm = outputOfSwarm[1]
    bestCost = outputOfSwarm[2]
    # NOTE(review): outputOfSwarm[4] is the swarm.id returned by Optimize, not the
    # position of the winning alpha in theAlphas. Par_Choice determines the winning
    # alpha but does not return it, so the value reported here may be the wrong
    # alpha -- TODO confirm and return the alpha explicitly.
    bestAlpha = theAlphas[outputOfSwarm[4]]
    bestPearsonRHO = outputOfSwarm[0]


    print("Input file: ", inFilePtr)
    print("Convert factor:: ",bestAlpha)
    print("SSE at best spearman : ", bestCost)    
    print("Best Spearman correlation Dist vs. Reconstructed Dist  : ", bestSpearm) 
    print("Best Pearson correlation Dist vs. Reconstructed Dist: ", bestPearsonRHO) 
    Write_Log("outputFolder/bestAlpha.log", inFilePtr, bestAlpha, bestCost, bestSpearm, bestPearsonRHO)
    net = time.time() - start
    allSpears.append(bestSpearm)
    nettimes.append(net)
    print("time : ", net)
    
    print("All spears : ", allSpears)
    

../input-and-models/Input/GM12878_input/KR_1mb/chr1_matrix.txt
Seconds since epoch = 1619120024.3756342
<Swarm.Swarm object at 0x7f18e212a400> 0.8573973187699042
<Swarm.Swarm object at 0x7f18e212a6a0> 0.9521523660699509
<Swarm.Swarm object at 0x7f18e212ab20> 0.9458222472368295
<Swarm.Swarm object at 0x7f18e212a280> 0.9045852615310793
<Swarm.Swarm object at 0x7f18e212a5b0> 0.8523306654866163
<Swarm.Swarm object at 0x7f18e212a610> 0.598444334233421
<Swarm.Swarm object at 0x7f18e212aa00> 0.17688266359190125
<Swarm.Swarm object at 0x7f18e212aa30> 0.08110163006534014
<Swarm.Swarm object at 0x7f18e212ac70> 0.006790852750826738
<Swarm.Swarm object at 0x7f18e213e1f0> 0.018693143938792605
(0.9425918848282379, 0.9521523660699509, 10.752211287167404, 29999, 1, <Swarm.Swarm object at 0x7f18e212a6a0>)
pearson:0.9425918848282379 spearman:0.9521523660699509 rmse:10.752211287167404
(0.9425918848282379, 0.9521523660699509, 10.752211287167404, 29999, 1, <Swarm.Swarm object at 0x7f18e212a6a0>)
Input file

In [10]:
# List the collected Spearman correlations, one per line, then the run times.
for spearmanValue in allSpears:
    print(spearmanValue)

print("Now times")
for elapsedSeconds in nettimes:
    print(elapsedSeconds)

0.9521523660699509
0.9514230532932451
0.9610229440479183
0.9773926883625426
0.9722437552731868
0.9558532164075412
0.9376857437937869
0.9534576278559531
0.9501577734446012
0.9521615142315266
0.9566894875822718
0.9546079641450866
0.9607251049273743
0.9496579094582492
0.9504718395403087
0.9318021225262682
0.9354824159487267
0.9315077408760541
0.9392267847258806
0.9422963688104998
0.9626299790773474
0.9491184455107268
0.9780755843837936
Now times
303.1542954444885
344.3154311180115
188.94543433189392
175.25258612632751
146.38338112831116
123.04706406593323
98.02168703079224
64.45956301689148
50.52085518836975
46.21173167228699
45.44349813461304
43.97530913352966
25.978116512298584
23.245100021362305
21.665141105651855
21.16706919670105
20.73592734336853
19.839895009994507
13.773454904556274
14.638070583343506
10.502585887908936
9.809245347976685
79.83802843093872


In [5]:
#Synthetic data set
lossFunctionChoice = 3 #0 = sse 1 == MSE 2 = rmse 3 == Huber

allSpears = [] 

inFilePtr = '../input-and-models/Input/Synthetic/chainDres5_Matrix_noise000.txt'
#inFilePtr = '../input-and-models/Input/GM12878_input/KR_1mb/chr'+str(chromosomeRunning)+'_matrix.txt'
outFilePtr = './lossFunc/synth_loss_mse'

print(inFilePtr)

# Normalise whitespace: collapse every run of blanks/tabs to a single space
# and write the result next to the input as "<input>.stripped".
fout = inFilePtr + ".stripped"
with open(inFilePtr, "r") as f:
    clean_lines = [" ".join(l.split()) for l in f]

with open(fout, "w") as f:
    f.write('\n'.join(clean_lines))


start = time.time()
print("Seconds since epoch =", start)
outputOfSwarm = Full_List( fout, outFilePtr, theseAlphas, lossFunctionChoice)[0]
print(outputOfSwarm)

bestSpearm = outputOfSwarm[1]
bestCost = outputOfSwarm[2]
# NOTE(review): outputOfSwarm[4] is the swarm.id returned by Optimize, not the
# position of the winning alpha in theAlphas. Par_Choice determines the winning
# alpha but does not return it, so the value reported here may be the wrong
# alpha -- TODO confirm and return the alpha explicitly.
bestAlpha = theAlphas[outputOfSwarm[4]]
bestPearsonRHO = outputOfSwarm[0]


print("Input file: ", inFilePtr)
print("Convert factor:: ",bestAlpha)
print("SSE at best spearman : ", bestCost)    
print("Best Spearman correlation Dist vs. Reconstructed Dist  : ", bestSpearm) 
print("Best Pearson correlation Dist vs. Reconstructed Dist: ", bestPearsonRHO) 
Write_Log("outputFolder/bestAlpha.log", inFilePtr, bestAlpha, bestCost, bestSpearm, bestPearsonRHO)
net = time.time() - start
allSpears.append(bestSpearm)
print("time : ", net)

print("All spears : ", allSpears)


../input-and-models/Input/Synthetic/chainDres5_Matrix_noise000.txt
Seconds since epoch = 1618803542.375461
<Swarm.Swarm object at 0x7f9d7bb5ea30> 0.7278820998177046
<Swarm.Swarm object at 0x7f9d7bb5e880> 0.9040950467201744
<Swarm.Swarm object at 0x7f9d7bb5ea60> 0.9648020862716763
<Swarm.Swarm object at 0x7f9d7bb5eaf0> 0.9914152815601925
<Swarm.Swarm object at 0x7f9d7bb5e8b0> 0.9938711468805349
<Swarm.Swarm object at 0x7f9d7bb5e100> 0.9982966346771843
<Swarm.Swarm object at 0x7f9d7bb5e910> 0.9929652104022952
<Swarm.Swarm object at 0x7f9d7bb5e940> 0.9835973748822087
<Swarm.Swarm object at 0x7f9d7bb5eac0> 0.9716995627372357
<Swarm.Swarm object at 0x7f9d7bb65760> 0.9592087686273185
(0.9983742471323297, 0.9982966346771843, 257.0658755574808, 29999, 1, <Swarm.Swarm object at 0x7f9d7bb5e100>)
pearson:0.9983742471323297 spearman:0.9982966346771843 rmse:257.0658755574808
(0.9983742471323297, 0.9982966346771843, 257.0658755574808, 29999, 1, <Swarm.Swarm object at 0x7f9d7bb5e100>)
Input file:  ..