In [2]:
import os
import sys
import math
import numpy as np
import pandas as pd
from itertools import islice

import networkx as nx

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.util import rddToFileName, TransformFunction

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.clustering import StreamingKMeans

%matplotlib widget

import matplotlib.pyplot as plt

In [3]:
# Spark configuration for this notebook.
# NOTE(review): `sc` and `spark` are not defined in any visible cell, so they
# presumably come pre-created from the PySpark kernel. setSystemProperty is
# documented as something to call *before* the SparkContext is instantiated,
# so these two settings may have no effect on the existing context — confirm.
SparkContext.setSystemProperty('spark.executor.memory', '52g')
SparkContext.setSystemProperty('spark.app.name', 'stars')
ssc = StreamingContext(sc, 1)  # streaming context on top of `sc`, batch interval of 1 second
# Bare last expression: displays the session object as the cell output.
spark

In [4]:
# Undirected graph that add_edges() inserts edges into.
Gt = nx.Graph()
# Directed graph; not referenced by any other visible cell.
directed_Gt = nx.DiGraph()

In [5]:
# Not referenced by any other visible cell.
batchsize = 10000

# Directory monitored by ssc.textFileStream() for new input files.
#batch_file_folder =  "/common/home/sdb202/project/temp/"
batch_file_folder =  "/common/home/milky-way/temp2/"
# Directory where takeAndPrint() appends one "<batch time>.txt" file per batch.
# Both folders are concatenated with file names directly, so they must keep
# their trailing "/".
output_file_folder = "/common/home/milky-way/temp/"

In [6]:
def map_for_vector(line):
    """Parse one comma-separated text line into a list of numbers.

    Every field is converted with ``float``. A field that is empty or that
    ``float`` cannot parse is replaced by the integer ``0``, so the result
    always contains exactly one entry per comma-separated field.

    Args:
        line: A single line of text with comma-separated fields.

    Returns:
        list: One entry per field, either the parsed ``float`` or ``0`` for
        an empty / unparseable field.
    """
    vector = []
    for field in line.split(','):
        try:
            # float('') raises ValueError as well, so empty fields take the
            # same fallback path as malformed ones.
            vector.append(float(field))
        except ValueError:
            # Only conversion failures are tolerated. Anything else
            # (KeyboardInterrupt, SystemExit, ...) must propagate instead of
            # being silently turned into a 0.
            vector.append(0)
    return vector


def node_filter(n, max_distance=0.0001):
    """Decide whether a pair of rows is an ordered, close-together pair.

    The separation is approximated as

        gamma ~= sqrt( [(ra_a - ra_b) * cos((d_a + d_b) / 2)]^2 + (d_a - d_b)^2 )

    Args:
        n: A pair ``(row_a, row_b)`` of indexable rows. Index 0 is compared to
            order the pair, index 2 is used as ``ra`` and index 3 as ``d``.
        max_distance: Exclusive upper bound on the approximated separation.
            Defaults to ``0.0001``, the previously hard-coded threshold.

    Returns:
        bool: ``True`` when ``row_a[0] < row_b[0]`` and the approximated
        separation is strictly below ``max_distance``; ``False`` otherwise.

    NOTE(review): ``math.cos`` interprets its argument as radians, and the
    threshold is expressed in the same unit as the coordinates. If columns 2
    and 3 are stored in degrees the cosine term is wrong — confirm the units.
    The difference of ``ra`` values is also taken as-is, with no wrap-around
    handling.
    """
    # Keep only one ordering of every pair; this also drops self-pairs.
    if n[0][0] >= n[1][0]:
        return False

    ra_1 = n[0][2]
    ra_2 = n[1][2]

    d_1 = n[0][3]
    d_2 = n[1][3]

    ra_diff = ra_1 - ra_2
    d_diff = d_1 - d_2
    d_avg = (d_1 + d_2) / 2

    distance = math.sqrt(((ra_diff * math.cos(d_avg)) ** 2) + (d_diff ** 2))
    return distance < max_distance
       
    
def add_edges(n):
    """Insert an edge between the two rows of a pair into the graph ``Gt``.

    Args:
        n: A pair ``(row_a, row_b)``; element 0 of each row is used as the
            node key.

    Returns:
        tuple: ``(row_a[0], 1)``, marking that node A has a neighbour.

    NOTE(review): if this runs inside a Spark transformation, the edge is
    added to whatever copy of ``Gt`` the executing process holds — confirm
    that this is the graph you intend to update.
    """
    source_id = n[0][0]
    target_id = n[1][0]

    Gt.add_edge(source_id, target_id)  # edge from A to B
    print("--- Adding edge: (", source_id, target_id, ") ---")

    return (source_id, 1)


def takeAndPrint(time, rdd):
    """Append every record of one batch to ``<output_file_folder><time>.txt``.

    Prints the batch time, collects the whole RDD to the driver and writes
    one ``str(record)`` per line. Nothing is written (and no file is created)
    when the batch is empty. Any exception raised while collecting or writing
    is reported on stdout and not re-raised.

    Args:
        time: Batch time; ``str(time)`` is used in the banner and file name.
        rdd: The RDD holding the records of that batch.
    """
    print("--" + str(time) + "--\n")
    try:
        # collect() materialises the complete batch in driver memory.
        taken = rdd.collect()

        if taken:
            # Open the output file once per batch instead of once per record.
            with open(output_file_folder + str(time) + ".txt", "a") as myfile:
                for record in taken:
                    myfile.write(str(record) + "\n")

    except Exception as e:
        print("Got exception: " + str(e))

In [7]:
# Stream of parsed rows: text files that show up in batch_file_folder are read
# line by line and every line is turned into a numeric vector.
star_tile_batches = (
    ssc.textFileStream(batch_file_folder)
    # Drop the first line of partition 0 only (presumably a header row — TODO
    # confirm); first lines of other partitions are kept.
    .mapPartitionsWithIndex(
        lambda idx, it: islice(it, 1, None) if idx == 0 else it
    )
    .map(map_for_vector)
)
# Windowing is currently disabled:
#.window(1000,1000)

In [8]:
# Pair every row of a batch with every row of the same batch. cartesian()
# yields all ordered combinations, so the pair count grows quadratically with
# the batch size.
star_pairs = star_tile_batches.transform(lambda rdd: rdd.cartesian(rdd))
# Keep only the pairs accepted by node_filter (one ordering per pair, close together).
filtered_star_pairs = star_pairs.filter(node_filter)
# For every batch, collect the surviving pairs and append them to a text file.
filtered_star_pairs.foreachRDD(takeAndPrint)

In [9]:
# Start the streaming computation defined in the cells above.
ssc.start()

In [10]:
# Block this cell until the stream terminates or the timeout elapses.
# The timeout is given in seconds (80000 s is roughly 22 hours).
ssc.awaitTerminationOrTimeout(80000)

--2020-12-14 18:50:42--

--2020-12-14 18:50:43--

--2020-12-14 18:50:44--

--2020-12-14 18:50:45--

--2020-12-14 18:50:46--

--2020-12-14 18:50:47--

--2020-12-14 18:50:48--

--2020-12-14 18:50:49--

--2020-12-14 18:50:50--

--2020-12-14 18:50:51--

--2020-12-14 18:50:52--

--2020-12-14 18:50:53--

--2020-12-14 18:50:54--

--2020-12-14 18:50:55--

--2020-12-14 18:50:56--

--2020-12-14 18:50:57--

--2020-12-14 18:50:58--

--2020-12-14 18:50:59--

--2020-12-14 18:51:00--

--2020-12-14 18:51:01--

--2020-12-14 18:51:02--

--2020-12-14 18:51:03--

--2020-12-14 18:51:04--

--2020-12-14 18:51:05--

--2020-12-14 18:51:06--

--2020-12-14 18:51:07--

--2020-12-14 18:51:08--

--2020-12-14 18:51:09--

--2020-12-14 18:51:10--

--2020-12-14 18:51:11--

--2020-12-14 18:51:12--

--2020-12-14 18:51:13--

--2020-12-14 18:51:14--

--2020-12-14 18:51:15--

--2020-12-14 18:51:16--

--2020-12-14 18:51:17--

--2020-12-14 18:51:18--

--2020-12-14 18:51:19--

--2020-12-14 18:51:20--

--2020-12-14 18:51:21--



KeyboardInterrupt: 

In [None]:
# Stop the streaming context but keep the underlying SparkContext alive so
# the notebook can keep using it; the graceful flag asks Spark to finish
# processing already-received data before stopping.
ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [None]:
# os.environ['PYSPARK_PYTHON'] = '/koko/system/anaconda/envs/python38/bin/python3'
# print(os.environ['PYSPARK_PYTHON'] )

# os.environ['PYSPARK_DRIVER_PYTHON'] = '/koko/system/anaconda/envs/python38/bin/python3'
# print(os.environ['PYSPARK_DRIVER_PYTHON'] )
