In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
print(os.getenv("JAVA_HOME"))
# Create SparkSession
spark = SparkSession.builder\
    .config("spark.driver.memory", "16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.0.0,org.neo4j:neo4j-connector-apache-spark_2.12:4.1.2_for_spark_3")\
    .getOrCreate()

/usr/lib/jvm/java-11-openjdk-amd64


22/06/23 08:08:54 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.0.178 instead (on interface enp4s0)
22/06/23 08:08:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/vergenter/Venvs/sparkNLPVenv/lib/python3.10/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/vergenter/.ivy2/cache
The jars for the packages stored in: /home/vergenter/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
org.neo4j#neo4j-connector-apache-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-fe81672e-eb6a-4169-8284-f3d4d0760cf6;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;4.0.0 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	fo

In [2]:
from pyspark import StorageLevel
articles_abstracts = spark.read.format("org.neo4j.spark.DataSource")\
  .option("url", "bolt://192.168.0.178:7687")\
  .option("authentication.basic.username", os.environ["NEO4J_LOGIN"])\
  .option("authentication.basic.password", os.environ["NEO4J_PASSWORD"]).option("query", "MATCH (n:Article) where n.language =\"en\" WITH n RETURN n.id as id,n.year as year,n.title +'. '+ n.abstract as text")\
  .option("partitions", "4")\
  .load()
articles_abstracts.persist(StorageLevel.DISK_ONLY)

DataFrame[id: bigint, year: bigint, text: string]

In [3]:
from sparknlp.annotator import PerceptronModel,SentenceDetector,Tokenizer,Stemmer,Normalizer,StopWordsCleaner
from sparknlp.base import DocumentAssembler,Pipeline,LightPipeline

document_assembler = DocumentAssembler() \
  .setInputCol("text") \
  .setOutputCol("document")

sentence = SentenceDetector() \
    .setInputCols("document") \
    .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

stemmer = Stemmer() \
  .setInputCols(["token"]) \
  .setOutputCol("stem")

norm = Normalizer()\
  .setInputCols(["token"])\
  .setOutputCol("normalized")\
  .setLowercase(True)

stops = StopWordsCleaner.pretrained()\
  .setInputCols("normalized")\
  .setOutputCol("cleanedStem")


stem_pipeline = Pipeline(stages=[
  document_assembler,
  sentence,
  tokenizer,
  stemmer,
  norm,
  stops,
])

empty_df = spark.createDataFrame([[""]]).toDF('text')
stem_model = stem_pipeline.fit(empty_df)

stopwords_en download started this may take some time.
Approximate size to download 2.9 KB
[ | ]stopwords_en download started this may take some time.
Approximate size to download 2.9 KB
[ / ]Download done! Loading the resource.
[OK!]


In [4]:
with_stems = stem_model.transform(articles_abstracts).select("year","cleanedStem.result")
with_stems.persist(StorageLevel.DISK_ONLY)

DataFrame[year: bigint, result: array<string>]

In [5]:

from pyspark.sql.types import ArrayType,StructType,StructField,StringType

def get_sliding_window(window_size:int):
    def window(arr:list):
        arrlen = len(arr)
        size = min(arrlen*(arrlen-1)//2,window_size*(window_size-1)//2) + max(0,(arrlen-window_size))*(window_size-1)
        result = [None] * size
        index = 0
        for i in range(size):
            for j in range(i+1,min(i+window_size,arrlen)):
                result[index] = (arr[i],arr[j]) if (arr[i]>arr[j]) else (arr[j],arr[i])
                index+=1
        return result
    return F.udf(window,ArrayType(StructType([StructField("first",StringType(),False),StructField("second",StringType(),False)])))

mySlidingPairs2 = get_sliding_window(7)
coocurence = with_stems.select("year",F.explode(mySlidingPairs2("result")).alias("result")).select("year","result.first","result.second").groupBy("year","first","second").count()
coocurence.persist(StorageLevel.DISK_ONLY)

DataFrame[year: bigint, first: string, second: string, count: bigint]

In [7]:
coocurence.head(1)

[Row(year=2013, first='manipulate', second='ability', count=1)]

In [10]:
coocurence.head(1)

[Row(first='manipulate', second='ability', inverseWeight=1.0)]

In [25]:
coocurence_all_years = coocurence.groupBy("first","second").agg(F.sum("count").alias("count"))
coocurence_all_years.persist(StorageLevel.DISK_ONLY)
coocurence_all_years.head(1)

22/06/23 09:12:17 WARN CacheManager: Asked to cache already cached data.


[Row(first='ratio', second='close', count=55)]

In [26]:
weight_count_from_weight_all_years = {
"all_count":coocurence_all_years.count(),
">1 weight":coocurence_all_years.filter(F.col("count")>1).count(),
">2 weight":coocurence_all_years.filter(F.col("count")>2).count(),
">4 weight":coocurence_all_years.filter(F.col("count")>4).count(),
">8 weight":coocurence_all_years.filter(F.col("count")>8).count(),
">16 weight":coocurence_all_years.filter(F.col("count")>16).count(),
}
weight_count_from_weight_all_years

                                                                                

{'all_count': 20331332,
 '>1 weight': 8160322,
 '>2 weight': 4946966,
 '>4 weight': 2844500,
 '>8 weight': 1549646,
 '>16 weight': 794937}

In [11]:
coocurence_2013 =coocurence.filter(F.col("year")==2013).select("first","second",(1/F.col("count")).alias("inverseWeight"))

In [84]:
weight_count_from_weight = {
"all_count":coocurence_2013.count(),
">1 weight":coocurence_2013.filter(F.col("inverseWeight")<1).count(),
">2 weight":coocurence_2013.filter(F.col("inverseWeight")<(1/2)).count(),
">4 weight":coocurence_2013.filter(F.col("inverseWeight")<(1/4)).count(),
">8 weight":coocurence_2013.filter(F.col("inverseWeight")<(1/8)).count(),
">16 weight":coocurence_2013.filter(F.col("inverseWeight")<(1/16)).count(),
">32 weight":coocurence_2013.filter(F.col("inverseWeight")<(1/32)).count(),
">64 weight":coocurence_2013.filter(F.col("inverseWeight")<(1/64)).count(),
}
weight_count_from_weight

                                                                                

{'all_count': 2059285,
 '>1 weight': 632647,
 '>2 weight': 306843,
 '>4 weight': 131211,
 '>8 weight': 50722,
 '>16 weight': 17664,
 '>32 weight': 5591,
 '>64 weight': 1560}

In [98]:
coocurence_2019 =coocurence.filter(F.col("year")==2019).select("first","second",(1/F.col("count")).alias("inverseWeight")).filter(F.col("inverseWeight")<(1/64))

In [99]:
G5 = nx.from_pandas_edgelist(coocurence_2019.toPandas(),"first","second",["inverseWeight"])


                                                                                

In [100]:
CONNECTIVITY_sequence1_64=nx.betweenness_centrality(G5, normalized=False, weight ='inverseWeight')


In [101]:
CONNECTIVITY_sequence1_64

{'policy': 1932.0,
 'algorithms': 5220.0,
 'representation': 4407.0,
 'joint': 0.0,
 'principal': 0.0,
 'analysis': 18906.0,
 'detection': 75466.0,
 'anomaly': 0.0,
 'risk': 0.0,
 'space': 2318.0,
 'embedding': 152.0,
 'training': 29237.0,
 'learning': 1402875.5,
 'network': 250200.0,
 'analyze': 0.0,
 'time': 86774.0,
 'rate': 10550.0,
 'realtime': 0.0,
 'proposed': 60660.0,
 'solutions': 0.0,
 'propose': 32378.0,
 'classification': 7750.0,
 'transport': 0.0,
 'optimal': 6823.0,
 'show': 72328.0,
 'classical': 0.0,
 'visual': 14414.0,
 'stateoftheart': 7806.0,
 'depth': 2022.0,
 'mechanism': 0.0,
 'image': 99061.0,
 'testing': 1.0,
 'models': 46979.0,
 'lead': 0.0,
 'state': 4410.0,
 'based': 19563.0,
 'large': 26203.0,
 'amount': 0.0,
 'spaces': 0.0,
 'continuous': 2204.0,
 'estimation': 7052.0,
 'security': 4408.0,
 'privacy': 13987.0,
 'https': 0.0,
 'code': 11683.0,
 'information': 103705.0,
 'face': 6561.0,
 'techniques': 0.0,
 'demonstrate': 11010.0,
 'dataset': 24879.0,
 'word'

In [30]:
import networkx as nx
G1 = nx.from_pandas_edgelist(coocurence_2013.filter(F.col("inverseWeight")<(1/8)).toPandas(),"first","second",["inverseWeight"])
G2 = nx.from_pandas_edgelist(coocurence_2013.filter(F.col("inverseWeight")<(1/16)).toPandas(),"first","second",["inverseWeight"])
G3 = nx.from_pandas_edgelist(coocurence_2013.filter(F.col("inverseWeight")<(1/32)).toPandas(),"first","second",["inverseWeight"])
G4 = nx.from_pandas_edgelist(coocurence_2013.filter(F.col("inverseWeight")<(1/64)).toPandas(),"first","second",["inverseWeight"])

                                                                                

In [69]:
CONNECTIVITY_sequence1_8=nx.betweenness_centrality(G1, normalized=False, weight ='inverseWeight')
# 420s 1/8
CONNECTIVITY_sequence1_16=nx.betweenness_centrality(G2, normalized=False, weight ='inverseWeight')
# 72 s 1/16
CONNECTIVITY_sequence1_32=nx.betweenness_centrality(G3, normalized=False, weight ='inverseWeight')
# 9.5s 1/32
CONNECTIVITY_sequence1_64=nx.betweenness_centrality(G4, normalized=False, weight ='inverseWeight')
# 1.4s 1/64

                                                                                

In [71]:
weightless_CONNECTIVITY_sequence1_8 = nx.betweenness_centrality(G1, normalized=False)
# 151.2s 1/8
weightless_CONNECTIVITY_sequence1_16 = nx.betweenness_centrality(G2, normalized=False)
# 23.5s 1/16
weightless_CONNECTIVITY_sequence1_32 = nx.betweenness_centrality(G3, normalized=False)
# 3.5s
weightless_CONNECTIVITY_sequence1_64 = nx.betweenness_centrality(G4, normalized=False)
# 0.4s

In [97]:
import numpy as np

RMSE1 = np.square(list(value-weightless_CONNECTIVITY_sequence1_8[key] for key,value in CONNECTIVITY_sequence1_8.items())).mean()**0.5
RMSE1_2 = np.square(list(value-weightless_CONNECTIVITY_sequence1_8[key]-weightless_CONNECTIVITY_sequence1_16.get(key,0) for key,value in CONNECTIVITY_sequence1_8.items())).mean()**0.5
RMSE1_2_3 = np.square(list(value-weightless_CONNECTIVITY_sequence1_8.get(key,0)-weightless_CONNECTIVITY_sequence1_16.get(key,0)-weightless_CONNECTIVITY_sequence1_32.get(key,0) for key,value in CONNECTIVITY_sequence1_8.items())).mean()**0.5
RMSE1_2_3_4 = np.square(list(value-weightless_CONNECTIVITY_sequence1_8.get(key,0)-weightless_CONNECTIVITY_sequence1_16.get(key,0)- weightless_CONNECTIVITY_sequence1_32.get(key,0) - weightless_CONNECTIVITY_sequence1_64.get(key,0)for key,value in CONNECTIVITY_sequence1_8.items())).mean()**0.5



print(RMSE1,RMSE1_2,RMSE1_2_3,RMSE1_2_3_4)

46298.13358457783 38953.4687848016 36949.404097337654 36497.53992309111


In [94]:
RMSE2 = np.square(list(value-weightless_CONNECTIVITY_sequence1_16[key] for key,value in CONNECTIVITY_sequence1_16.items())).mean()**0.5
RMSE2_1 = np.square(list(value-weightless_CONNECTIVITY_sequence1_8[key]-weightless_CONNECTIVITY_sequence1_16.get(key,0) for key,value in CONNECTIVITY_sequence1_16.items())).mean()**0.5
RMSE2_3 = np.square(list(value-weightless_CONNECTIVITY_sequence1_16.get(key,0)-weightless_CONNECTIVITY_sequence1_32.get(key,0) for key,value in CONNECTIVITY_sequence1_16.items())).mean()**0.5
RMSE2_1_3 = np.square(list(value-weightless_CONNECTIVITY_sequence1_8.get(key,0)-weightless_CONNECTIVITY_sequence1_16.get(key,0)-weightless_CONNECTIVITY_sequence1_32.get(key,0) for key,value in CONNECTIVITY_sequence1_16.items())).mean()**0.5
RMSE2_3_4 = np.square(list(value-weightless_CONNECTIVITY_sequence1_16.get(key,0)-weightless_CONNECTIVITY_sequence1_32.get(key,0)-weightless_CONNECTIVITY_sequence1_64.get(key,0) for key,value in CONNECTIVITY_sequence1_16.items())).mean()**0.5
RMSE2_1_2_3 = np.square(list(value-weightless_CONNECTIVITY_sequence1_8.get(key,0)-weightless_CONNECTIVITY_sequence1_16.get(key,0)- weightless_CONNECTIVITY_sequence1_32.get(key,0) - weightless_CONNECTIVITY_sequence1_64.get(key,0)for key,value in CONNECTIVITY_sequence1_16.items())).mean()**0.5
print(RMSE2,RMSE2_1,RMSE2_3,RMSE2_1_3,RMSE2_3_4,RMSE2_1_2_3)

14487.572333826238 28177.32880574557 11692.225105694035 30998.584544194266 11075.464694516244 31586.01066642077


In [95]:
RMSE3 = np.square(list(value-weightless_CONNECTIVITY_sequence1_32[key] for key,value in CONNECTIVITY_sequence1_32.items())).mean()**0.5
RMSE3_2 = np.square(list(value-weightless_CONNECTIVITY_sequence1_16.get(key,0)-weightless_CONNECTIVITY_sequence1_32.get(key,0) for key,value in CONNECTIVITY_sequence1_32.items())).mean()**0.5
RMSE3_4 = np.square(list(value-weightless_CONNECTIVITY_sequence1_32.get(key,0)-weightless_CONNECTIVITY_sequence1_64.get(key,0) for key,value in CONNECTIVITY_sequence1_32.items())).mean()**0.5
RMSE3_1_2 = np.square(list(value-weightless_CONNECTIVITY_sequence1_8.get(key,0)-weightless_CONNECTIVITY_sequence1_16.get(key,0)-weightless_CONNECTIVITY_sequence1_32.get(key,0) for key,value in CONNECTIVITY_sequence1_32.items())).mean()**0.5
RMSE3_2_4 = np.square(list(value-weightless_CONNECTIVITY_sequence1_16.get(key,0)-weightless_CONNECTIVITY_sequence1_32.get(key,0)-weightless_CONNECTIVITY_sequence1_64.get(key,0) for key,value in CONNECTIVITY_sequence1_32.items())).mean()**0.5
RMSE3_1_2_4 = np.square(list(value-weightless_CONNECTIVITY_sequence1_8.get(key,0)-weightless_CONNECTIVITY_sequence1_16.get(key,0)- weightless_CONNECTIVITY_sequence1_32.get(key,0) - weightless_CONNECTIVITY_sequence1_64.get(key,0)for key,value in CONNECTIVITY_sequence1_32.items())).mean()**0.5
print(RMSE3,RMSE3_2,RMSE3_4,RMSE3_1_2,RMSE3_2_4,RMSE3_1_2_4)

4051.314356346439 13841.297580301838 3175.2494189095637 68785.52860407454 14763.837791019403 69726.27046444634


In [96]:
RMSE4 = np.square(list(value-weightless_CONNECTIVITY_sequence1_64[key] for key,value in CONNECTIVITY_sequence1_64.items())).mean()**0.5
RMSE4_4 = np.square(list(value-weightless_CONNECTIVITY_sequence1_32.get(key,0)-weightless_CONNECTIVITY_sequence1_64.get(key,0) for key,value in CONNECTIVITY_sequence1_32.items())).mean()**0.5
RMSE4_3_2 = np.square(list(value-weightless_CONNECTIVITY_sequence1_16.get(key,0)-weightless_CONNECTIVITY_sequence1_32.get(key,0)-weightless_CONNECTIVITY_sequence1_64.get(key,0) for key,value in CONNECTIVITY_sequence1_64.items())).mean()**0.5
RMSE4_1_2_3 = np.square(list(value-weightless_CONNECTIVITY_sequence1_8.get(key,0)-weightless_CONNECTIVITY_sequence1_16.get(key,0)- weightless_CONNECTIVITY_sequence1_32.get(key,0) - weightless_CONNECTIVITY_sequence1_64.get(key,0)for key,value in CONNECTIVITY_sequence1_64.items())).mean()**0.5
print(RMSE4,RMSE4_4,RMSE4_3_2,RMSE4_1_2_3)

896.2447093846051 3175.2494189095637 31837.84991673649 114551.64294736707


In [None]:
# check proportionality


In [43]:
G

<networkx.classes.graph.Graph at 0x7f1d6f989870>