# 1. Set up spark context and SparkSession

In [1]:
from pyspark.sql import SparkSession

# Build a new Spark session, or reuse one that already exists; later cells access it as `spark`.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)


# 2. Load dataset

In [2]:
# Read the tab-separated input file (first line treated as a header) and
# rename the resulting column to `words`.
raw_data = spark.read.csv(
    'InputFile.txt',
    sep='\t',
    header=True,
    inferSchema=True,
).toDF('words')

In [3]:
# Total number of rows loaded, duplicates included.
raw_data.count()

100000

In [4]:
# Number of unique rows; the gap to raw_data.count() is the number of duplicates.
raw_data.distinct().count()

93405

# 3. Data cleaning and manipulation

In [5]:
# De-duplicate the raw rows.
u_raw_data= raw_data.distinct()

In [6]:
# Sanity check: row count after de-duplication.
u_raw_data.count()

93405

In [7]:
# Inspect the type of the object returned by distinct().
type(u_raw_data)

pyspark.sql.dataframe.DataFrame

In [8]:
# Preview four rows without truncating the long keyword strings.
u_raw_data.show(4, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|words                                                                                                                                                                                                                                                    |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[ACTH Syndrome, Ectopic]  [Adrenal Gland Neoplasms]  [Adrenocorticotropic Hormone]  [Corticotropin-Releasing Hormone]  [Cushing Syndrome]  [Dexamethasone]  [Diagnosis, Differential]  [Humans]  [Hydrocortisone]                                  

In [9]:
## define udf function
import re
from pyspark.sql.functions import udf
from pyspark.sql.types import *
def keywords_parser(s):
    """Split a bracketed keyword string into a list of whitespace-free terms.

    The input is expected to look like ``"[Term One]  [Term, Two]"``: every
    keyword is wrapped in square brackets and keywords are separated by
    whitespace.  Commas and all whitespace inside a keyword are removed, so
    ``"[Diagnosis, Differential]"`` becomes ``"DiagnosisDifferential"``.

    Args:
        s: Raw keyword string, or ``None`` for a missing value.

    Returns:
        list of str: One cleaned term per bracketed keyword.  An empty list
        is returned when ``s`` is ``None`` or contains no keywords, so no
        empty-string term is ever emitted.
    """
    # Missing cells arrive as None; treat them as "no keywords".
    if s is None:
        return []
    # Trim surrounding whitespace first, otherwise the outermost brackets are
    # not at the ends of the string and survive the bracket strip below.
    # str.strip takes a character set: this removes leading/trailing
    # '[', '|' and ']' characters.
    inner = s.replace(',', '').strip().strip('[|]')
    # Raw string avoids invalid-escape warnings for the regex.
    pieces = re.split(r'\]\s+\[', inner)
    # Collapse all whitespace inside each keyword and drop empty leftovers.
    terms = [''.join(piece.split()) for piece in pieces]
    return [term for term in terms if term]

In [10]:
# Wrap the Python parser as a Spark UDF whose return type is an array of strings.
%time keywords_parser_udf = udf(keywords_parser, ArrayType(StringType()))

CPU times: user 5.94 ms, sys: 2.04 ms, total: 7.97 ms
Wall time: 47.7 ms


In [11]:
# Apply the UDF to the `words` column and name the resulting array column `term`.
%time df = u_raw_data.select(keywords_parser_udf(u_raw_data.words)).toDF('term')

CPU times: user 3.83 ms, sys: 1.78 ms, total: 5.61 ms
Wall time: 67.4 ms


In [13]:
# Preview four parsed term lists without truncation.
df.show(4, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|term                                                                                                                                                                                                                 |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[ACTHSyndromeEctopic, AdrenalGlandNeoplasms, AdrenocorticotropicHormone, Corticotropin-ReleasingHormone, CushingSyndrome, Dexamethasone, DiagnosisDifferential, Humans, Hydrocortisone]                              |
|[AntibodyFormation, CultureTechniques, Female, HeLaCells, Humans, ImmuneSera, Neoplasms, Stomach, Uterus, gamma-Globulins]             

# 4. Build corpus and document-term matrix

In [12]:
from pyspark.ml.feature import CountVectorizer
# NOTE: %time only measures the constructor call on this line, not the
# fit/transform calls below.
%time count_vectorizer_wo = CountVectorizer(inputCol='term', outputCol='features')
# Fit without an explicit vocabSize; used in a later cell to count the full vocabulary.
countVectorizer_mod_wo = count_vectorizer_wo.fit(df)
countVectorizer_twitter_wo = countVectorizer_mod_wo.transform(df)
# Truncated vocabulary: vocabSize=48 caps the vocabulary at 48 terms.
# NOTE(review): "99%" presumably refers to coverage — confirm how 48 was chosen.
# NOTE(review): the `_twitter` suffix looks like a leftover name; the inputs here are keyword lists.
count_vectorizer = CountVectorizer(vocabSize=48,inputCol='term',outputCol='features')
countVectorizer_mod = count_vectorizer.fit(df)
countVectorizer_twitter = countVectorizer_mod.transform(df)

CPU times: user 835 µs, sys: 0 ns, total: 835 µs
Wall time: 19.7 ms


### 4.1 build corpus

In [13]:
# Size of the vocabulary learned by the model fitted without an explicit vocabSize.
voca_wo = countVectorizer_mod_wo.vocabulary
len(voca_wo)

17373

In [14]:
# Vocabulary of the truncated model (fitted with vocabSize=48) and its length.
voca = countVectorizer_mod.vocabulary
len(voca)

48

In [15]:
# Persist the truncated vocabulary to voca.txt, one term per line, in vocabulary order.
with open("voca.txt", "w") as vocab_file:
    vocab_file.writelines(str(term) + "\n" for term in voca)

### 4.2 build document-term matrix

In [16]:
# Keep only the count vectors produced by the truncated-vocabulary model.
# NOTE: this rebinds `df`, which previously held the parsed `term` column;
# re-running the earlier fit/transform cell after this one would see the wrong `df`.
df = countVectorizer_twitter.select('features')

In [17]:
# Row count of the feature frame (one vector per document).
df.count()

93405

In [18]:
# Preview the sparse vectors, printed as (size, [indices], [values]).
df.show(truncate=False)

+----------------------------------------------------------+
|features                                                  |
+----------------------------------------------------------+
|(48,[0,26],[1.0,1.0])                                     |
|(48,[0,2,13],[1.0,1.0,1.0])                               |
|(48,[1,22,24],[1.0,1.0,1.0])                              |
|(48,[0,28],[1.0,1.0])                                     |
|(48,[],[])                                                |
|(48,[0,3,4,44],[1.0,1.0,1.0,1.0])                         |
|(48,[0,2,3,4,5,9],[1.0,1.0,1.0,1.0,1.0,1.0])              |
|(48,[0,2,3,20,39],[1.0,1.0,1.0,1.0,1.0])                  |
|(48,[1,42],[1.0,1.0])                                     |
|(48,[0,28],[1.0,1.0])                                     |
|(48,[0],[1.0])                                            |
|(48,[1,18,39],[1.0,1.0,1.0])                              |
|(48,[0],[1.0])                                            |
|(48,[0],[1.0])         

In [19]:
# Inspect the type of the feature frame.
type(df)

pyspark.sql.dataframe.DataFrame

In [20]:
# Row count of the feature frame; this becomes the number of matrix rows later on.
df.count()

93405

# 5. Convert PySpark sparse vectors to a SciPy sparse matrix

In [21]:
from pyspark.ml.linalg import SparseVector
from operator import attrgetter
# Pull the `features` attribute out of every row, giving an RDD of vectors.
features = df.rdd.map(attrgetter("features"))

In [22]:
# Inspect the type of the mapped RDD.
type(features)

pyspark.rdd.PipelinedRDD

In [23]:
# Pair each vector with its position; explode() uses that position as the matrix row index.
indexed_features = features.zipWithIndex()

In [24]:
def explode(row):
    """Yield one ``(row_index, column_index, value)`` triple per stored entry.

    ``row`` is a ``(vector, row_index)`` pair.  The vector must expose
    parallel ``indices`` and ``values`` sequences; they are walked in step.
    """
    vector, row_index = row
    stored_entries = zip(vector.indices, vector.values)
    for col_index, value in stored_entries:
        yield (row_index, col_index, value)

# Flatten every indexed vector into (row, column, value) triples.
entries = indexed_features.flatMap(explode)

In [25]:
# Collect all triples to the driver and unzip them into three parallel tuples.
# NOTE: collect() brings every stored entry into driver memory, and the
# unpacking fails if there are no entries at all.
row_indices, col_indices, data = zip(*entries.collect())

In [26]:
# Matrix shape: (number of rows in df, vector length read from the first row).
shape = (
    df.count(),
    df.rdd.map(attrgetter("features")).first().size
)

In [27]:
from scipy.sparse import csr_matrix
# Assemble the document-term matrix in CSR form from the coordinate triples.
mat = csr_matrix((data, (row_indices, col_indices)), shape=shape)

In [28]:
# Inspect the type of the assembled matrix.
type(mat)

scipy.sparse.csr.csr_matrix

# 6. Build term-term matrix

In [29]:
# Term-term matrix mat^T * mat: entry (i, j) sums count_i * count_j over all documents.
%time A = mat.transpose().dot(mat)

CPU times: user 11.3 ms, sys: 0 ns, total: 11.3 ms
Wall time: 10.6 ms


In [30]:
# Convert the sparse term-term matrix to a dense array.
B = A.toarray()

In [31]:
# Inspect the type of the dense result.
type(B)

numpy.ndarray

In [32]:
# The term-term matrix is square, one row and one column per vocabulary term.
B.shape

(48, 48)

# 7. Extract term pairs with co-occurrence frequency above the threshold (50)

In [33]:
import numpy as np
import pandas as pd
# Keep only the strict upper triangle (k=1): zeroes the diagonal and everything below it,
# so each unordered pair is represented once.
uptri = np.triu(B, 1)

In [34]:
# Preview the upper-triangle values strictly greater than 50.
# NOTE(review): this preview uses `> 50`, while the extraction cells that follow
# use `>= 50` — confirm which threshold is intended.
uptri[uptri>50]

array([  5173.,  16741.,  13883.,  10391.,   8181.,    754.,   5580.,
         5301.,   5043.,   3529.,    643.,   3104.,   2282.,    567.,
         2526.,    836.,   1379.,    453.,    481.,   2115.,   1036.,
          606.,   1392.,   1059.,    825.,   1518.,   1099.,   1189.,
          264.,   1101.,    870.,    720.,    801.,    800.,    557.,
          735.,    275.,    237.,    688.,    953.,    560.,    276.,
          312.,    815.,    409.,    279.,    543.,   3762.,   3746.,
          319.,    208.,   5581.,    178.,    122.,    135.,    891.,
         1283.,    101.,    453.,   2805.,     81.,   1471.,   1080.,
         2154.,   2056.,     83.,    782.,   1185.,    132.,    491.,
          690.,     55.,    131.,     82.,   1367.,    215.,    348.,
          171.,    399.,    377.,    547.,    138.,    923.,    535.,
          320.,     53.,    233.,    349.,    189.,    122.,    393.,
          550.,    383.,   9806.,   7411.,   6002.,   1181.,   2537.,
         3818.,   34

In [35]:
# Row and column positions of upper-triangle entries with value >= 50
# (a tuple of two index arrays).
tempM = np.where( uptri >=50 )

In [36]:
# Two-column frame of (row, column) positions, one row per selected entry.
index = pd.DataFrame(list(tempM)).transpose()

In [37]:
# Values of the selected entries, taken with the same >= 50 condition used for the positions.
value = pd.DataFrame(list(uptri[uptri>=50]))

In [38]:
# Vocabulary as a one-column frame; each row label is the term's position in `voca`.
vocab = pd.DataFrame(voca)
vocab.head()

Unnamed: 0,0
0,Humans
1,Animals
2,Female
3,Male
4,Adult


In [39]:
# Put positions and values side by side, then give the three columns readable names.
indexFreq = pd.concat([index,value],axis=1)
indexFreq.columns = ['index1','index2','freq']

In [40]:
# Preview the (index1, index2, freq) table.
indexFreq.head()

Unnamed: 0,index1,index2,freq
0,0,1,5173
1,0,2,16741
2,0,3,13883
3,0,4,10391
4,0,5,8181


In [41]:
# Look up the term for `index2` by matching it against vocab's row labels;
# the looked-up term is stored as `term.x`.
join1 = indexFreq.join(vocab, on='index2',how='left')
join1.columns = ['index1','index2','freq','term.x']

In [42]:
# Same lookup for `index1`; the looked-up term is stored as `term.y`.
join2 = join1.join(vocab, on='index1',how='left')
join2.columns = ['index1','index2','freq','term.x','term.y']

In [43]:
# Preview the table with both terms resolved.
join2.head()

Unnamed: 0,index1,index2,freq,term.x,term.y
0,0,1,5173,Animals,Humans
1,0,2,16741,Female,Humans
2,0,3,13883,Male,Humans
3,0,4,10391,Adult,Humans
4,0,5,8181,MiddleAged,Humans


In [44]:
# Keep only the two term columns and the frequency, dropping the numeric positions.
output1 = join2[['term.x','term.y','freq']]

In [45]:
# Preview the term-pair table before sorting.
output1.head()

Unnamed: 0,term.x,term.y,freq
0,Animals,Humans,5173
1,Female,Humans,16741
2,Male,Humans,13883
3,Adult,Humans,10391
4,MiddleAged,Humans,8181


In [46]:
# Rank term pairs from most to least frequent co-occurrence.
# `DataFrame.sort` was removed in pandas 0.20; `sort_values` is the supported
# equivalent and takes the same arguments here.
output_sorted = output1.sort_values('freq', ascending=False)

In [47]:
# Preview the most frequent term pairs.
output_sorted.head()

Unnamed: 0,term.x,term.y,freq
1,Female,Humans,16741
2,Male,Humans,13883
3,Adult,Humans,10391
93,Male,Female,9806
4,MiddleAged,Humans,8181


In [48]:
# Convert the pandas frame to a Spark DataFrame and display its first six rows.
spark.createDataFrame(output_sorted).show(6)

+----------+------+-------+
|    term.x|term.y|   freq|
+----------+------+-------+
|    Female|Humans|16741.0|
|      Male|Humans|13883.0|
|     Adult|Humans|10391.0|
|      Male|Female| 9806.0|
|MiddleAged|Humans| 8181.0|
|     Adult|Female| 7411.0|
+----------+------+-------+
only showing top 6 rows



In [49]:
# Write the sorted term pairs with their frequencies as a tab-separated UTF-8 file.
# NOTE(review): assumes the data_out/ directory already exists — confirm.
# NOTE(review): the file name is spelled "ouput"; likely a typo, left unchanged
# in case downstream consumers depend on it.
%time output_sorted.to_csv('data_out/ouput_Pyspark_w.csv', sep='\t', encoding='utf-8')

CPU times: user 9.68 ms, sys: 529 µs, total: 10.2 ms
Wall time: 9.84 ms


In [50]:
# Same sorted pairs, without the frequency column.
output = output_sorted[['term.x','term.y']]

In [51]:
# Preview the frequency-free pair list.
output.head()

Unnamed: 0,term.x,term.y
1,Female,Humans
2,Male,Humans
3,Adult,Humans
93,Male,Female
4,MiddleAged,Humans


In [52]:
# Convert the pair list to a Spark DataFrame and display its first six rows.
spark.createDataFrame(output).show(6)

+----------+------+
|    term.x|term.y|
+----------+------+
|    Female|Humans|
|      Male|Humans|
|     Adult|Humans|
|      Male|Female|
|MiddleAged|Humans|
|     Adult|Female|
+----------+------+
only showing top 6 rows



In [53]:
# Write the sorted term pairs without frequencies as a tab-separated UTF-8 file.
# NOTE(review): assumes the data_out/ directory already exists — confirm.
%time output.to_csv('data_out/ouput_Pyspark_wo.csv', sep='\t', encoding='utf-8')

CPU times: user 7.28 ms, sys: 1.17 ms, total: 8.45 ms
Wall time: 6.61 ms


In [54]:
# Number of term pairs kept (rows) and number of columns.
output.shape

(470, 2)