In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.mllib.fpm import FPGrowth
from operator import itemgetter

# Spark settings passed as system properties have to be in place before the
# context is instantiated.
SparkContext.setSystemProperty('spark.executor.memory','6g')

# getOrCreate reuses an already running context, so re-executing this cell no
# longer fails with "Cannot run multiple SparkContexts at once". Note that an
# existing context is returned as-is, whatever master it was started with.
sc = SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [2]:
import re
rdd = sc.textFile('publications.txt')
small_rdd = rdd.sample(False, 1e-3)


def _field_rdd(lines, tag, drop_empty=False):
    """Return the payload of every line in `lines` that starts with `tag`.

    Parameters:
        lines: RDD of text lines.
        tag: two-character record marker such as '#c' or '#@'.
        drop_empty: when True, lines whose payload is the empty string are
            removed from the result.

    Returns:
        RDD of strings holding the text that follows `tag` on each matching
        line.

    A prefix test plus a slice yields the same values as matching
    '^<tag>(.*)' and taking group(1), but each line is inspected once instead
    of running the regular expression twice (once to filter, once to map).
    """
    values = lines.filter(lambda l: l.startswith(tag)).map(lambda l: l[len(tag):])
    if drop_empty:
        values = values.filter(lambda v: v != '')
    return values


# RDDs for statistical analysis and further calculations
venue_rdd = _field_rdd(rdd, '#c')

# Records without references / year carry an empty payload; those are dropped.
refrence_rdd = _field_rdd(rdd, '#%', drop_empty=True)

author_rdd = _field_rdd(rdd, '#@')

year_rdd = _field_rdd(rdd, '#t', drop_empty=True)


In [3]:
# RDD for authors: one transaction (list of distinct author names) per
# author line.
def _author_basket(line):
    """Split a comma-separated author line into a list of distinct names.

    Every name is stripped individually and empty tokens are discarded, so
    'A, B' and 'A,B' produce the same items and a stray comma does not
    create an empty-string author. The names are de-duplicated through a set
    because FPGrowth requires the items of a transaction to be unique.
    """
    return list({name.strip() for name in line.split(',') if name.strip()})


# Lines that contain no author at all would give an empty basket; drop them.
transactions = author_rdd.map(_author_basket).filter(lambda basket: len(basket) > 0)

# QUESTION 1

To parse publications.txt I have used sed and awk and I have got the following results:


- The index is not the first line of every record in publications.txt, whereas in AP_Train.txt it was the very first line.

- Lines in publications.txt use no delimiter to separate the identifying symbol from the string it identifies.

- publications.txt has empty references for many records, while AP_Train.txt had references for some records.

- Authors are delimited by ',' in publications.txt but by ';' in AP_Train.txt.



Differences in the number of publications, authors, venues, references, and years of publication between the two files:


|              Counts Of              | Publications.txt | AP_train.txt |
|:-----------------------------------:|:----------------:|:------------:|
| Venue (Unique Count)                |       8708       |    255685    |
| Publications/Indexes (Unique Count) |      2146341     |    1976815   |
| Authors (Unique Count)              |      1232494     |    1478733   |
| References                          |     528264       |    871089    |
| Years                               |        80        |      79      |


# QUESTION 2

In [4]:
# Part-A: mine frequent author itemsets with a minimum support of 1e-4
model = FPGrowth.train(
    transactions,
    minSupport=1e-4,
    numPartitions=10,
)
result = model.freqItemsets().collect()

# Show the first ten itemsets as a preview.
for itemset in result[:10]:
    print(itemset)

FreqItemset(items=['David Maier'], freq=227)
FreqItemset(items=['Michael T. Goodrich'], freq=309)
FreqItemset(items=['Ralf Steinmetz'], freq=373)
FreqItemset(items=['Wayne Wolf'], freq=243)
FreqItemset(items=['Hussein T. Mouftah'], freq=237)
FreqItemset(items=['Micha Sharir'], freq=466)
FreqItemset(items=['Jin Li'], freq=254)
FreqItemset(items=['Ying Wang'], freq=352)
FreqItemset(items=['Kiyohiro Shikano'], freq=213)
FreqItemset(items=['Gene Tsudik'], freq=220)


In [5]:
# Part-A: mine frequent author itemsets with a minimum support of 1e-5
model_b = FPGrowth.train(
    transactions,
    minSupport=1e-5,
    numPartitions=10,
)
result_b = model_b.freqItemsets().collect()

# Show the first ten itemsets as a preview.
for itemset in result_b[:10]:
    print(itemset)

FreqItemset(items=['Slim Essid'], freq=26)
FreqItemset(items=['Pinyi Ren'], freq=32)
FreqItemset(items=['Michael Fink'], freq=85)
FreqItemset(items=['Michael Fink', 'Thomas Eiter'], freq=67)
FreqItemset(items=['Michael Fink', 'Hans Tompits'], freq=24)
FreqItemset(items=['Keita Matsuo'], freq=22)
FreqItemset(items=['Keita Matsuo', 'Leonard Barolli'], freq=22)
FreqItemset(items=['Christopher Rasmussen'], freq=26)
FreqItemset(items=['Pierre Duhamel'], freq=120)
FreqItemset(items=['George Candea'], freq=46)


In [6]:
# Part-B: FPGrowth with a minimum support of 0.5e-5; the collected itemsets
# are ordered by descending frequency (second field of each FreqItemset).
model_top5 = FPGrowth.train(transactions, minSupport=0.5e-5, numPartitions=10)
result_top5 = sorted(
    model_top5.freqItemsets().collect(),
    key=itemgetter(1),
    reverse=True,
)

# Ten most frequent itemsets.
for itemset in result_top5[:10]:
    print(itemset)
        

FreqItemset(items=['Wei Wang'], freq=1293)
FreqItemset(items=['Wei Zhang'], freq=856)
FreqItemset(items=['Lei Zhang'], freq=841)
FreqItemset(items=['Wei Li'], freq=805)
FreqItemset(items=['H. Vincent Poor'], freq=735)
FreqItemset(items=['Jun Wang'], freq=717)
FreqItemset(items=['Philip S. Yu'], freq=711)
FreqItemset(items=['Wen Gao'], freq=707)
FreqItemset(items=['Thomas S. Huang'], freq=691)
FreqItemset(items=['Lei Wang'], freq=690)


In [None]:
# Part-A FPGrowth with min threshold = 1e-6 {This does not work} - Do not run this!
# model_d = FPGrowth.train(transactions, minSupport=1e-6, numPartitions=10)
# result_d = model_d.freqItemsets().collect()
# for fi in result_d[:10]:
#    print(fi)    

### What Happens When we successively decrease the Threshold:

When we successively decrease the threshold, the number of frequent itemsets that satisfy the minimum support (and therefore the number of association rules that can be derived from them) increases, and eventually the program runs out of memory and an exception is thrown because of memory spills.

In [7]:
# Question 2-B

author = ['Rakesh Agrawal', 'Jiawei Han', 'Zoubin Ghahramani', 'Christos Faloutsos']


def _top_co_authors(itemsets, target, limit=5):
    """Return at most `limit` distinct co-authors of `target`.

    Parameters:
        itemsets: frequent itemsets, already sorted by descending frequency;
            element [0] of each entry is the list of author names.
        target: the author whose co-authors are wanted.
        limit: maximum number of co-authors to return.

    Returns:
        List of names in order of first appearance, i.e. taken from the most
        frequent multi-author itemsets that contain `target`.

    The limit is enforced on every append, so an itemset with several new
    names cannot push the result past `limit`.
    """
    co_authors = []
    for itemset in itemsets:
        members = itemset[0]
        # Singletons carry no co-authorship information.
        if len(members) < 2 or target not in members:
            continue
        for member in members:
            if member != target and member not in co_authors:
                co_authors.append(member)
                if len(co_authors) == limit:
                    return co_authors
    return co_authors


# result_top5 is sorted in the order of frequency so the helper exploits that
# feature to calculate top 5 co-authors for given authors
ra_set, jh_set, zg_set, cf_set = (
    _top_co_authors(result_top5, name, 5) for name in author
)

print("Top 5 Co-Authors for")
for position, (name, co_authors) in enumerate(
        zip(author, (ra_set, jh_set, zg_set, cf_set))):
    # Blank separator line between authors, none after the last one.
    if position:
        print('')
    print(name)
    print(co_authors)
        

Top 5 Co-Authors for
Rakesh Agrawal
['Ramakrishnan Srikant', 'Jerry Kiernan', 'H. V. Jagadish', 'Michael J. Carey', 'Yirong Xu']

Jiawei Han
['Xifeng Yan', 'Philip S. Yu', 'Jian Pei', 'Yizhou Sun', 'Xin Jin']

Zoubin Ghahramani
['David L. Wild', 'Katherine A. Heller', 'Michael I. Jordan']

Christos Faloutsos
['Hanghang Tong', 'Spiros Papadimitriou', 'Jimeng Sun', 'Agma J. M. Traina', 'Caetano Traina Jr.']


# Question 3

In [8]:
# Questions 3
# For this question every author is a basket (transaction) and the venues in
# which the author has at least one publication are the items of that basket.
# The file is parsed into an author -> venues lookup, the baskets are written
# to an intermediate file (one line per author, venues separated by '~'), and
# that file is loaded as an RDD of transactions.
# NOTE(review): a venue name that itself contains '~' would be split into two
# items when the file is read back.

author_venue = {}
author_keep = []

# publications.txt is decoded as UTF-8, matching how sc.textFile reads it.
with open('publications.txt', encoding='utf-8') as publications:
    for line in publications:
        if line.startswith('#@'):
            # Start a fresh author list for this record so that authors of a
            # record without a venue line are never credited with the venue
            # of a later record.
            author_keep = [name.strip() for name in line[2:].split(',') if name.strip()]
            for aut in author_keep:
                author_venue.setdefault(aut, set())
        elif line.startswith('#c'):
            venue_keep = line[2:].strip()
            # An empty venue is not a real item; skip it.
            if venue_keep:
                for aut in author_keep:
                    author_venue[aut].add(venue_keep)
            author_keep = []

# Write to file: one basket per line. Authors without any venue produce an
# empty line, so the number of transactions stays equal to the number of
# authors.
with open('dum.txt', 'w', encoding='utf-8') as basket_file:
    for venues in author_venue.values():
        basket_file.write('~'.join(venues))
        basket_file.write('\n')

# Make an RDD of venues. Empty strings (from blank lines) are filtered out so
# that '' never shows up as a venue item.
venue_basket = sc.textFile('dum.txt').map(
    lambda l: [venue for venue in set(l.split('~')) if venue])

In [9]:
# Question 3(a): frequent venue itemsets with a minimum support of 1e-3,
# ordered by descending frequency (second field of each FreqItemset).
model_venue_auth = FPGrowth.train(venue_basket, minSupport=1e-3, numPartitions=1)
result_venue_auth = sorted(
    model_venue_auth.freqItemsets().collect(),
    key=itemgetter(1),
    reverse=True,
)

# Ten most frequent itemsets.
for itemset in result_venue_auth[:10]:
    print(itemset)


FreqItemset(items=['CoRR'], freq=51437)
FreqItemset(items=['IEICE Transactions'], freq=22145)
FreqItemset(items=['Bioinformatics'], freq=18669)
FreqItemset(items=['ICRA'], freq=17989)
FreqItemset(items=['Nucleic Acids Research'], freq=17378)
FreqItemset(items=['ICC'], freq=14904)
FreqItemset(items=['NeuroImage'], freq=14700)
FreqItemset(items=['BMC Bioinformatics'], freq=14448)
FreqItemset(items=['GLOBECOM'], freq=14374)
FreqItemset(items=['ISCAS'], freq=13498)


In [None]:
# Questions 3(a) with Threshold = 1e-4 {This fails because of the reason mentioned below}
# model_venue_auth = FPGrowth.train(venue_basket, minSupport=1e-4, numPartitions=1)
# result_venue_auth = model_venue_auth.freqItemsets().collect()
# result_venue_auth.sort(key=itemgetter(1),reverse=True)
# for fi in result_venue_auth[:10]:
#   print(fi)


In [10]:
# Question 3(a): frequent venue itemsets with a minimum support of 0.4e-3,
# ordered by descending frequency (second field of each FreqItemset).
model_venue_auth_top10 = FPGrowth.train(venue_basket, minSupport=0.4e-3, numPartitions=1)
result_venue_auth_top10 = sorted(
    model_venue_auth_top10.freqItemsets().collect(),
    key=itemgetter(1),
    reverse=True,
)

# Ten most frequent itemsets.
for itemset in result_venue_auth_top10[:10]:
    print(itemset)


FreqItemset(items=['CoRR'], freq=51437)
FreqItemset(items=['IEICE Transactions'], freq=22145)
FreqItemset(items=['Bioinformatics'], freq=18669)
FreqItemset(items=['ICRA'], freq=17989)
FreqItemset(items=['Nucleic Acids Research'], freq=17378)
FreqItemset(items=['ICC'], freq=14904)
FreqItemset(items=['NeuroImage'], freq=14700)
FreqItemset(items=['BMC Bioinformatics'], freq=14448)
FreqItemset(items=['GLOBECOM'], freq=14374)
FreqItemset(items=['ISCAS'], freq=13498)


### What Happens When we successively decrease the Threshold:

When we successively decrease the threshold, the number of frequent itemsets that satisfy the minimum support (and therefore the number of association rules that can be derived from them) increases, and eventually the program runs out of memory and an exception is thrown. Also note that the number of partitions has to be set to 1: the program fails for more than one partition because that results in more memory spill in Spark RDDs.

In [11]:
vens = ['NIPS', 'KDD', 'VLDB', 'INFOCOM','ACL']


def _top_co_venues(itemsets, venue, limit=10):
    """Return at most `limit` distinct venues that co-occur with `venue`.

    Parameters:
        itemsets: frequent itemsets, already sorted by descending frequency;
            element [0] of each entry is the list of venue names.
        venue: the anchor venue representing a research area.
        limit: maximum number of venues to return.

    Returns:
        List of venue names in order of first appearance, i.e. taken from the
        most frequent multi-venue itemsets that contain `venue`.

    The limit is enforced on every append, so an itemset with several new
    venues cannot push the result past `limit`.
    """
    related = []
    for itemset in itemsets:
        members = itemset[0]
        # Singletons say nothing about co-occurring venues.
        if len(members) < 2 or venue not in members:
            continue
        for member in members:
            if member != venue and member not in related:
                related.append(member)
                if len(related) == limit:
                    return related
    return related


# result_venue_auth_top10 is sorted in the order of frequency so
# the helper exploits that fact and finds top 10 venues that
# author publish in for each area
(machine_learning_list,
 data_mining_list,
 databases_list,
 computer_networks_list,
 nlp_list) = (_top_co_venues(result_venue_auth_top10, venue, 10) for venue in vens)

reports = [
    ('Top 10 List for Machine Learning', machine_learning_list),
    ('Top 10 List for Data Mining ', data_mining_list),
    ('Top 10 List for Databases', databases_list),
    ('Top 10 List for Computer Networks', computer_networks_list),
    ('Top 10 List for NLP', nlp_list),
]
for position, (heading, venues) in enumerate(reports):
    # Blank separator line between areas, none after the last one.
    if position:
        print('')
    print(heading)
    print(venues)


Top 10 List for Machine Learning
['CoRR', 'ICML', 'Neural Computation', 'Journal of Machine Learning Research - Proceedings Track', 'Journal of Machine Learning Research', 'IEEE Trans. Pattern Anal. Mach. Intell.', 'CVPR', 'Neurocomputing', 'Neural Networks', 'IJCAI']

Top 10 List for Data Mining 
['CoRR', 'ICDM', 'CIKM', 'IEEE Trans. Knowl. Data Eng.', 'SDM', 'ICML', 'WWW']

Top 10 List for Databases
['ICDE', 'SIGMOD Conference', 'CoRR', 'IEEE Trans. Knowl. Data Eng.', 'SIGMOD Record', 'EDBT', 'CIKM', 'IEEE Data Eng. Bull.', 'VLDB J.', 'ACM Trans. Database Syst.']

Top 10 List for Computer Networks
['GLOBECOM', 'ICC', 'CoRR', 'IEEE/ACM Trans. Netw.', 'IEEE Journal on Selected Areas in Communications', 'Computer Networks', 'Computer Communications', 'ICDCS', 'IEEE Trans. Parallel Distrib. Syst.', 'WCNC']

Top 10 List for NLP
['COLING', 'LREC', 'CoRR', 'EMNLP', 'HLT-NAACL', 'INTERSPEECH', 'Computational Linguistics']
