In [35]:
import csv
import re

class createBipFromFriendlist(object):
    
    def __init__(self, threshold = 5):
        """
        adjlist type: list[list] first item is follower_id, the rest are the IDs of its friends
        rtype : None
        """
        self.adjlist = None
        self.friendsCount = None
        #self.edgelist = []  #only keep  
        #self.reducedAdjlist = []
        
    def createAdjlist(self, input_file, verbose = True):
        """
        : type input_file: list[str] 
        a text file name, each row is a string 
        starting with a screen_name, and a list of id_str's  of his fiends, separated by ','
        : rtype : list[list]
         remove some rows, with 0 friends or have 'NA' as their friends, which indicates 
                    downloading unsucessfully
        """

        f = open(input_file, "r")
        adjlist = []
        failed_sn = []
        zero_sn = []
        #removed those having 0 friends, 
        #removed those having 'NA' as friends -- did not download friends list sucessfullly,
        for line in f:
            a = line.split(',')
            a =[re.sub("\n", "", x) for x in a]
            if len(a) == 1:
                zero_sn.append(a[0])
            elif len(a) ==2 and a[1] =='NA' :
                failed_sn.append(a[0]) 
            else: 
                adjlist.append(a)    #adjlist is a list[list]

        if verbose:
            print ("success>0: "+str(len(adjlist)) + ",  failed: " +str(len(failed_sn))+ ", zeros: " + str(len(zero_sn)))
        self.adjlist = adjlist

    def calculateFriendsCount(self, threshold = 1):
        """
        rtype: dict()
        key: friend IDs, values, numbers of followers in the graph
        keep the friendIds with >= self.threshold followers
        """
        friendsCount = {}
        for i in range(len(self.adjlist)):
            line = self.adjlist[i]
            for id_str in line[1:]:
                if id_str in friendsCount.keys():
                    friendsCount[id_str] = friendsCount[id_str] +1
                else:
                    friendsCount[id_str] = 1      
                    if len(friendsCount) % 1000000 ==0 :
                        print (i, len(friendsCount))
        for key in list(friendsCount.keys()):
            if friendsCount[key] < threshold:
                del friendsCount[key]
        self.friendsCount = friendsCount 
                
    def createEdgelist(self, output_file, threshold = 1):
        """
        create the edgelist from the adjlist --  write out directly to save memory&time
        type: list[list]
        list in side of [a,b] -- edge
        total length of outside list == number of edges
        """
        self.calculateFriendsCount(threshold)
        with open(output_file,'w') as out:
            csv_out=csv.writer(out)
            csv_out.writerow(['row_id','follower_id_str','friends_id_str'])
            for line in self.adjlist:
                for id_str in line[1:]:
                    if id_str in self.friendsCount.keys():
                        row = (line[0],id_str)
                        csv_out.writerow(row)
            
   
                        



In [None]:
import csv
import time
start_time = time.time()
# your code

input_file = "../edgelist_Feb27/2.txt"
output_file = "../edgelist_Feb27/edgelist_2.csv"
BipGraph = createBipFromFriendlist()
BipGraph.createAdjlist(input_file)

In [38]:
BipGraph.createEdgelist(output_file, threshold = 5)

493 1000000
1179 2000000
2071 3000000
2946 4000000
4107 5000000
5254 6000000
6332 7000000
7634 8000000
8960 9000000
10606 10000000
12188 11000000
13692 12000000
15403 13000000
17100 14000000
19009 15000000
21021 16000000
23116 17000000
25201 18000000
27454 19000000
29783 20000000
32441 21000000
34975 22000000
37567 23000000
40496 24000000
43470 25000000
46645 26000000
49825 27000000
53228 28000000


In [39]:
import csv
import time
start_time = time.time()
# your code

input_file = "../edgelist_Feb27/3.txt"
output_file = "../edgelist_Feb27/edgelist_3.csv"
BipGraph = createBipFromFriendlist()
BipGraph.createAdjlist(input_file)
BipGraph.createEdgelist(output_file, threshold = 5)

success>0: 48958,  failed: 12018, zeros: 24
874 1000000
2051 2000000
3409 3000000
5088 4000000
6657 5000000
8469 6000000
10394 7000000
12324 8000000
14379 9000000
16802 10000000
19119 11000000
21515 12000000
24083 13000000
26853 14000000
29672 15000000
32731 16000000
36069 17000000
39317 18000000
42682 19000000
46226 20000000


In [12]:
BipGraph.cleanAdjlist(input_file)

success>0: 52607,  failed: 9130, zeros: 61


In [16]:
def calculateFriendsCount(adjlist, threshold = 1):
    """
    rtype: dict()
    key: friend IDs, values, numbers of followers in the graph
    keep the friendIds with >= self.threshold followers
    """
    friendsCount = {}
    for i in range(len(adjlist)):
        line = adjlist[i]
        for id_str in line[1:]:
            if id_str in friendsCount.keys():
                friendsCount[id_str] = friendsCount[id_str] +1
            else:
                friendsCount[id_str] = 1      
                if len(friendsCount) % 1000000 ==0 :
                    print (i, len(friendsCount))
                    print(i, id_str)
    for key in list(friendsCount.keys()):
        if friendsCount[key] < threshold:
            del friendsCount[key]
    return friendsCount 
        
followerCount = calculateFriendsCount(BipGraph.adjlist, threshold = 5)


131 1000000
131 393816869
198 2000000
198 24921751
295 3000000
295 265577870
460 4000000
460 41027920
863 5000000
863 3415034775
1360 6000000
1360 3010518371
1869 7000000
1869 2364755330
2425 8000000
2425 1454413304
2663 9000000
2663 123349561
2926 10000000
2926 101973739
3174 11000000
3174 3309451860
3587 12000000
3587 553207180
3946 13000000
3946 294005706
4319 14000000
4319 79532591
4674 15000000
4674 413842504
4994 16000000
4994 521634059
5267 17000000
5267 489212364
5631 18000000
5631 103565249
5975 19000000
5975 573905033
6371 20000000
6371 3485922380
6850 21000000
6850 36407822
7327 22000000
7327 2600603707
7715 23000000
7715 542383330
8244 24000000
8244 713178252714622980
8668 25000000
8668 821575237
9363 26000000
9363 580450413
9847 27000000
9847 3787921693
10503 28000000
10503 212130994
11154 29000000
11154 711969720724037632
11724 30000000
11724 4832247101
12475 31000000
12475 1395370890
13202 32000000
13202 231168093
14207 33000000
14207 1483524625
14987 34000000
14987 1229

In [22]:
len(followerCount)
sum(followerCount.values())
with open('follower_counts.txt', 'w') as out:
    csv_out=csv.writer(out)
    for row in followerCount:
        csv_out.writerow(row)

In [34]:
 def writeEdgeList_csv(self,output_file):
        with open(output_file,'w') as out:
            csv_out=csv.writer(out)
            csv_out.writerow(['row_id','follower_id_str','friends_id_str'])
            for row in self.edgelist:
                csv_out.writerow(row)

def createEdgelist(adjlist, friendsCount, output_file, threshold):
    """
    create the edgelist from the adjlist -- update self.edgelist
    type: list[list]
    list in side of [a,b] -- edge
    total length of outside list == number of edges
    """
    with open(output_file,'w') as out:
        csv_out=csv.writer(out)
        csv_out.writerow(['row_id','follower_id_str','friends_id_str'])
        for line in adjlist:
            #a = []
            #a.append(line[0])
            for id_str in line[1:]:
                if id_str in friendsCount.keys():
                    row = (line[0],id_str)
                    csv_out.writerow(row)

createEdgelist(BipGraph.adjlist, followerCount,  "edgelist.csv", 5)

In [None]:
BipGraph.createEdgelist(threshold = 5)
BipGraph.writeEdgeList_csv(output_file)
elapsed_time = time.time() - start_time
print(elapsed_time)

In [29]:
len(edgelist1)

NameError: name 'edgelist1' is not defined