In [10]:
import csv
import re

class createBipFromFriendlist(object):
    """
    Build a follower -> friend bipartite edge list from an adjacency-list text file.

    Typical use: createAdjlist(input_file), then createEdgelist(output_file, threshold).
    The whole adjacency list is held in memory (self.adjlist).
    """

    def __init__(self, threshold = 5):
        """
        adjlist type: list[list] first item is the follower (first field of the input row),
                      the rest are the IDs of its friends
        friendsCount type: dict, friend ID -> number of rows listing it
        threshold: only stored on the instance; calculateFriendsCount and createEdgelist
                   filter with their own `threshold` argument, not with this attribute.
        rtype : None
        """
        self.adjlist = None
        self.friendsCount = None
        self.threshold = threshold

    def createAdjlist(self, input_file, verbose = True):
        """
        : type input_file: str
        name of a text file; each row starts with a screen_name followed by the
        id_str's of his friends, separated by ','
        : type verbose: bool, print a one-line summary of kept / dropped rows
        : rtype : None, the result is stored in self.adjlist (list[list])
        Rows with a single field (0 friends) and rows whose only friend field is 'NA'
        (friends list was not downloaded successfully) are dropped.
        """
        adjlist = []
        failed = 0   # rows of the form "<screen_name>,NA"
        zeros = 0    # rows with a screen_name only
        # `with` closes the handle even when parsing raises (the handle used to leak)
        with open(input_file, "r") as f:
            for line in f:
                a = [x.replace("\n", "") for x in line.split(',')]
                if len(a) == 1:
                    zeros += 1
                elif len(a) == 2 and a[1] == 'NA':
                    failed += 1
                else:
                    adjlist.append(a)    #adjlist is a list[list]

        if verbose:
            print ("success>0: "+str(len(adjlist)) + ",  failed: " +str(failed)+ ", zeros: " + str(zeros))
        self.adjlist = adjlist

    def calculateFriendsCount(self, threshold = 1):
        """
        Count, for every friend ID in self.adjlist, the number of rows listing it.
        : type threshold: int, keep the friend IDs with >= threshold followers
        : rtype : None, the result is stored in self.friendsCount (dict: friend ID -> count)
        createAdjlist must have been called first.
        """
        friendsCount = {}
        for i, line in enumerate(self.adjlist):
            for id_str in line[1:]:
                if id_str in friendsCount:
                    friendsCount[id_str] += 1
                else:
                    friendsCount[id_str] = 1
                    # progress report every 5M distinct friend IDs
                    if len(friendsCount) % 5000000 == 0:
                        print (i, len(friendsCount))
        # build the (usually much smaller) filtered dict instead of deleting key by key
        self.friendsCount = {k: c for k, c in friendsCount.items() if c >= threshold}

    def createEdgelist(self, output_file, threshold = 1):
        """
        create the edgelist from the adjlist --  write out directly to save memory&time
        : type output_file: str, csv file to write; header is followers_sn,friends_id_str
        : type threshold: int, only friends with >= threshold followers are written
        : rtype : None
        One row [follower, friend_id] is written per edge.
        """
        self.calculateFriendsCount(threshold)
        kept = self.friendsCount
        # newline='' is what the csv module expects; otherwise rows get an extra
        # carriage return on platforms that translate line endings
        with open(output_file, 'w', newline='') as out:
            csv_out = csv.writer(out)
            csv_out.writerow(['followers_sn','friends_id_str'])
            for line in self.adjlist:
                follower = line[0]
                for id_str in line[1:]:
                    if id_str in kept:
                        csv_out.writerow((follower, id_str))
            

   



In [1]:
import csv
import re

class createBipFromFriendlist_memEff(object):
    """
    Memory-leaner variant: the input file is streamed twice (once to count friends,
    once to emit edges) instead of keeping the full adjacency list in memory.
    """

    def __init__(self, threshold = 5):
        """
        adjlist type: list[list] first item is the follower (first field of the input row),
                      the rest are the IDs of its friends that passed the threshold
        friendsCount type: dict, friend ID -> number of occurrences in the input
        threshold: friends with fewer than `threshold` occurrences are dropped
        rtype : None
        """
        self.adjlist = None   #sparse
        self.friendsCount = None
        self.threshold = threshold

    @staticmethod
    def _iterRows(input_file):
        """
        Stream input_file and yield (status, fields) for every row.
        status is 'zero' (screen_name only), 'failed' (screen_name,NA) or 'success'.
        """
        # `with` closes the handle even when the consumer stops early or raises
        with open(input_file, "r") as rows:
            for line in rows:
                fields = [x.replace("\n", "") for x in line.split(',')]
                if len(fields) == 1:
                    yield 'zero', fields
                elif len(fields) == 2 and fields[1] == 'NA':
                    yield 'failed', fields
                else:
                    yield 'success', fields

    @staticmethod
    def _summary(tally):
        """Format the success / failed / zeros counters the way every method reports them."""
        return ("success>0: "+str(tally['success']) + ",  failed: " +str(tally['failed'])+ ", zeros: " + str(tally['zero']))

    def calculateFriendsCount(self, input_file, verbose = True):
        """
        : type input_file: str
        name of a text file; each row starts with a screen_name followed by the
        id_str's of his friends, separated by ','
        : type verbose: bool, print progress every 10000 successful rows and a final summary
        : rtype : None, the result is stored in self.friendsCount
        Rows with 0 friends, or with 'NA' as their friends (downloading unsuccessful),
        are skipped. Friends occurring fewer than self.threshold times are removed.
        """
        tally = {'success': 0, 'failed': 0, 'zero': 0}
        friendsCount = {}
        for status, fields in self._iterRows(input_file):
            tally[status] += 1
            if status != 'success':
                continue
            for id_str in fields[1:]:
                friendsCount[id_str] = friendsCount.get(id_str, 0) + 1
            # report only right after a successful row, so a milestone is printed once
            # (it used to be repeated for every failed/zero row that followed it)
            if verbose and tally['success'] % 10000 == 0:
                print (tally['success'], "followers;  number of friends:", len(friendsCount))
        # build the (usually much smaller) filtered dict instead of deleting key by key
        friendsCount = {k: c for k, c in friendsCount.items() if c >= self.threshold}
        self.friendsCount = friendsCount

        if verbose:
            print ("after removed friends with less than "+ str(self.threshold)+ ", " + str(len(friendsCount))+" friends left.")
            print (self._summary(tally))

    def createAdjlist(self, input_file, verbose = True):
        """
        : type input_file: str, same format as for calculateFriendsCount
        : type verbose: bool, forwarded to calculateFriendsCount; also prints a summary here
        : rtype : None, the result is stored in self.adjlist (list[list])
        Each kept row is [screen_name, friend, ...] restricted to the friends that
        passed the threshold; repeated friend IDs within a row are collapsed.
        """
        self.calculateFriendsCount(input_file, verbose)
        kept = self.friendsCount
        tally = {'success': 0, 'failed': 0, 'zero': 0}
        adjlist = []
        for status, fields in self._iterRows(input_file):
            tally[status] += 1
            if status != 'success':
                continue
            # dict.fromkeys de-duplicates while keeping the order of first appearance
            friends = [fid for fid in dict.fromkeys(fields[1:]) if fid in kept]
            adjlist.append([fields[0]] + friends)    #adjlist is a list[list]
        if verbose:
            print (self._summary(tally))
        self.adjlist = adjlist

    def createEdgelist(self, input_file, output_file, verbose = True):
        """
        create the edgelist straight from the input file --  write out directly to save memory&time
        : type input_file: str, same format as for calculateFriendsCount
        : type output_file: str, csv file to write; header is followers_sn,friends_id_str
        : type verbose: bool, forwarded to calculateFriendsCount; also prints a summary here
        : rtype : None
        One row [follower, friend_id] is written per edge whose friend passed the threshold.
        """
        self.calculateFriendsCount(input_file, verbose)
        kept = self.friendsCount
        tally = {'success': 0, 'failed': 0, 'zero': 0}
        # newline='' is what the csv module expects for files handed to csv.writer
        with open(output_file, 'w', newline='') as out:
            csv_out = csv.writer(out)
            csv_out.writerow(['followers_sn','friends_id_str'])
            for status, fields in self._iterRows(input_file):
                tally[status] += 1
                if status != 'success':
                    continue
                u = fields[0]
                for i in fields[1:]:
                    if i in kept:
                        csv_out.writerow([u, i])
        if verbose:
            print (self._summary(tally))
    
     
                     

In [2]:
import csv
import time
# Build the edge list for graph 1 with the in-memory class.
# start_time is recorded here but the elapsed time is not reported in this cell.
start_time = time.time()
# your code

input_file = "../../data/friends_info/edgelist_Feb27/originalData/adjlist_1.txt"
output_file = "../../data/friends_info/edgelist_Feb27/originalData/edgelist_1.csv" 
# The constructor's default threshold is only stored; the filtering below uses threshold = 10.
BipGraph = createBipFromFriendlist()
BipGraph.createAdjlist(input_file)
BipGraph.createEdgelist(output_file, threshold = 10)
print("graph1 is done")
# The string below looks like output pasted from an earlier run; as the last expression
# of the cell it is also echoed as the cell's result.
'''
success>0: 56237,  failed: 4596, zeros: 53
2866 5000000
5897 10000000
9816 15000000
15987 20000000
21629 25000000
27393 30000000
33460 35000000
40084 40000000
47826 45000000
graph1 is done
'''

success>0: 56303,  failed: 4599, zeros: 53
2902 5000000
5933 10000000
10002 15000000
16341 20000000
21907 25000000
27329 30000000
33700 35000000
40195 40000000
48041 45000000
graph1 is done


'\nsuccess>0: 56237,  failed: 4596, zeros: 53\n2866 5000000\n5897 10000000\n9816 15000000\n15987 20000000\n21629 25000000\n27393 30000000\n33460 35000000\n40084 40000000\n47826 45000000\ngraph1 is done\n'

In [3]:
import csv
import time
# Build the edge list for graph 3 with the in-memory class.
# start_time is recorded here but the elapsed time is not reported in this cell.
start_time = time.time()
# your code

input_file = "../../data/friends_info/edgelist_Feb27/originalData/adjlist_3.txt"
output_file = "../../data/friends_info/edgelist_Feb27/originalData/edgelist_3.csv" 
# Rebinding BipGraph drops the reference to the previous graph object.
BipGraph = createBipFromFriendlist()  #clear the previous graph as well
BipGraph.createAdjlist(input_file)
# Only friends listed by at least 10 followers are written.
BipGraph.createEdgelist(output_file, threshold = 10)
print("graph3 is done")

success>0: 105777,  failed: 19136, zeros: 46
4664 5000000
10760 10000000
14607 15000000
20060 20000000
26770 25000000
34256 30000000
43989 35000000
54171 40000000
69099 45000000
85527 50000000
graph3 is done


success>0: 105843,  failed: 19147, zeros: 47
4661 5000000
10825 10000000
14643 15000000
20136 20000000
26844 25000000
34246 30000000
43985 35000000
54296 40000000
69169 45000000
85659 50000000
graph1 is done

In [4]:
import csv
import time
# Build the edge list for graph 2 with the in-memory class.
# start_time is recorded here but the elapsed time is not reported in this cell.
start_time = time.time()
# your code

input_file = "../../data/friends_info/edgelist_Feb27/originalData/adjlist_2.txt"
output_file = "../../data/friends_info/edgelist_Feb27/originalData/edgelist_2.csv" 
# Rebinding BipGraph drops the reference to the previous graph object.
BipGraph = createBipFromFriendlist()
BipGraph.createAdjlist(input_file)
# Only friends listed by at least 10 followers are written.
BipGraph.createEdgelist(output_file, threshold = 10)
print("graph2 is done")

success>0: 162884,  failed: 28775, zeros: 152
4390 5000000
8888 10000000
13735 15000000
18182 20000000
27708 25000000
35489 30000000
44792 35000000
53218 40000000
63866 45000000
76660 50000000
93706 55000000
112350 60000000
133417 65000000
156866 70000000
graph2 is done


In [3]:
import csv
import time
# Build the edge list for the combined adjacency list with the streaming (memEff) class.
start_time = time.time()
# Earlier attempt, disabled by wrapping it in a string literal.
# NOTE(review): its print adds a float to a str, which would raise TypeError if re-enabled.

'''
input_file = "../../data/friends_info/edgelist_Feb27/originalData/adjlist_all.txt"
output_file = "../../data/friends_info/edgelist_Feb27/originalData/edgelist_all.csv" 
BipGraph = createBipFromFriendlist_memEff(threshold = 10)
BipGraph.createEdgelist(input_file, output_file)
print(time.time() +"graph_all is done")
'''

# Current attempt.

input_file = "../../data/friends_info/edgelist_Feb27/originalData/adjlist_all.txt"
output_file = "../../data/friends_info/edgelist_Feb27/originalData/edgelist_all.csv" 
BipGraph = createBipFromFriendlist_memEff(threshold = 10)  #clear the previous graph as well
# createAdjlist is skipped: createEdgelist streams the input file itself.
#BipGraph.createAdjlist(input_file)
BipGraph.createEdgelist(input_file= input_file, output_file= output_file)
print("graph_all is done")
## keeps running into memory issues
# Last expression of the cell: elapsed seconds, shown as the cell result.
time.time() - start_time

10000 followers;  number of friends: 14996472
20000 followers;  number of friends: 23318177
30000 followers;  number of friends: 32537352
40000 followers;  number of friends: 39876088
50000 followers;  number of friends: 45570016
60000 followers;  number of friends: 50088950
70000 followers;  number of friends: 54649839
80000 followers;  number of friends: 59567656
90000 followers;  number of friends: 62841831
100000 followers;  number of friends: 66306592
110000 followers;  number of friends: 70268923
120000 followers;  number of friends: 73659945
130000 followers;  number of friends: 76579489
140000 followers;  number of friends: 78888938
150000 followers;  number of friends: 80944693
160000 followers;  number of friends: 83219554
170000 followers;  number of friends: 84977470
170000 followers;  number of friends: 84977470
180000 followers;  number of friends: 86882400
190000 followers;  number of friends: 88702533
200000 followers;  number of friends: 90546058
210000 followers;  num

2892.2150859832764

In [6]:
def calculateFriendsCount(adjlist, threshold = 1):
    """
    Tally how many rows of adjlist mention each friend ID.

    adjlist: list[list], first item of each row is the follower, the rest are friend IDs
    threshold: friend IDs counted fewer than `threshold` times are dropped
    rtype: dict(), key: friend ID, value: number of occurrences (>= threshold)
    """
    tally = {}
    for row_no, row in enumerate(adjlist):
        for friend in row[1:]:
            if friend not in tally:
                tally[friend] = 1
                # progress trace each time another million distinct IDs has been seen
                if len(tally) % 1000000 == 0:
                    print (row_no, len(tally))
                    print(row_no, friend)
            else:
                tally[friend] += 1
    # keep only the IDs that reach the threshold, in first-seen order
    return {friend: seen for friend, seen in tally.items() if not (seen < threshold)}
        
#followerCount = calculateFriendsCount(BipGraph.adjlist, threshold = 5)


In [7]:
def writeEdgeList_csv(self,output_file):
    """
    Write self.edgelist to output_file as csv.

    self: any object exposing an `edgelist` attribute, an iterable of rows
          (each row is written as one csv line)
    output_file: str, path of the csv file to create / overwrite
    rtype: None
    The first line written is the header follower_id_str,friends_id_str.
    """
    # newline='' is what the csv module expects; otherwise rows get an extra
    # carriage return on platforms that translate line endings
    with open(output_file, 'w', newline='') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['follower_id_str','friends_id_str'])
        csv_out.writerows(self.edgelist)

def createEdgelist(adjlist, friendsCount, output_file, threshold):
    """
    Write the edge list derived from adjlist straight to a csv file.

    adjlist: list[list], first item of each row is the follower, the rest are friend IDs
    friendsCount: dict (or any container supporting `in`) of the friend IDs to keep
    output_file: str, path of the csv file to create / overwrite
    threshold: unused here -- friendsCount is expected to be filtered already;
               kept so existing calls keep working
    rtype: None
    One row (follower, friend_id) is written per kept edge, after the header
    follower_id_str,friends_id_str.
    """
    # newline='' is what the csv module expects for files handed to csv.writer
    with open(output_file, 'w', newline='') as out:
        csv_out = csv.writer(out)
        # two header fields to match the two-field rows below (the header used to
        # announce a 'row_id' column that was never written)
        csv_out.writerow(['follower_id_str','friends_id_str'])
        for line in adjlist:
            follower = line[0]
            for id_str in line[1:]:
                if id_str in friendsCount:
                    csv_out.writerow((follower, id_str))

#createEdgelist(BipGraph.adjlist, followerCount,  "edgelist.csv", 5)

# NOTE(review): these two calls do not match either class defined in this notebook:
#   - createBipFromFriendlist.createEdgelist requires output_file, and
#     createBipFromFriendlist_memEff.createEdgelist requires input_file and output_file
#     and takes no threshold argument, so this call raises TypeError with both;
#   - neither class defines a writeEdgeList_csv method or an edgelist attribute.
# Presumably left over from an earlier version of the class -- confirm before running.
BipGraph.createEdgelist(threshold = 5)
BipGraph.writeEdgeList_csv(output_file)
# start_time and output_file are the globals set by whichever cell was run last.
elapsed_time = time.time() - start_time
print(elapsed_time)