In [None]:
"""
The central idea behind the project is to research communities on a subset of StackOverFlow Q&A using a network describing interactions between users. It is interesting to answer questions like- What is the modularity of the Network, How discernable are the communities from each other in terms of topics discussed & How do they differ? A hypothesis could be that each community uses primarily their own programming language. We could also google the top 5 authors by score or amount of activity to see their specialization - It can also be interesting to look at score, does the highest scoring post depend on community size and to what degree if? Are some communities better at resolving questions? Look at proportion of closed to non-closed questions.

The dataset we use for this is 10% of StackOverflow Q&A version 2 from 2019 & contains 2~ million~ questions & 1.2~ million answers. Also write features in both in powerpoint.

Dataset is licensed under CC-BY-SA 3.0 with attribution required - meaning it can freely be modified & used both for commercial and research purposes as long as attributed. Top contributors are Miljan Stojiljkovic, Niyamat Ullah, Kartik Garg - See link for rest of contributors.
https://www.kaggle.com/datasets/stackoverflow/stacksample


Outline:
    * Download Dataset. - Done
    * Clean out rows with NaN (Except if the NaN is because a question is still open) and clean out stopwords, links, line breaks, etc.- Done
    * DataScrape to add UserNames/Names using OwnerUserId, requests & stackexchange API: https://api.stackexchange.com/docs
        PS! Since bandwith/quota is not very little compared to dataset size, only top author names are found instead of whole dataset.
        
    * Make Directed Network with Networkx and calculate best community split using Louvain. Nodes are persons and an edge from one person to another means that the first person replies to a question written by the other person. If multiple replies to same question, count this and add both as attribute.
    * WordCloud for best community splits + other diagnostics to analyse social network.

"""

In [15]:
##Loading and Removing NA Rows. Is important to not use for loop, when a solution of constant time complexity exists, when handling data.
## Roll down - Don't run the code in first section, takes too much time. Roll down until clean data is loaded & a subset_dataset is made.
import numpy as np
import threading 
from tqdm import tqdm
import pickle
import pandas as pd
import sklearn
import networkx as nx
from ast import literal_eval
import nltk

In [1]:

answers = '/work3/s204161/comp_social_science_data/Answers.csv'
questions = '/work3/s204161/comp_social_science_data/Questions.csv'
tags = '/work3/s204161/comp_social_science_data/Tags.csv'

answers_df = pd.read_csv(answers, encoding='ISO-8859-1')
questions_df = pd.read_csv(questions, encoding='ISO-8859-1')
tags_df = pd.read_csv(tags)

In [2]:
#clean rows with NA values in them.
clean_answers_df = answers_df.dropna(subset=answers_df.columns, axis=0, how='any')
clean_questions_df = questions_df.dropna(subset=questions_df.columns.difference(['ClosedDate']), axis=0, how='any')
print(f'Answers_df\nSize before: {len(answers_df.index)}\nSize After: {len(clean_answers_df.index)}')
print(f'Questions_df\nSize before: {len(questions_df.index)}\nSize After: {len(clean_questions_df.index)}')

clean_questions_df['OwnerUserId'] = np.asarray(clean_questions_df.get('OwnerUserId'),dtype=int)
clean_answers_df['OwnerUserId'] = np.asarray(clean_answers_df.get('OwnerUserId'),dtype=int)

clean_answers_df.to_csv('/work3/s204161/comp_social_science_data/no_NA_Answers.csv', encoding='ISO-8859-1', index=False)
clean_questions_df.to_csv('/work3/s204161/comp_social_science_data/no_NA_Questions.csv', encoding='ISO-8859-1', index=False)


Answers_df
Size before: 2014516
Size After: 2001316
Questions_df
Size before: 1264216
Size After: 1249762


In [5]:
clean_answers_df

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,92,61,2008-08-01T14:45:37Z,90,13,"<p><a href=""http://svnbook.red-bean.com/"">Vers..."
1,124,26,2008-08-01T16:09:47Z,80,12,<p>I wound up using this. It is a kind of a ha...
2,199,50,2008-08-01T19:36:46Z,180,1,<p>I've read somewhere the human eye can't dis...
3,269,91,2008-08-01T23:49:57Z,260,4,"<p>Yes, I thought about that, but I soon figur..."
4,307,49,2008-08-02T01:49:46Z,260,28,"<p><a href=""http://www.codeproject.com/Article..."
...,...,...,...,...,...,...
2014511,40143247,333403,2016-10-19T23:42:35Z,40143190,0,"<p>Tanks to <a href=""http://stackoverflow.com/..."
2014512,40143322,642706,2016-10-19T23:50:35Z,40137110,1,<h1>tl;dr</h1>\n\n<pre><code>ZonedDateTime.par...
2014513,40143336,2239781,2016-10-19T23:52:08Z,40141860,0,<p>I came up with a very dirty workaround. Bef...
2014514,40143349,6934347,2016-10-19T23:54:02Z,40077010,0,<p>I solved my own problem defining the follow...


In [26]:
np.sort(clean_answers_df['ParentId'])

dtype('int64')

In [42]:
unique_users = np.unique(np.asarray(np.append(np.append(clean_questions_df.get('OwnerUserId'),clean_answers_df.get('OwnerUserId')),clean_answers_df.get('ParentId')),dtype=int)) ## This number is wrong, parentId should not be included.
print(f'unique_nodes: {len(unique_users)}')

unique_nodes: 1980582


In [28]:
np.append(clean_questions_df.get('OwnerUserId'),clean_answers_df.get('OwnerUserId')),clean_answers_df.get('ParentId')

0              177.0
1              164.0
2              313.0
3          2090091.0
4              400.0
             ...    
2014511          NaN
2014512          NaN
2014513          NaN
2014514          NaN
2014515          NaN
Length: 2011725, dtype: float64

In [38]:
np.append(np.append(clean_questions_df.get('OwnerUserId'),clean_answers_df.get('OwnerUserId')),clean_answers_df.get('ParentId'))

array([      26,       58,       83, ..., 40141860, 40077010, 40142910])

In [34]:
import networkx as nx
G = nx.DiGraph()
G.add_nodes_from(unique_user_ids)

In [45]:
import string
import nltk
from bs4 import BeautifulSoup
#nltk.download('stopwords')
#nltk.download('punkt')

tok_list = []
bad_tokens = set(['https:', 'http:', 'www.']) | set(nltk.corpus.stopwords.words('english')) | set(string.punctuation)
bad_tokens_2 = set(string.punctuation) | set(['[',']','(',')','{','}','<','>','/','\\','|',')',']'])

for i in range(len(clean_questions_df.get('OwnerUserId'))):
    body_text = clean_questions_df.iloc[i]['Body']
    clean_body_text = BeautifulSoup(body_text, 'html.parser').get_text()
    #print(clean_body_text) 321
    if i % 1000 == 0:
       print(i)
    tokens = nltk.word_tokenize(clean_body_text)
    tokens = [token.lower() for token in tokens]
    remove_indexes = []
    for i in range(len(tokens)):
        #remove bad tokens
        if any([(tokens[i] == bad_token) for bad_token in bad_tokens]):
            remove_indexes.append(i)
        elif any([(bad_token in tokens[i]) for bad_token in bad_tokens_2]):
            remove_indexes.append(i)
        elif tokens[i].isnumeric():
            remove_indexes.append(i)            
    
    for i in range(len(remove_indexes) - 1, -1, -1):
        remove_index = remove_indexes[i]
        del tokens[remove_index]

    if len(tokens) == 1:
        tokens.remove(tokens[0])

        #Maybe do biagrams, if not too computationally demanding, to get some temporal context
    tok_list.append(tokens)


clean_questions_df['tokens'] = tok_list

clean_questions_df.to_csv('/work3/s204161/comp_social_science_data/with_token_Questions.csv', encoding='ISO-8859-1', index=False)

#This takes some time to run, so is interrupted and ran overnight.

0
1000


KeyboardInterrupt: 

In [36]:
from bs4 import BeautifulSoup
owner_attrs = {}

owner_found_bool = {}
#Add question/owner attributes to nodes:
for i in range(len(clean_questions_df.get('OwnerUserId'))):
    question_id = str(int(clean_questions_df.iloc[i].get('Id')))
    owner_id = str(int(clean_questions_df.iloc[i].get('OwnerUserId')))
    if not owner_found_bool.get(owner_id):
        owner_found_bool[owner_id] = True
        owner_attrs[owner_id] = {}
        owner_attrs[owner_id]['scores'] = {}
    owner_attrs[owner_id][question_id] = BeautifulSoup(clean_questions_df.iloc[i].get('Body'), 'html.parser').get_text()
    owner_attrs[owner_id]['scores'][question_id] = (clean_questions_df.iloc[i].get('Score'))    
    #Should also save question title in graph - implement here:
    

    if i % 50000 == 0:
        print(i)


0
50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000
600000
650000
700000
750000
800000
850000
900000
950000
1000000
1050000
1100000
1150000
1200000


In [41]:
answers_attrs = {}
answers_found_bool = {}

collab_network = {}

#Add answer attributes to nodes & remember that edge is directed from OwnerUserId --> ParentId:
for i in range(len(clean_answers_df.get('OwnerUserId'))):
    answers_id = str(int(clean_answers_df.iloc[i].get('Id')))
    owner_id = str(int(clean_answers_df.iloc[i].get('OwnerUserId')))
    parent_id = str(int(clean_answers_df.iloc[i].get('ParentId')))
    if not answers_found_bool.get(owner_id):
        answers_found_bool[owner_id] = True
        answers_attrs[owner_id] = {}
    if not owner_found_bool.get(owner_id):
        owner_found_bool[owner_id] = True
        owner_attrs[owner_id] = {}
        owner_attrs[owner_id]['scores'] = {}

    owner_attrs[owner_id][answers_id] = BeautifulSoup(clean_answers_df.iloc[i].get('Body'), 'html.parser').get_text()
    
    if collab_network.get((owner_id, parent_id)) == None:
        collab_network[(owner_id, parent_id)] = 1
    else:
        collab_network[(owner_id, parent_id)] += 1

    #we want to find scores for each person for answers too.
    owner_attrs[owner_id]['scores'][answers_id] = clean_answers_df.iloc[i].get('Score')
    if i % 50000 == 0:
        print(i)

nx.set_node_attributes(G, answers_attrs, 'Questions')

0
50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000
600000
650000
700000
750000
800000
850000
900000
950000
1000000
1050000
1100000
1150000
1200000
1250000
1300000
1350000
1400000
1450000
1500000
1550000
1600000
1650000
1700000
1750000
1800000
1850000
1900000
1950000
2000000


In [144]:
for owner_id in unique_user_ids:
    if owner_attrs.get(owner_id):
        owner_attrs[owner_id]['best_score'] = np.max(list(owner_attrs[owner_id]['scores'].values()))
        owner_attrs[owner_id]['avg_score'] = np.mean(np.asarray(list(owner_attrs[owner_id]['scores'].values()),dtype=float))
        owner_attrs[owner_id]['median_score'] = np.median(list(owner_attrs[owner_id]['scores'].values()))


nx.set_node_attributes(G, owner_attrs, 'Questions')

In [63]:
#make edgelist and then edges in network.
edgelist = [(x[0][0], x[0][1], x[1]) for x in collab_network.items()]

G.add_weighted_edges_from(edgelist)


In [162]:
import pickle
#Save Graph
with open('/work3/s204161/comp_social_science_data/stackoverflow_network.pickle', 'wb') as f:
    pickle.dump(G, f, protocol=pickle.HIGHEST_PROTOCOL)
#Graph is big, so check Google Drive Link instead for both social network and csv files: 
# https://drive.google.com/drive/folders/11gTEA4omR2T6JZRMFCww7e2B1DcoqpmO?usp=share_link

In [343]:
print(f'Number of nodes in G: {G.number_of_nodes()}')
print(f'Number of edges in G: {G.number_of_edges()}')
print(f'Edges for node 61 in G: {G.edges(str(81))}')
print(f'Edges for node 61 in G: {G.edges(str(61))}') # This user answers their own questions (and prolly closes the thread)

#print(f'Info about Node 61 in G: {G.nodes[str(61)]}')


Number of nodes in G: 30133
Number of edges in G: 80670
Edges for node 61 in G: [('81', '71422')]
Edges for node 61 in G: [('61', '61')]


In [None]:
# Code below would find username for all users in dataset. Due to Quotas it is not possible however, so after we find top authors in our best community-split, we can request some of the top author names using stackexchange API.

In [105]:

# Since bandwith/quota is not very little compared to dataset size, only top author names are found instead of whole dataset. This code should be applied on top authors only then.
import requests
import time
remaining_users = []
username_dict = {}
for user_index in range(0,len(unique_user_ids),100):
    unsuccesful_tries = 0
    url = f'https://api.stackexchange.com/2.3/users/{";".join(map(str, unique_user_ids[user_index:user_index + 100]))}?site=meta.stackoverflow'
    #url = f'https://api.stackexchange.com/2.3/users/{61}?site=meta.stackoverflow'
    response = requests.get(url)
    while response.status_code != 200:
        print(f'Failed with status code: {response.status_code}')
        time.sleep(5)
        if unsuccesful_tries == 3:
            remaining_users.append(unique_user_ids[user_index:user_index + 100])
            continue

    #data = response.json()['items'][0]
    #username_dict[str(user)] = data['display_name']
    print('break for test')
    break
    

break for test


In [118]:
set1 = set(np.asarray(clean_answers_df.get('Id'),dtype=str))
set2 = set(np.asarray(clean_questions_df.get('Id'),dtype=str))
#Id is unique in whole dataset/on stackoverflow page - is not shared between answers & questions. thats why intersection is empty
set1.intersection(set2)


set()

In [13]:
import pickle
import networkx as nx
#Save Graph
with open('/work3/s204161/comp_social_science_data/stackoverflow_network.pickle', 'rb') as f:
    G = pickle.load(f)
#Graph is big, so check Google Drive Link instead for both social network and csv files: 
# https://drive.google.com/drive/folders/11gTEA4omR2T6JZRMFCww7e2B1DcoqpmO?usp=share_link

In [25]:
seen_nodes = {}

for node in G.nodes:
    if seen_nodes.get(int(node)):
        print('fucker is seen >:-(', node)
    seen_nodes[node] = True

In [None]:
######################
# NEW CODE STARTS HERE using full tokenized answers & questions to make subset_data
######################

In [2]:
import pandas as pd
import numpy as np
from ast import literal_eval

answers = '/work3/s204161/comp_social_science_data/with_token_Answers.csv'
questions = '/work3/s204161/comp_social_science_data/with_token_Questions.csv'
questions_df = pd.read_csv(questions, encoding='utf-8')
answers_df = pd.read_csv(answers, encoding='utf-8')


In [2]:
answers_df.tokens = answers_df.tokens.apply(lambda x: literal_eval(str(x)))

In [213]:
#subset by Time
subset_questions_df = questions_df[questions_df.CreationDate.str[:4].astype('int') == 2009]
subset_answers_df = answers_df[(2010 > answers_df.CreationDate.str[:4].astype('int')) & (answers_df.CreationDate.str[:4].astype('int') >= 2009)]


In [215]:
#make dict to ensure linear time instead of quadratic
subset_question_dict = {}
for ques in list(subset_questions_df.Id):
    subset_question_dict[ques] = True

In [216]:
import tqdm
for ans_index in tqdm.tqdm(range(len(subset_answers_df)-1, -1,-1)):
    parid = subset_answers_df.iloc[ans_index].ParentId
    if not subset_question_dict.get(parid):
        subset_answers_df.drop(subset_answers_df.index[ans_index], inplace=True)
        #question_dict.get(subset_answers_df.iloc[ans_index].ParentId)


100%|██████████| 92887/92887 [01:36<00:00, 959.57it/s]


In [310]:
print('question_df_length:',len(subset_questions_df))
print('answers_df_length:',len(answers_df[(2010 > answers_df.CreationDate.str[:4].astype('int')) & (answers_df.CreationDate.str[:4].astype('int') >= 2009)]))
print('answers_df_length_after_removing answers to questions from 2008 or previous years:', len(subset_answers_df))

question_df_length: 31088
answers_df_length: 92887
answers_df_length_after_removing answers to questions from 2008 or previous years: 82497


In [322]:
unique_users = np.unique(np.asarray(np.asarray(np.append(subset_questions_df.get('OwnerUserId'),subset_answers_df.get('OwnerUserId')),dtype=int),dtype=str))
print(f'unique_nodes: {len(unique_users)}')


unique_nodes: 30133


In [31]:
import string
import nltk
from bs4 import BeautifulSoup
def tokenize_col(dat_frame_df,colname, calculate_bigrams):
    nltk.download('stopwords')
    nltk.download('punkt')

    #tok_list = []
    bad_tokens = set(['https:', 'http:', 'www.']) | set(nltk.corpus.stopwords.words('english')) | set(string.punctuation)
    bad_tokens_2 = set(string.punctuation) | set(['[',']','(',')','{','}','<','>','/','\\','|',')',']'])

    tok_list = []
    bi_list = []
    for i in range(len(dat_frame_df.get('OwnerUserId'))):
        body_text = dat_frame_df.iloc[i][colname]
        clean_body_text = BeautifulSoup(body_text, 'html.parser').get_text()
        #print(clean_body_text) 321
        if i % 5000 == 0:
            print(i)
        tokens = nltk.word_tokenize(clean_body_text)
        tokens = [token.lower() for token in tokens]
        remove_indexes = []
        for i in range(len(tokens)):
            #remove bad tokens
            if any([(tokens[i] == bad_token) for bad_token in bad_tokens]):
                remove_indexes.append(i)
            elif any([(bad_token in tokens[i]) for bad_token in bad_tokens_2]):
                remove_indexes.append(i)
            elif tokens[i].isnumeric():
                remove_indexes.append(i)            
        
        for i in range(len(remove_indexes) - 1, -1, -1):
            remove_index = remove_indexes[i]
            del tokens[remove_index]

        if len(tokens) == 1:
            tokens.remove(tokens[0])
            #Maybe do biagrams, if not too computationally demanding, to get temporal context
        
        if calculate_bigrams:
            bi_grams = list(nltk.bigrams(tokens))
            bi_list.append(bi_grams)
        tok_list.append(tokens)

    return tok_list if not calculate_bigrams else tok_list, bi_list



In [45]:
#subset_questions_df = subset_questions_df.drop(columns='tokens')
#subset_answers_df = subset_answers_df.drop(columns='tokens')
#subset_questions_df = subset_questions_df.drop(columns='TitleTokens')
#subset_questions_df

In [46]:

subset_questions_df['TitleTokens'], subset_questions_df['TitleBigrams']  = tokenize_col(subset_questions_df,colname='Title',calculate_bigrams=True)

subset_questions_df['Tokens'], subset_questions_df['TokensBigrams']  = tokenize_col(subset_questions_df,colname='Body',calculate_bigrams=True)

subset_answers_df['Tokens'], subset_answers_df['TokensBigrams']  = tokenize_col(subset_answers_df,colname='Body',calculate_bigrams=True)



[nltk_data] Downloading package stopwords to
[nltk_data]     /zhome/a7/0/155527/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /zhome/a7/0/155527/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
0
5000
10000
15000
20000
25000
30000
[nltk_data] Downloading package stopwords to
[nltk_data]     /zhome/a7/0/155527/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /zhome/a7/0/155527/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
0
5000
10000
15000
20000
25000
30000
[nltk_data] Downloading package stopwords to
[nltk_data]     /zhome/a7/0/155527/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /zhome/a7/0/155527/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
5

In [47]:
subset_questions_df.to_csv('/work3/s204161/comp_social_science_data/subset_questions_c.csv', encoding='utf-8', index=False)
subset_answers_df.to_csv('/work3/s204161/comp_social_science_data/subset_answers_c.csv', encoding='utf-8', index=False)


In [348]:
#pd.set_option('display.max_rows', 5)
subset_answers_df

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body,Tokens,TokensBigrams
0,404434,1288.0,2009-01-01T02:46:21Z,404430,6,<p>The most common use cases are to find strin...,"[common, use, cases, find, strings, match, pat...","[(common, use), (use, cases), (cases, find), (..."
1,404436,20029.0,2009-01-01T02:46:58Z,404430,1,<p>Stack Overflow is in fact a good place to f...,"[stack, overflow, fact, good, place, find, use...","[(stack, overflow), (overflow, fact), (fact, g..."
...,...,...,...,...,...,...,...,...
82495,1987478,31668.0,2009-12-31T23:29:25Z,1987470,1,<p>I would recommend first getting a list of a...,"[would, recommend, first, getting, list, eleme...","[(would, recommend), (recommend, first), (firs..."
82496,2274531,165358.0,2009-10-26T15:24:46Z,2274530,1,<p>Does this Microsoft Support article help:<b...,"[microsoft, support, article, help, migrate, v...","[(microsoft, support), (support, article), (ar..."


In [329]:
import networkx as nx
G = nx.DiGraph()
G.add_nodes_from(unique_users)

In [330]:
import string
import nltk
from bs4 import BeautifulSoup
quest_attrs = {}

quest_found_bool = {}
#Add question/owner attributes to nodes:
for i in range(len(subset_questions_df.get('OwnerUserId'))):
    question_id = str(subset_questions_df.iloc[i].get('Id'))
    owner_id = str(int(subset_questions_df.iloc[i].get('OwnerUserId')))
    #If first time user is asking, create dict
    if not quest_found_bool.get(owner_id):
        quest_found_bool[owner_id] = True
        quest_attrs[owner_id] = {}
        quest_attrs[owner_id]['Scores'] = {}
    #owner_attrs[owner_id][question_id] = BeautifulSoup(subset_questions_df.iloc[i].get('Body'), 'html.parser').get_text()
    quest_attrs[owner_id]['Scores'][question_id] = (subset_questions_df.iloc[i].get('Score'))    
    #Should also save question title in graph - implement here:

    if i % 10000 == 0:
        print(i)
nx.set_node_attributes(G, quest_attrs, 'Questions')

0
10000
20000
30000


In [338]:
answers_attrs = {}
answers_found_bool = {}

collab_network = {}

#Add answer attributes to nodes & remember that edge is directed from OwnerUserId --> OwnerUserId (parent):
for i in range(len(subset_answers_df.get('OwnerUserId'))):
    answers_id = str(subset_answers_df.iloc[i].get('Id'))
    owner_id = str(int(subset_answers_df.iloc[i].get('OwnerUserId')))
    post_parent_id = str(int(subset_answers_df.iloc[i].get('ParentId')))
    owner_parent_id = str(int(subset_questions_df[subset_questions_df.Id == int(post_parent_id)].OwnerUserId))
    #if first time user is answering create dict
    if not answers_found_bool.get(owner_id):
        answers_found_bool[owner_id] = True
        answers_attrs[owner_id] = {}
        answers_attrs[owner_id]['Scores'] = {}
    #owner_attrs[owner_id][answers_id] = BeautifulSoup(clean_answers_df.iloc[i].get('Body'), 'html.parser').get_text()

    if not collab_network.get((owner_id, owner_parent_id)):
        collab_network[(owner_id, owner_parent_id)] = 1
    else:
        collab_network[(owner_id, owner_parent_id)] += 1

    #we want to find scores for each person for answers too.
    answers_attrs[owner_id]['Scores'][answers_id] = subset_answers_df.iloc[i].get('Score')
    if i % 10000 == 0:
        print(i)

nx.set_node_attributes(G, answers_attrs, 'Answers')

0
10000
20000
30000
40000
50000
60000
70000
80000


In [293]:

#Should be updated if calculate for best, avg. median...
# for owner_id in unique_users:
#     if owner_attrs.get(owner_id):
#         owner_attrs[owner_id]['BestScore'] = np.max(list(owner_attrs[owner_id]['Scores'].values()))
#         owner_attrs[owner_id]['AvgScore'] = np.mean(np.asarray(list(owner_attrs[owner_id]['scores'].values()),dtype=float))
#         owner_attrs[owner_id]['MedianScore'] = np.median(list(owner_attrs[owner_id]['scores'].values()))


# nx.set_node_attributes(G, owner_attrs, 'Questions')



In [339]:
#make edgelist and then edges in network.
edgelist = [(x[0][0], x[0][1], x[1]) for x in collab_network.items()]

G.add_weighted_edges_from(edgelist)


In [362]:
import pickle
with open('/work3/s204161/comp_social_science_data/stackoverflow_subset_network.pickle', 'wb') as f:
    pickle.dump(G, f, protocol=pickle.HIGHEST_PROTOCOL)

In [340]:
G.in_edges('404430')

InEdgeDataView([('31641', '4'), ('43219', '4'), ('40650', '4'), ('4', '4'), ('130812', '4')])

In [348]:
G.nodes['85162'] #some users are only active once a year, so subsetting by "time" will not encapsulate all posts made by a person who is active throughout many years. maybe do it by tag instead?

{'Answers': {'Scores': {'701489': 0, '701781': 3}}}

In [None]:
#####################
### After making subset_dataset, we request information about authors in this subset.
##################### 

In [68]:
import pandas as pd
import numpy as np
from ast import literal_eval

subset_questions_df = pd.read_csv('/work3/s204161/comp_social_science_data/subset_questions_c.csv', encoding='utf-8')
subset_answers_df = pd.read_csv('/work3/s204161/comp_social_science_data/subset_answers_c.csv', encoding='utf-8')

subset_answers_df.Tokens = subset_answers_df.Tokens.apply(lambda x: literal_eval(str(x)))
subset_answers_df.TokensBigrams = subset_answers_df.TokensBigrams.apply(lambda x: literal_eval(str(x)))

subset_questions_df.Tokens = subset_questions_df.Tokens.apply(lambda x: literal_eval(str(x)))
subset_questions_df.TokensBigrams = subset_questions_df.TokensBigrams.apply(lambda x: literal_eval(str(x)))
subset_questions_df.TitleTokens = subset_questions_df.TitleTokens.apply(lambda x: literal_eval(str(x)))
subset_questions_df.TitleBigrams = subset_questions_df.TitleBigrams.apply(lambda x: literal_eval(str(x)))


In [None]:
unique_users = np.unique(np.asarray(np.asarray(np.append(subset_questions_df.get('OwnerUserId'),subset_answers_df.get('OwnerUserId')),dtype=int),dtype=str))
print(len(unique_users))
first_batch = unique_users[:29400]
print(len(first_batch))
second_batch = unique_users[29400:]
assert len(second_batch) + len(first_batch) == len(unique_users)

In [7]:

# Since bandwith/quota is not very little compared to dataset size, only top author names are found instead of whole dataset. This code should be applied on top authors only then.
import requests
import time
params = {
    'pagesize': 100,
    'page': 1,
}
remaining_users = []
all_data = []
username_dict = {}
for user_index in range(29400,30200,100):
    unsuccesful_tries = 0
    url = f'https://api.stackexchange.com/2.3/users/{";".join(map(str, unique_users[user_index:user_index + 100]))}?site=meta.stackoverflow'
    #url = f'https://api.stackexchange.com/2.3/users/{61}?site=meta.stackoverflow'
    response = requests.get(url,params = params)
    while response.status_code != 200:
        print(f'Failed with status code: {response.status_code}')
        time.sleep(5)
        if unsuccesful_tries == 3:
            remaining_users.append(unique_users[user_index:user_index + 100])
            continue
    time.sleep(3)
    if response.json()['has_more']:
        print(f'oh no, there was more data not fetched, at {user_index} to {user_index + 100}')
    data = response.json()['items']

    for data_point in data:
        all_data.append(data_point)

    #username_dict[str(user)] = data['display_name']
    print(user_index)

29400
29500
29600
29700
29800
29900
30000
30100


In [337]:
import json
with open('/work3/s204161/comp_social_science_data/first_part_response_0-29400.json') as f:
    part_1 = json.load(f)
with open('/work3/s204161/comp_social_science_data/second_part_response_29400-30133.json') as f:
    part_2 = json.load(f)

response_data = part_1 + part_2
print(len(response_data))


17327


In [340]:
# Only about half of these users still exist - Their account must have been deleted:
### For example:
# https://meta.stackoverflow.com/users/100027/ Exists.
# https://meta.stackoverflow.com/users/100157/ Does not exist
print(f'47 or so of these dont exist, \n {unique_users[0:100]}')

47 or so of these dont exist, 
 ['1' '100' '1000' '100004' '100007' '100008' '100014' '100017' '100020'
 '100027' '10004' '100040' '100043' '100066' '10007' '100073' '100080'
 '100089' '100091' '100095' '100110' '100116' '10012' '100128' '100135'
 '100142' '100146' '100157' '10016' '100160' '100165' '100170' '100171'
 '100175' '10018' '100184' '100186' '100187' '100190' '100192' '100203'
 '100208' '100213' '100217' '100223' '100237' '100238' '100240' '100253'
 '100259' '10026' '100261' '100262' '100265' '100272' '100288' '100297'
 '100300' '100302' '100306' '10031' '100322' '100334' '10034' '100347'
 '100358' '100363' '100385' '10039' '10040' '100408' '100426' '100429'
 '100431' '100443' '100450' '100458' '10046' '100466' '10047' '100472'
 '100473' '100478' '100488' '1005' '100506' '100507' '100516' '100519'
 '100527' '100531' '100534' '100552' '100567' '100572' '100577' '100589'
 '100596' '100598' '100609']


In [478]:
# Pull out the values in response data we are interested in.
tot_users_attrs = {}
for user in response_data:
    user_attrs = {}
    user_id = str(user['user_id'])
    assert isinstance(user_id, str)
    user_attrs['user_name'] = user['display_name']
    user_attrs['user_type'] = user['user_type']
    user_attrs['location'] = user.get('location')
    user_attrs['reputation_today'] = user['reputation']
    user_attrs['badge_counts_today'] = user['badge_counts']

    tot_users_attrs[user_id] = user_attrs

#Add responses to network as node attributes.
import pickle
import community as community_louvain
nx.set_node_attributes(G, tot_users_attrs)

#save network w. response attrs.
with open(r'/work3/s204161/comp_social_science_data/stackoverflow_subset_network.pickle', 'wb') as f:
    pickle.dump(G, f, protocol=pickle.HIGHEST_PROTOCOL)

G.nodes['100297'] #example node

{'Answers': {'Scores': {'1041206': 3,
   '1052993': 6,
   '1126074': 69,
   '1164126': 1,
   '1226761': 2}},
 'user_name': 'Martijn Pieters',
 'user_type': 'moderator',
 'location': 'Cambridge, UK',
 'reputation_today': 1027591,
 'badge_counts_today': {'bronze': 754, 'silver': 922, 'gold': 96}}

In [None]:
############### Wordclouds

In [5]:
import networkx as nx
import community as community_louvain
import pickle
with open(r'/work3/s204161/comp_social_science_data/stackoverflow_subset_network_old.pickle', 'rb') as f:
    G = pickle.load(f)
G_un = G.to_undirected()
best_partition = community_louvain.best_partition(G_un)
num_communities = len(np.unique(list(best_partition.values()))) #Number of communities
print(f'num_communities: {num_communities}')
communities = {}
for num in range(num_communities):
    communities[num] = []

for node in best_partition:
    communities[best_partition[node]].append(node)


num_communities: 1082


In [353]:
# c = community to extract tokens/bigrams for
def extract_community_tokens(communities, c, answers_df, questions_df, collect_answers_and_questions = True, multiply_title_weight = 1):
    sema.acquire()
    assert isinstance(multiply_title_weight,int)
    assert (multiply_title_weight > 0)
    answer_tokens, community_questions,community_answers = [],[],[]

    for user in communities[c]:

        user_questions = G.nodes[user].get('Questions')
        user_answers = G.nodes[user].get('Answers')
        if user_questions:
            user_question_ids = list(user_questions['Scores'].keys())
            community_questions += user_question_ids            
        if user_answers:
            user_answers_ids = list(user_answers['Scores'].keys())
            community_answers += user_answers_ids
        
    answers_tok = subset_answers_df[subset_answers_df['Id'].astype(str).isin(community_answers)].Tokens.tolist() 
    answers_bi = subset_answers_df[subset_answers_df['Id'].astype(str).isin(community_answers)].TokensBigrams.tolist() 
    questions_tok = subset_questions_df[subset_questions_df['Id'].astype(str).isin(community_questions)].Tokens.tolist()
    questions_bi = subset_questions_df[subset_questions_df['Id'].astype(str).isin(community_questions)].TokensBigrams.tolist()
    questions_title_tok = subset_questions_df[subset_questions_df['Id'].astype(str).isin(community_questions)].TitleTokens.tolist()
    questions_title_bigrams = subset_questions_df[subset_questions_df['Id'].astype(str).isin(community_questions)].TitleBigrams.tolist()
    
    answers_tok, answers_bi = [item for sublist in answers_tok for item in sublist], [item for sublist in answers_bi for item in sublist] 
    questions_tok, questions_bi, questions_title_tok, questions_title_bigrams = [item for sublist in questions_tok for item in sublist], [item for sublist in questions_bi for item in sublist], [item for sublist in questions_title_tok for item in sublist], [item for sublist in questions_title_bigrams for item in sublist] 

    multiply_title_weight -= 1
    while multiply_title_weight > 0:
        questions_title_tok.append(questions_title_tok)
        questions_title_bigrams.extend(questions_title_bigrams)
        multiply_title_weight -= 1

    all_tokens, all_bigrams = [],[]

    if collect_answers_and_questions:
        all_tokens.extend(questions_title_tok)
        all_tokens.extend(questions_tok)
        all_tokens.extend(answers_tok)
        
        all_bigrams.extend(questions_title_bigrams)
        all_bigrams.extend(questions_bi)
        all_bigrams.extend(answers_bi)
    else:
        questions_title_tok.extend(questions_tok)
        all_tokens.append(questions_title_tok)
        all_tokens.append(answers_tok)

        questions_title_bigrams.extend(questions_bi)
        all_bigrams.append(questions_title_bigrams)
        all_bigrams.append(answers_bi)

    with thread_lock:
        tok_arrays[c], bigram_arrays[c] = all_tokens, all_bigrams
    sema.release()
    return all_tokens, all_bigrams


In [354]:
# Info about multiprocessing and threading in python: https://medium.com/python-experiments/parallelising-in-python-mutithreading-and-mutiprocessing-with-practical-templates-c81d593c1c49

sema = threading.Semaphore(value=16) #set maxthreads to to avoid out of memory error.

tok_arrays, bigram_arrays, thread_list = {}, {}, [] #thread_list used for multithreading
thread_lock = threading.Lock()
for c in communities:
    t = threading.Thread(target=extract_community_tokens, args=(communities, c, subset_answers_df, subset_questions_df,True))
    thread_list.append(t)
# start the threads
for thread in tqdm(thread_list):
    thread.start()
# wait for all threads to complete
for thread in thread_list:
    thread.join()

community_df = pd.DataFrame([tok_arrays]).T
community_df.columns = ['AllTokens']
community_df['AllBigrams'] = bigram_arrays

#Calculate again, but now using collect_answers_and_questions=False so tokens and bigrams are calculated for answers and questions.
tok_arrays, bigram_arrays, thread_list = {}, {}, []
thread_lock = threading.Lock()
for c in communities:
    t = threading.Thread(target=extract_community_tokens, args=(communities, c, subset_answers_df, subset_questions_df,False))
    thread_list.append(t)

for thread in tqdm(thread_list):
    thread.start()
for thread in thread_list:
    thread.join()

print('tok_array returned as [questions, answers]')        
community_df['QuestionsTokens'] = [tok_arrays[x][0] for x in tok_arrays]
community_df['AnswersTokens'] = [tok_arrays[x][1] for x in tok_arrays]
community_df['QuestionsBigrams'] = [bigram_arrays[x][0] for x in bigram_arrays]
community_df['AnswersBigrams'] = [bigram_arrays[x][1] for x in bigram_arrays]

#Multiply occurences in Question Title with alpha - weight Question Titles higher in TF-IDF.
alpha = 3 

tok_arrays, bigram_arrays, thread_list = {}, {}, []
thread_lock = threading.Lock()
for c in communities:
    t = threading.Thread(target=extract_community_tokens, args=(communities, c, subset_answers_df, subset_questions_df,True,alpha))
    thread_list.append(t)

for thread in tqdm(thread_list):
    thread.start()
for thread in thread_list:
    thread.join()


community_df_a3 = pd.DataFrame([tok_arrays]).T
community_df_a3.columns = ['AllTokens']
community_df_a3['AllBigrams'] = bigram_arrays

tok_arrays, bigram_arrays, thread_list = {}, {}, []
thread_lock = threading.Lock()
for c in communities:
    t = threading.Thread(target=extract_community_tokens, args=(communities, c, subset_answers_df, subset_questions_df,False,alpha))
    thread_list.append(t)

for thread in tqdm(thread_list):
    thread.start()
for thread in thread_list:
    thread.join()

print('tok_array returned as [questions, answers]')        
community_df_a3['QuestionsTokens_alpha_3'] = [tok_arrays[x][0] for x in tok_arrays]
community_df_a3['AnswersTokens_alpha_3'] = [tok_arrays[x][1] for x in tok_arrays]
community_df_a3['QuestionsBigrams_alpha_3'] = [bigram_arrays[x][0] for x in bigram_arrays]
community_df_a3['AnswersBigrams_alpha_3'] = [bigram_arrays[x][1] for x in bigram_arrays]


100%|██████████| 1045/1045 [02:35<00:00,  6.71it/s]
100%|██████████| 1045/1045 [02:36<00:00,  6.69it/s]
tok_array returned as [questions, answers]
100%|██████████| 1045/1045 [02:38<00:00,  6.57it/s]
100%|██████████| 1045/1045 [02:39<00:00,  6.55it/s]
tok_array returned as [questions, answers]


In [384]:
#count number of members in each community using np.unique()
indexes, counts = np.unique(list(best_partition.values()), return_counts=True)
community_df['community_size'] = [0]*len(indexes)
community_df_a3['community_size'] = [0]*len(indexes)
for i in indexes:
    community_df['community_size'].loc[i] = counts[i]
    community_df_a3['community_size'].loc[i] = counts[i]
    

community_df.to_csv(r'/work3/s204161/comp_social_science_data/community_df.csv', index = False)
community_df_a3.to_csv(r'/work3/s204161/comp_social_science_data/community_df_alpha3.csv', index = False)




In [6]:
community_df = pd.read_csv('/work3/s204161/comp_social_science_data/community_df.csv', encoding='utf-8')
community_df_a3 = pd.read_csv('/work3/s204161/comp_social_science_data/community_df_alpha3.csv', encoding='utf-8')

community_df.AllTokens = community_df.AllTokens.apply(lambda x: literal_eval(str(x)))
community_df.AllBigrams = community_df.AllBigrams.apply(lambda x: literal_eval(str(x)))

community_df_a3.AllTokens = community_df_a3.AllTokens.apply(lambda x: literal_eval(str(x)))
community_df_a3.AllBigrams = community_df_a3.AllBigrams.apply(lambda x: literal_eval(str(x)))

len(communities)

1082

In [11]:
community_df

Unnamed: 0,AllTokens,AllBigrams,QuestionsTokens,AnswersTokens,QuestionsBigrams,AnswersBigrams,community_size
0,"['add', 'bottom', 'padding', 'div', 'contains'...","[('add', 'bottom'), ('bottom', 'padding'), ('p...","['add', 'bottom', 'padding', 'div', 'contains'...","['box', 'model', 'hack', 'basically', 'providi...","[('add', 'bottom'), ('bottom', 'padding'), ('p...","[('box', 'model'), ('model', 'hack'), ('hack',...",871
1,"['rectangular', 'arrays', 'access', 'loop', 'r...","[('rectangular', 'arrays'), ('arrays', 'access...","['rectangular', 'arrays', 'access', 'loop', 'r...","['benchmark', 'results', 'access', 'arr1', 'th...","[('rectangular', 'arrays'), ('arrays', 'access...","[('benchmark', 'results'), ('results', 'access...",125
2,"['segfault', 'adding', 'variable', 'linq', 'sq...","[('segfault', 'adding'), ('adding', 'variable'...","['mime', 'type', 'json', 'returned', 'rest', '...","['common', 'use', 'cases', 'find', 'strings', ...","[('mime', 'type'), ('type', 'json'), ('json', ...","[('common', 'use'), ('use', 'cases'), ('cases'...",116
3,"['make', 'jquery', 'modify', 'one', 'div', 'in...","[('make', 'jquery'), ('jquery', 'modify'), ('m...","['segfault', 'adding', 'variable', 'linq', 'sq...","['used', 'virtualization', 'approach', 'using'...","[('segfault', 'adding'), ('adding', 'variable'...","[('used', 'virtualization'), ('virtualization'...",1763
4,"['xslt', 'buddy', 'available', 'somewhere', 's...","[('xslt', 'buddy'), ('buddy', 'available'), ('...","['cocoa', 'wo', 'capture', 'shift', 'modifier'...","['years', 'ago', 'iphone', 'web', 'browsers', ...","[('cocoa', 'wo'), ('wo', 'capture'), ('capture...","[('years', 'ago'), ('ago', 'iphone'), ('iphone...",300
...,...,...,...,...,...,...,...
1040,"['magento', 'country', 'codes', 'table', 'rate...","[('magento', 'country'), ('country', 'codes'),...","['use', 'panel', 'region', 'prism', 'prism', '...","['answer', 'found', 'nice', 'descriptive', 'bl...","[('use', 'panel'), ('panel', 'region'), ('regi...","[('answer', 'found'), ('found', 'nice'), ('nic...",1
1041,"['use', 'panel', 'region', 'prism', 'prism', '...","[('use', 'panel'), ('panel', 'region'), ('regi...","['magento', 'country', 'codes', 'table', 'rate...","['appears', 'answers', 'lie', 'following', 'fi...","[('magento', 'country'), ('country', 'codes'),...","[('appears', 'answers'), ('answers', 'lie'), (...",1
1042,"['work', 'canvas', 'jcanvas', 'netbeans', 'net...","[('work', 'canvas'), ('canvas', 'jcanvas'), ('...","['use', 'extensibility', 'dll', 'designer', 's...","['asked', 'different', 'question', 'bit', 'foc...","[('use', 'extensibility'), ('extensibility', '...","[('asked', 'different'), ('different', 'questi...",1
1043,"['soap', 'formatting', 'problems', 'change', '...","[('soap', 'formatting'), ('formatting', 'probl...","['work', 'canvas', 'jcanvas', 'netbeans', 'net...","['write', 'class', 'extends', 'canvas', 'jcanv...","[('work', 'canvas'), ('canvas', 'jcanvas'), ('...","[('write', 'class'), ('class', 'extends'), ('e...",1


In [16]:
#find top 9 communities by number of authors
top_9_communities = community_df.sort_values(by=['community_size'], ascending=False).iloc[:9]

#not clear if for each of 9 or across all 9 communities, across all is given:
#10 most frequent tokens in top 9 communities
fv = None
for i in range(9):
    fdist = nltk.FreqDist(top_9_communities['AllTokens'].iloc[i])
    if fv == None:
        fv = fdist
    else:
        fv = fv + fdist
fdist = fv
print(fdist.most_common(10))

#Token frequency distribution for each community:
flist = {}
for i in range(len(communities)):
    fdist = nltk.FreqDist(community_df['AllTokens'].loc[i])
    flist[i] = fdist

[('use', 35848), ('using', 28305), ('code', 27964), ('like', 27588), ('would', 26365), ('new', 22463), ('one', 20902), ('get', 20736), ('want', 20493), ('class', 20278)]


KeyError: 1045

In [395]:
top_9_communities

Unnamed: 0,AllTokens,AllBigrams,QuestionsTokens,AnswersTokens,QuestionsBigrams,AnswersBigrams,community_size
1,"[windows, xp, firewall, bug, css, issue, link,...","[(windows, xp), (xp, firewall), (firewall, bug...","[use, namespaces, sql, xml, query, nodes, comm...","[found, issue, thanks, mark, namespaces, must,...","[(use, namespaces), (namespaces, sql), (sql, x...","[(found, issue), (issue, thanks), (thanks, mar...",5171
4,"[restful, way, monitoring, rest, resource, cha...","[(restful, way), (way, monitoring), (monitorin...","[initialize, multidimensional, array, use, mem...","[int, myarray, myarray, new, int, tony, make, ...","[(initialize, multidimensional), (multidimensi...","[(int, myarray), (myarray, myarray), (myarray,...",3433
...,...,...,...,...,...,...,...
32,"[observer, possible, block, response, processi...","[(observer, possible), (possible, block), (blo...","[linux, system, manage, configurations, server...","[create, separate, program, started, root, pri...","[(linux, system), (system, manage), (manage, c...","[(create, separate), (separate, program), (pro...",968
2,"[add, bottom, padding, div, contains, floating...","[(add, bottom), (bottom, padding), (padding, d...","[add, bottom, padding, div, contains, floating...","[box, model, hack, basically, providing, ie, s...","[(add, bottom), (bottom, padding), (padding, d...","[(box, model), (model, hack), (hack, basically...",871


In [508]:
#idf for each token in corpus.
import math
import itertools
from tqdm import tqdm

def idf_for_token(token, communities,community_df):
    
    #faster time complexity than code from b4
    n_t = sum(community_df['AllTokens'].apply(lambda x:token in x))
    
    idf = int(math.log(len(communities)/n_t,10))
    with thread_lock:
        idf_dict[token] = idf
    return idf

tok_list =community_df['AllTokens']
unique_tokens  = list(set(list(itertools.chain.from_iterable(tok_list))))

idf_dict, thread_list = {}, [] #thread_list used for multithreading
thread_lock = threading.Lock()

for token in unique_tokens:
    t = threading.Thread(target=idf_for_token, args=(token, communities, community_df))
    thread_list.append(t)
# start the threads
for thread in tqdm(thread_list):
    thread.start()
# wait for all threads to complete
for thread in thread_list:
    thread.join()


##  IF RAN USING HPC - USE MULTITHREAD

KeyboardInterrupt: 

In [None]:
import itertools
import math
#Running using notebook, on login-node/personal pc
tok_list =community_df['AllTokens']
unique_tokens  = list(set(list(itertools.chain.from_iterable(tok_list))))

idf_dict = {}
def idf_for_token(token, communities,community_df):
    
    #faster time complexity than code from b4
    n_t = sum(community_df['AllTokens'].apply(lambda x:token in x))
    
    idf = int(math.log(len(communities)/n_t,10))
    return idf

for token in tqdm(unique_tokens):
    idf_dict[token] = idf_for_token(token, communities,community_df)

In [None]:
with open('/work3/s204161/comp_social_science_data/all_tokens_not_hpc.json', 'w') as outfile:
    json.dump(idf_dict, outfile)

In [None]:
### Much Faster Method But Consumes a lot of RAM. Has to be ran on HPC.

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# community_df = pd.read_csv('/work3/s204161/comp_social_science_data/community_df.csv', encoding='utf-8')
# community_df_a3 = pd.read_csv('/work3/s204161/comp_social_science_data/community_df_alpha3.csv', encoding='utf-8')

# community_df.AllTokens = community_df.AllTokens.apply(lambda x: literal_eval(str(x)))
# community_df.AllBigrams = community_df.AllBigrams.apply(lambda x: literal_eval(str(x)))

# community_df_a3.AllTokens = community_df_a3.AllTokens.apply(lambda x: literal_eval(str(x)))
# community_df_a3.AllBigrams = community_df_a3.AllBigrams.apply(lambda x: literal_eval(str(x)))


string_comms = [' '.join(doc) for doc in list(community_df['AllTokens'])]
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(2,2),norm=None)

bi_fit = vectorizer.fit_transform(string_comms)
#tf_idf_scores = bi_fit.toarray()
bi_names = (vectorizer.get_feature_names())
tf_idf_scores = bi_fit.sum(axis=0)


In [32]:
bigram_idf_list = []
for column_index,term in enumerate(bi_names):
    bigrams


In [33]:
X1 = vectorizer.fit_transform(string_comms)

In [20]:
toke_list = community_df['AllBigrams']

In [22]:
list(set(list(itertools.chain.from_iterable(toke_list))))[0]

('implementation', 'special')

In [37]:
(X1.toarray())

MemoryError: Unable to allocate 17.8 GiB for an array with shape (1045, 2291092) and data type float64

In [36]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.models.phrases import Phrases, Phraser

# define the set of tokenized documents
bigram_docs =  list(community_df['AllBigrams'])

# create bigrams from the tokenized documents
bigram_phrases = Phrases(docs, min_count=1, threshold=1)
bigram_phraser = Phraser(bigram_phrases)
bigram_docs = [bigram_phraser[doc] for doc in docs]
print(bigram_docs)

# create a Dictionary object from the tokenized documents
dictionary = Dictionary(bigram_docs)

# convert the tokenized documents into bag-of-words vectors
corpus = [dictionary.doc2bow(doc) for doc in bigram_docs]

# compute the TF-IDF values using the TfidfModel
tfidf_model = TfidfModel(corpus)
tfidf_corpus = tfidf_model[corpus]

# create a dataframe from the results
data = []
for doc in tfidf_corpus:
    data.extend([(dictionary[id], freq) for id, freq in doc])
df = pd.DataFrame(data, columns=['term', 'score'])

# get the top 7 ranking bigrams
bigrams = [term for term in dictionary.token2id.keys() if '_' in term]
words = df[df['term'].isin(bigrams)].sort_values('score', ascending=False).head(7)

print('Top 7 ranking bigrams:\n', words)


TypeError: decoding to str: need a bytes-like object, tuple found