In [None]:
"""
The central idea behind the project is to research communities on a subset of StackOverFlow Q&A using a network describing interactions between users. It is interesting to answer questions like- What is the modularity of the Network, How discernable are the communities from each other in terms of topics discussed & How do they differ? A hypothesis could be that each community uses primarily their own programming language. We could also google the top 5 authors by score or amount of activity to see their specialization - It can also be interesting to look at score, does the highest scoring post depend on community size and to what degree if? Are some communities better at resolving questions? Look at proportion of closed to non-closed questions.

The dataset we use for this is 10% of StackOverflow Q&A version 2 from 2019 & contains 2~ million~ questions & 1.2~ million answers. Also write features in both in powerpoint.

Dataset is licensed under CC-BY-SA 3.0 with attribution required - meaning it can freely be modified & used both for commercial and research purposes as long as attributed. Top contributors are Miljan Stojiljkovic, Niyamat Ullah, Kartik Garg - See link for rest of contributors.
https://www.kaggle.com/datasets/stackoverflow/stacksample


Outline:
    * Download Dataset. - Done
    * Clean out rows with NaN (Except if the NaN is because a question is still open) and clean out stopwords, links, line breaks, etc.- Done
    * DataScrape to add UserNames/Names using OwnerUserId, requests & stackexchange API: https://api.stackexchange.com/docs
        PS! Since bandwith/quota is not very little compared to dataset size, only top author names are found instead of whole dataset.
        
    * Make Directed Network with Networkx and calculate best community split using Louvain. Nodes are persons and an edge from one person to another means that the first person replies to a question written by the other person. If multiple replies to same question, count this and add both as attribute.
    * WordCloud for best community splits + other diagnostics to analyse social network.

"""

In [1]:
##Loading and Removing NA Rows. Is important to not use for loop, when a solution of constant time complexity exists, when handling data.

In [1]:
import pandas as pd
import numpy as np

answers = '/work3/s204161/comp_social_science_data/Answers.csv'
questions = '/work3/s204161/comp_social_science_data/Questions.csv'
tags = '/work3/s204161/comp_social_science_data/Tags.csv'

answers_df = pd.read_csv(answers, encoding='ISO-8859-1')
questions_df = pd.read_csv(questions, encoding='ISO-8859-1')
tags_df = pd.read_csv(tags)

In [2]:
#clean rows with NA values in them.
clean_answers_df = answers_df.dropna(subset=answers_df.columns, axis=0, how='any')
clean_questions_df = questions_df.dropna(subset=questions_df.columns.difference(['ClosedDate']), axis=0, how='any')
print(f'Answers_df\nSize before: {len(answers_df.index)}\nSize After: {len(clean_answers_df.index)}')
print(f'Questions_df\nSize before: {len(questions_df.index)}\nSize After: {len(clean_questions_df.index)}')

clean_answers_df.to_csv('/work3/s204161/comp_social_science_data/no_NA_Answers.csv', encoding='ISO-8859-1', index=False)
clean_questions_df.to_csv('/work3/s204161/comp_social_science_data/no_NA_Questions.csv', encoding='ISO-8859-1', index=False)

Answers_df
Size before: 2014516
Size After: 2001316
Questions_df
Size before: 1264216
Size After: 1249762


In [14]:
print(clean_answers_df.iloc[0])
clean_answers_df.iloc[0].get('Body')


Id                                                             92
OwnerUserId                                                  61.0
CreationDate                                 2008-08-01T14:45:37Z
ParentId                                                       90
Score                                                          13
Body            <p><a href="http://svnbook.red-bean.com/">Vers...
Name: 0, dtype: object


'<p><a href="http://svnbook.red-bean.com/">Version Control with Subversion</a></p>\r\n\r\n<p>A very good resource for source control in general. Not really TortoiseSVN specific, though.</p>'

In [7]:
clean_questions_df

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
...,...,...,...,...,...,...,...
1264211,40143210,5610777.0,2016-10-19T23:38:01Z,,0,URL routing in PHP (MVC),<p>I am building a custom MVC project and I ha...
1264212,40143300,3791161.0,2016-10-19T23:48:09Z,,0,Bigquery.Jobs.Insert - Resumable Upload?,<p>The API docs show that you should be able t...
1264213,40143340,7028647.0,2016-10-19T23:52:50Z,,1,Obfuscating code in android studio,<p>Under minifyEnabled I changed from false to...
1264214,40143360,871677.0,2016-10-19T23:55:24Z,,0,How to fire function after v-model change?,<p>I have input which I use to filter my array...


In [3]:
unique_users = np.unique([x for x in np.unique(clean_questions_df.get('OwnerUserId')+clean_answers_df.get('OwnerUserId'))])
print(f'unique_nodes: {len(unique_users)}')

unique_nodes: 1131290


In [79]:
unique_user_ids = np.asarray(np.asarray(np.unique(clean_answers_df.get('OwnerUserId')+clean_answers_df.get('OwnerUserId')),dtype=int),dtype=str)
pd.isna(np.asarray(np.unique(clean_answers_df.get('OwnerUserId')+clean_answers_df.get('OwnerUserId')),dtype=int)).any() #check if any NAN - should be false

False

In [81]:
import networkx as nx
G = nx.DiGraph()
G.add_nodes_from(unique_user_ids)

[nltk_data] Downloading package stopwords to
[nltk_data]     /zhome/a7/0/155527/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
import string
import nltk
from bs4 import BeautifulSoup
#nltk.download('stopwords')
#nltk.download('punkt')

tok_list = []
bad_tokens = set(['https:', 'http:', 'www.']) | set(nltk.corpus.stopwords.words('english')) | set(string.punctuation)
bad_tokens_2 = set(string.punctuation) | set(['[',']','(',')','{','}','<','>','/','\\','|',')',']'])

for i in range(len(clean_questions_df.get('OwnerUserId'))):
    body_text = clean_questions_df.iloc[i]['Body']
    clean_body_text = BeautifulSoup(body_text, 'html.parser').get_text()
    #print(clean_body_text) 321
    if i % 1000 == 0:
       print(i)
    tokens = nltk.word_tokenize(clean_body_text)
    tokens = [token.lower() for token in tokens]
    remove_indexes = []
    for i in range(len(tokens)):
        #remove bad tokens
        if any([(tokens[i] == bad_token) for bad_token in bad_tokens]):
            remove_indexes.append(i)
        elif any([(bad_token in tokens[i]) for bad_token in bad_tokens_2]):
            remove_indexes.append(i)
        elif tokens[i].isnumeric():
            remove_indexes.append(i)            
    
    for i in range(len(remove_indexes) - 1, -1, -1):
        remove_index = remove_indexes[i]
        del tokens[remove_index]

    if len(tokens) == 1:
        tokens.remove(tokens[0])

        #Maybe do biagrams, if not too computationally demanding, to get some temporal context
    tok_list.append(tokens)


clean_questions_df['tokens'] = tok_list

clean_questions_df.to_csv('/work3/s204161/comp_social_science_data/with_token_Questions.csv', encoding='ISO-8859-1', index=False)

#This takes some time to run, so is interrupted and ran overnight.

0
1000


KeyboardInterrupt: 

In [121]:
owner_attrs = {}

owner_found_bool = {}
#Add question/owner attributes to nodes:
for i in range(len(clean_questions_df.get('OwnerUserId'))):
    question_id = str(clean_questions_df.iloc[i].get('Id'))
    owner_id = str(int(clean_questions_df.iloc[i].get('OwnerUserId')))
    if not owner_found_bool.get(owner_id):
        owner_found_bool[owner_id] = True
        owner_attrs[owner_id] = {}
        owner_attrs[owner_id]['scores'] = {}
    owner_attrs[owner_id][question_id] = BeautifulSoup(clean_questions_df.iloc[i].get('Body'), 'html.parser').get_text()
    owner_attrs[owner_id]['scores'][question_id] = (clean_questions_df.iloc[i].get('Score'))    
    #Should also save question title in graph - implement here:
    

    if i % 50000 == 0:
        print(i)


0
50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000
600000
650000
700000
750000
800000
850000
900000
950000
1000000
1050000
1100000
1150000
1200000


In [129]:
answers_attrs = {}
answers_found_bool = {}

collab_network = {}

#Add answer attributes to nodes & remember that edge is directed from OwnerUserId --> ParentId:
for i in range(len(clean_answers_df.get('OwnerUserId'))):
    answers_id = str(clean_answers_df.iloc[i].get('Id'))
    owner_id = str(int(clean_answers_df.iloc[i].get('OwnerUserId')))
    parent_id = str(int(clean_answers_df.iloc[i].get('ParentId')))
    if not answers_found_bool.get(owner_id):
        answers_found_bool[owner_id] = True
        answers_attrs[owner_id] = {}
    if not owner_found_bool.get(owner_id):
        owner_found_bool[owner_id] = True
        owner_attrs[owner_id] = {}
        owner_attrs[owner_id]['scores'] = {}

    owner_attrs[owner_id][answers_id] = BeautifulSoup(clean_answers_df.iloc[i].get('Body'), 'html.parser').get_text()
    
    if not collab_network.get((owner_id, parent_id)):
        collab_network[(owner_id, parent_id)] = 1
    else:
        collab_network[(owner_id, parent_id)] += 1

    #we want to find scores for each person for answers too.
    owner_attrs[owner_id]['scores'][answers_id] = clean_answers_df.iloc[i].get('Score')
    if i % 50000 == 0:
        print(i)

nx.set_node_attributes(G, answers_attrs, 'Questions')

0
50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000
600000
650000
700000
750000
800000
850000
900000
950000
1000000
1050000
1100000
1150000
1200000
1250000
1300000
1350000
1400000
1450000
1500000
1550000
1600000
1650000
1700000
1750000
1800000
1850000
1900000
1950000
2000000


In [144]:
for owner_id in unique_user_ids:
    if owner_attrs.get(owner_id):
        owner_attrs[owner_id]['best_score'] = np.max(list(owner_attrs[owner_id]['scores'].values()))
        owner_attrs[owner_id]['avg_score'] = np.mean(np.asarray(list(owner_attrs[owner_id]['scores'].values()),dtype=float))
        owner_attrs[owner_id]['median_score'] = np.median(list(owner_attrs[owner_id]['scores'].values()))


nx.set_node_attributes(G, owner_attrs, 'Questions')

In [159]:
#make edgelist and then edges in network.
edgelist = [(x[0][0], x[0][1], x[1]) for x in collab_network.items()]

G.add_weighted_edges_from(edgelist)


In [162]:
import pickle
#Save Graph
with open('/work3/s204161/comp_social_science_data/stackoverflow_network.pickle', 'wb') as f:
    pickle.dump(G, f, protocol=pickle.HIGHEST_PROTOCOL)
#Graph is big, so check Google Drive Link instead for both social network and csv files: 
# https://drive.google.com/drive/folders/11gTEA4omR2T6JZRMFCww7e2B1DcoqpmO?usp=share_link

In [170]:
print(f'Number of nodes in G: {G.number_of_nodes()}')
print(f'Number of edges in G: {G.number_of_edges()}')
print(f'Edges for node 61 in G: {G.edges(str(61))}')

print(f'Info about Node 61 in G: {G.nodes[str(61)]}')


Number of nodes in G: 1975473
Number of edges in G: 1993272
Edges for node 61 in G: [('61', '90'), ('61', '24270'), ('61', '47980'), ('61', '51390'), ('61', '142340'), ('61', '526660'), ('61', '1581560'), ('61', '2520220'), ('61', '6242540'), ('61', '6553950')]
Info about Node 61 in G: {'Questions': {'scores': {'8800': 63, '787850': 0, '1581560': 4, '92': 13, '33759': 1, '48055': 7, '51394': 2, '142425': 0, '526668': 3, '1782256': 0, '2520280': 8, '6242610': 8, '6556872': 7}, '8800': "So I've been poking around with C# a bit lately, and all the Generic Collections have me a little confused. Say I wanted to represent a data structure where the head of a tree was a key value pair, and then there is one optional list of key value pairs below that (but no more levels than these). Would this be suitable?\npublic class TokenTree\n{\n    public TokenTree()\n    {\n        /* I must admit to not fully understanding this,\n         * I got it from msdn. As far as I can tell, IDictionary is an\n

In [None]:
# Code below would find username for all users in dataset. Due to Quotas it is not possible however, so after we find top authors in our best community-split, we can request some of the top author names using stackexchange API.

In [105]:

# Since bandwith/quota is not very little compared to dataset size, only top author names are found instead of whole dataset. This code should be applied on top authors only then.
import requests
import time
remaining_users = []
username_dict = {}
for user_index in range(0,len(unique_user_ids),100):
    unsuccesful_tries = 0
    url = f'https://api.stackexchange.com/2.3/users/{";".join(map(str, unique_user_ids[user_index:user_index + 100]))}?site=meta.stackoverflow'
    #url = f'https://api.stackexchange.com/2.3/users/{61}?site=meta.stackoverflow'
    response = requests.get(url)
    while response.status_code != 200:
        print(f'Failed with status code: {response.status_code}')
        time.sleep(5)
        if unsuccesful_tries == 3:
            remaining_users.append(unique_user_ids[user_index:user_index + 100])
            continue
    
    #data = response.json()['items'][0]
    #username_dict[str(user)] = data['display_name']
    print('break for test')
    break
    

break for test


In [118]:
set1 = set(np.asarray(clean_answers_df.get('Id'),dtype=str))
set2 = set(np.asarray(clean_questions_df.get('Id'),dtype=str))
#Id is unique in whole dataset/on stackoverflow page - is not shared between answers & questions.
set1.intersection(set2)


set()