In [None]:
"""
The central idea behind the project is to research communities on a subset of StackOverFlow Q&A using a network describing interactions between users. It is interesting to answer questions like- What is the modularity of the Network, How discernable are the communities from each other in terms of topics discussed & How do they differ? A hypothesis could be that each community uses primarily their own programming language. We could also google the top 5 authors by score or amount of activity to see their specialization - It can also be interesting to look at score, does the highest scoring post depend on community size and to what degree if? Are some communities better at resolving questions? Look at proportion of closed to non-closed questions.

The dataset we use for this is 10% of StackOverflow Q&A version 2 from 2019 & contains 2~ million~ questions & 1.2~ million answers. Also write features in both in powerpoint.

Dataset is licensed under CC-BY-SA 3.0 with attribution required - meaning it can freely be modified & used both for commercial and research purposes as long as attributed. Top contributors are Miljan Stojiljkovic, Niyamat Ullah, Kartik Garg - See link for rest of contributors.
https://www.kaggle.com/datasets/stackoverflow/stacksample


Outline:
    * Download Dataset. - Done
    * Clean out rows with NaN (Except if the NaN is because a question is still open) and clean out stopwords, links, line breaks, etc.- Done
    * DataScrape to add UserNames/Names using OwnerUserId, requests & stackexchange API: https://api.stackexchange.com/docs
        PS! Since bandwith/quota is not very little compared to dataset size, only top author names are found instead of whole dataset.
        
    * Make Directed Network with Networkx and calculate best community split using Louvain. Nodes are persons and an edge from one person to another means that the first person replies to a question written by the other person. If multiple replies to same question, count this and add both as attribute.
    * WordCloud for best community splits + other diagnostics to analyse social network.

"""

In [1]:
##Loading and Removing NA Rows. Is important to not use for loop, when a solution of constant time complexity exists, when handling data.

In [1]:
import pandas as pd
import numpy as np

answers = '/work3/s204161/comp_social_science_data/Answers.csv'
questions = '/work3/s204161/comp_social_science_data/Questions.csv'
tags = '/work3/s204161/comp_social_science_data/Tags.csv'

answers_df = pd.read_csv(answers, encoding='ISO-8859-1')
questions_df = pd.read_csv(questions, encoding='ISO-8859-1')
tags_df = pd.read_csv(tags)

In [2]:
#clean rows with NA values in them.
clean_answers_df = answers_df.dropna(subset=answers_df.columns, axis=0, how='any')
clean_questions_df = questions_df.dropna(subset=questions_df.columns.difference(['ClosedDate']), axis=0, how='any')
print(f'Answers_df\nSize before: {len(answers_df.index)}\nSize After: {len(clean_answers_df.index)}')
print(f'Questions_df\nSize before: {len(questions_df.index)}\nSize After: {len(clean_questions_df.index)}')

clean_questions_df['OwnerUserId'] = np.asarray(clean_questions_df.get('OwnerUserId'),dtype=int)
clean_answers_df['OwnerUserId'] = np.asarray(clean_answers_df.get('OwnerUserId'),dtype=int)

clean_answers_df.to_csv('/work3/s204161/comp_social_science_data/no_NA_Answers.csv', encoding='ISO-8859-1', index=False)
clean_questions_df.to_csv('/work3/s204161/comp_social_science_data/no_NA_Questions.csv', encoding='ISO-8859-1', index=False)


Answers_df
Size before: 2014516
Size After: 2001316
Questions_df
Size before: 1264216
Size After: 1249762


In [5]:
clean_answers_df

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,92,61,2008-08-01T14:45:37Z,90,13,"<p><a href=""http://svnbook.red-bean.com/"">Vers..."
1,124,26,2008-08-01T16:09:47Z,80,12,<p>I wound up using this. It is a kind of a ha...
2,199,50,2008-08-01T19:36:46Z,180,1,<p>I've read somewhere the human eye can't dis...
3,269,91,2008-08-01T23:49:57Z,260,4,"<p>Yes, I thought about that, but I soon figur..."
4,307,49,2008-08-02T01:49:46Z,260,28,"<p><a href=""http://www.codeproject.com/Article..."
...,...,...,...,...,...,...
2014511,40143247,333403,2016-10-19T23:42:35Z,40143190,0,"<p>Tanks to <a href=""http://stackoverflow.com/..."
2014512,40143322,642706,2016-10-19T23:50:35Z,40137110,1,<h1>tl;dr</h1>\n\n<pre><code>ZonedDateTime.par...
2014513,40143336,2239781,2016-10-19T23:52:08Z,40141860,0,<p>I came up with a very dirty workaround. Bef...
2014514,40143349,6934347,2016-10-19T23:54:02Z,40077010,0,<p>I solved my own problem defining the follow...


In [26]:
np.sort(clean_answers_df['ParentId'])

dtype('int64')

In [42]:
unique_users = np.unique(np.asarray(np.append(np.append(clean_questions_df.get('OwnerUserId'),clean_answers_df.get('OwnerUserId')),clean_answers_df.get('ParentId')),dtype=int))
print(f'unique_nodes: {len(unique_users)}')

unique_nodes: 1980582


In [28]:
np.append(clean_questions_df.get('OwnerUserId'),clean_answers_df.get('OwnerUserId')),clean_answers_df.get('ParentId')

0              177.0
1              164.0
2              313.0
3          2090091.0
4              400.0
             ...    
2014511          NaN
2014512          NaN
2014513          NaN
2014514          NaN
2014515          NaN
Length: 2011725, dtype: float64

In [38]:
np.append(np.append(clean_questions_df.get('OwnerUserId'),clean_answers_df.get('OwnerUserId')),clean_answers_df.get('ParentId'))

array([      26,       58,       83, ..., 40141860, 40077010, 40142910])

In [49]:
from community import community_louvain
best_partition = community_louvain.best_partition(nx_G)

ImportError: cannot import name 'community_louvain' from 'community' (/work3/s204161/miniconda/lib/python3.10/site-packages/community/__init__.py)

In [34]:
import networkx as nx
G = nx.DiGraph()
G.add_nodes_from(unique_user_ids)

0

In [45]:
import string
import nltk
from bs4 import BeautifulSoup
#nltk.download('stopwords')
#nltk.download('punkt')

tok_list = []
bad_tokens = set(['https:', 'http:', 'www.']) | set(nltk.corpus.stopwords.words('english')) | set(string.punctuation)
bad_tokens_2 = set(string.punctuation) | set(['[',']','(',')','{','}','<','>','/','\\','|',')',']'])

for i in range(len(clean_questions_df.get('OwnerUserId'))):
    body_text = clean_questions_df.iloc[i]['Body']
    clean_body_text = BeautifulSoup(body_text, 'html.parser').get_text()
    #print(clean_body_text) 321
    if i % 1000 == 0:
       print(i)
    tokens = nltk.word_tokenize(clean_body_text)
    tokens = [token.lower() for token in tokens]
    remove_indexes = []
    for i in range(len(tokens)):
        #remove bad tokens
        if any([(tokens[i] == bad_token) for bad_token in bad_tokens]):
            remove_indexes.append(i)
        elif any([(bad_token in tokens[i]) for bad_token in bad_tokens_2]):
            remove_indexes.append(i)
        elif tokens[i].isnumeric():
            remove_indexes.append(i)            
    
    for i in range(len(remove_indexes) - 1, -1, -1):
        remove_index = remove_indexes[i]
        del tokens[remove_index]

    if len(tokens) == 1:
        tokens.remove(tokens[0])

        #Maybe do biagrams, if not too computationally demanding, to get some temporal context
    tok_list.append(tokens)


clean_questions_df['tokens'] = tok_list

clean_questions_df.to_csv('/work3/s204161/comp_social_science_data/with_token_Questions.csv', encoding='ISO-8859-1', index=False)

#This takes some time to run, so is interrupted and ran overnight.

0
1000


KeyboardInterrupt: 

In [36]:
from bs4 import BeautifulSoup
owner_attrs = {}

owner_found_bool = {}
#Add question/owner attributes to nodes:
for i in range(len(clean_questions_df.get('OwnerUserId'))):
    question_id = str(int(clean_questions_df.iloc[i].get('Id')))
    owner_id = str(int(clean_questions_df.iloc[i].get('OwnerUserId')))
    if not owner_found_bool.get(owner_id):
        owner_found_bool[owner_id] = True
        owner_attrs[owner_id] = {}
        owner_attrs[owner_id]['scores'] = {}
    owner_attrs[owner_id][question_id] = BeautifulSoup(clean_questions_df.iloc[i].get('Body'), 'html.parser').get_text()
    owner_attrs[owner_id]['scores'][question_id] = (clean_questions_df.iloc[i].get('Score'))    
    #Should also save question title in graph - implement here:
    

    if i % 50000 == 0:
        print(i)


0
50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000
600000
650000
700000
750000
800000
850000
900000
950000
1000000
1050000
1100000
1150000
1200000


In [41]:
answers_attrs = {}
answers_found_bool = {}

collab_network = {}

#Add answer attributes to nodes & remember that edge is directed from OwnerUserId --> ParentId:
for i in range(len(clean_answers_df.get('OwnerUserId'))):
    answers_id = str(int(clean_answers_df.iloc[i].get('Id')))
    owner_id = str(int(clean_answers_df.iloc[i].get('OwnerUserId')))
    parent_id = str(int(clean_answers_df.iloc[i].get('ParentId')))
    if not answers_found_bool.get(owner_id):
        answers_found_bool[owner_id] = True
        answers_attrs[owner_id] = {}
    if not owner_found_bool.get(owner_id):
        owner_found_bool[owner_id] = True
        owner_attrs[owner_id] = {}
        owner_attrs[owner_id]['scores'] = {}

    owner_attrs[owner_id][answers_id] = BeautifulSoup(clean_answers_df.iloc[i].get('Body'), 'html.parser').get_text()
    
    if collab_network.get((owner_id, parent_id)) == None:
        collab_network[(owner_id, parent_id)] = 1
    else:
        collab_network[(owner_id, parent_id)] += 1

    #we want to find scores for each person for answers too.
    owner_attrs[owner_id]['scores'][answers_id] = clean_answers_df.iloc[i].get('Score')
    if i % 50000 == 0:
        print(i)

nx.set_node_attributes(G, answers_attrs, 'Questions')

0
50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000
600000
650000
700000
750000
800000
850000
900000
950000
1000000
1050000
1100000
1150000
1200000
1250000
1300000
1350000
1400000
1450000
1500000
1550000
1600000
1650000
1700000
1750000
1800000
1850000
1900000
1950000
2000000


In [144]:
for owner_id in unique_user_ids:
    if owner_attrs.get(owner_id):
        owner_attrs[owner_id]['best_score'] = np.max(list(owner_attrs[owner_id]['scores'].values()))
        owner_attrs[owner_id]['avg_score'] = np.mean(np.asarray(list(owner_attrs[owner_id]['scores'].values()),dtype=float))
        owner_attrs[owner_id]['median_score'] = np.median(list(owner_attrs[owner_id]['scores'].values()))


nx.set_node_attributes(G, owner_attrs, 'Questions')

In [63]:
#make edgelist and then edges in network.
edgelist = [(x[0][0], x[0][1], x[1]) for x in collab_network.items()]

G.add_weighted_edges_from(edgelist)


In [75]:
collab_network.items()

In [74]:
G.nodes('90')

NodeDataView({}, data='90')

In [162]:
import pickle
#Save Graph
with open('/work3/s204161/comp_social_science_data/stackoverflow_network.pickle', 'wb') as f:
    pickle.dump(G, f, protocol=pickle.HIGHEST_PROTOCOL)
#Graph is big, so check Google Drive Link instead for both social network and csv files: 
# https://drive.google.com/drive/folders/11gTEA4omR2T6JZRMFCww7e2B1DcoqpmO?usp=share_link

In [170]:
print(f'Number of nodes in G: {G.number_of_nodes()}')
print(f'Number of edges in G: {G.number_of_edges()}')
print(f'Edges for node 61 in G: {G.edges(str(61))}')

print(f'Info about Node 61 in G: {G.nodes[str(61)]}')


Number of nodes in G: 1975473
Number of edges in G: 1993272
Edges for node 61 in G: [('61', '90'), ('61', '24270'), ('61', '47980'), ('61', '51390'), ('61', '142340'), ('61', '526660'), ('61', '1581560'), ('61', '2520220'), ('61', '6242540'), ('61', '6553950')]
Info about Node 61 in G: {'Questions': {'scores': {'8800': 63, '787850': 0, '1581560': 4, '92': 13, '33759': 1, '48055': 7, '51394': 2, '142425': 0, '526668': 3, '1782256': 0, '2520280': 8, '6242610': 8, '6556872': 7}, '8800': "So I've been poking around with C# a bit lately, and all the Generic Collections have me a little confused. Say I wanted to represent a data structure where the head of a tree was a key value pair, and then there is one optional list of key value pairs below that (but no more levels than these). Would this be suitable?\npublic class TokenTree\n{\n    public TokenTree()\n    {\n        /* I must admit to not fully understanding this,\n         * I got it from msdn. As far as I can tell, IDictionary is an\n

In [None]:
# Code below would find username for all users in dataset. Due to Quotas it is not possible however, so after we find top authors in our best community-split, we can request some of the top author names using stackexchange API.

In [105]:

# Since bandwith/quota is not very little compared to dataset size, only top author names are found instead of whole dataset. This code should be applied on top authors only then.
import requests
import time
remaining_users = []
username_dict = {}
for user_index in range(0,len(unique_user_ids),100):
    unsuccesful_tries = 0
    url = f'https://api.stackexchange.com/2.3/users/{";".join(map(str, unique_user_ids[user_index:user_index + 100]))}?site=meta.stackoverflow'
    #url = f'https://api.stackexchange.com/2.3/users/{61}?site=meta.stackoverflow'
    response = requests.get(url)
    while response.status_code != 200:
        print(f'Failed with status code: {response.status_code}')
        time.sleep(5)
        if unsuccesful_tries == 3:
            remaining_users.append(unique_user_ids[user_index:user_index + 100])
            continue

    #data = response.json()['items'][0]
    #username_dict[str(user)] = data['display_name']
    print('break for test')
    break
    

break for test


In [118]:
set1 = set(np.asarray(clean_answers_df.get('Id'),dtype=str))
set2 = set(np.asarray(clean_questions_df.get('Id'),dtype=str))
#Id is unique in whole dataset/on stackoverflow page - is not shared between answers & questions. thats why intersection is empty
set1.intersection(set2)


set()

In [13]:
import pickle
import networkx as nx
#Save Graph
with open('/work3/s204161/comp_social_science_data/stackoverflow_network.pickle', 'rb') as f:
    G = pickle.load(f)
#Graph is big, so check Google Drive Link instead for both social network and csv files: 
# https://drive.google.com/drive/folders/11gTEA4omR2T6JZRMFCww7e2B1DcoqpmO?usp=share_link

In [25]:
seen_nodes = {}

for node in G.nodes:
    if seen_nodes.get(int(node)):
        print('fucker is seen >:-(', node)
    seen_nodes[node] = True

In [None]:
######################
# NEW CODE STARTS HERE using full tokenized answers & questions
######################

In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval

answers = '/work3/s204161/comp_social_science_data/with_token_Answers.csv'
questions = '/work3/s204161/comp_social_science_data/with_token_Questions.csv'
questions_df = pd.read_csv(questions, encoding='utf-8')
answers_df = pd.read_csv(answers, encoding='utf-8')


In [2]:
answers_df.tokens = answers_df.tokens.apply(lambda x: literal_eval(str(x)))

In [213]:
#subset by Time
subset_questions_df = questions_df[questions_df.CreationDate.str[:4].astype('int') == 2009]
subset_answers_df = answers_df[(2010 > answers_df.CreationDate.str[:4].astype('int')) & (answers_df.CreationDate.str[:4].astype('int') >= 2009)]


In [215]:
#make dict to ensure linear time instead of quadratic
subset_question_dict = {}
for ques in list(subset_questions_df.Id):
    subset_question_dict[ques] = True

In [216]:
import tqdm
for ans_index in tqdm.tqdm(range(len(subset_answers_df)-1, -1,-1)):
    parid = subset_answers_df.iloc[ans_index].ParentId
    if not subset_question_dict.get(parid):
        subset_answers_df.drop(subset_answers_df.index[ans_index], inplace=True)
        #question_dict.get(subset_answers_df.iloc[ans_index].ParentId)


100%|██████████| 92887/92887 [01:36<00:00, 959.57it/s]


In [310]:
print('question_df_length:',len(subset_questions_df))
print('answers_df_length:',len(answers_df[(2010 > answers_df.CreationDate.str[:4].astype('int')) & (answers_df.CreationDate.str[:4].astype('int') >= 2009)]))
print('answers_df_length_after_removing answers to questions from 2008 or previous years:', len(subset_answers_df))

question_df_length: 31088
answers_df_length: 92887
answers_df_length_after_removing answers to questions from 2008 or previous years: 82497


In [322]:
unique_users = np.unique(np.asarray(np.asarray(np.append(subset_questions_df.get('OwnerUserId'),subset_answers_df.get('OwnerUserId')),dtype=int),dtype=str))
print(f'unique_nodes: {len(unique_users)}')


unique_nodes: 30133


In [225]:
import string
import nltk
from bs4 import BeautifulSoup
def tokenize_col(dat_frame_df,colname):
    nltk.download('stopwords')
    nltk.download('punkt')

    #tok_list = []
    bad_tokens = set(['https:', 'http:', 'www.']) | set(nltk.corpus.stopwords.words('english')) | set(string.punctuation)
    bad_tokens_2 = set(string.punctuation) | set(['[',']','(',')','{','}','<','>','/','\\','|',')',']'])

    tok_list = []
    for i in range(len(dat_frame_df.get('OwnerUserId'))):
        body_text = dat_frame_df.iloc[i][colname]
        clean_body_text = BeautifulSoup(body_text, 'html.parser').get_text()
        #print(clean_body_text) 321
        if i % 5000 == 0:
            print(i)
        tokens = nltk.word_tokenize(clean_body_text)
        tokens = [token.lower() for token in tokens]
        remove_indexes = []
        for i in range(len(tokens)):
            #remove bad tokens
            if any([(tokens[i] == bad_token) for bad_token in bad_tokens]):
                remove_indexes.append(i)
            elif any([(bad_token in tokens[i]) for bad_token in bad_tokens_2]):
                remove_indexes.append(i)
            elif tokens[i].isnumeric():
                remove_indexes.append(i)            
        
        for i in range(len(remove_indexes) - 1, -1, -1):
            remove_index = remove_indexes[i]
            del tokens[remove_index]

        if len(tokens) == 1:
            tokens.remove(tokens[0])
            #Maybe do biagrams, if not too computationally demanding, to get temporal context
        tok_list.append(tokens)

    return tok_list 



In [226]:
subset_questions_df['TitleTokens'] = tokenize_col(subset_questions_df,'Title')

[nltk_data] Downloading package stopwords to
[nltk_data]     /zhome/a7/0/155527/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /zhome/a7/0/155527/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
0
5000
10000
15000
20000
25000
30000


In [227]:
subset_questions_df.to_csv('/work3/s204161/comp_social_science_data/subset_questions.csv', encoding='utf-8', index=False)
subset_answers_df.to_csv('/work3/s204161/comp_social_science_data/subset_answers.csv', encoding='utf-8', index=False)



In [329]:
import networkx as nx
G = nx.DiGraph()
G.add_nodes_from(unique_users)

In [330]:
import string
import nltk
from bs4 import BeautifulSoup
quest_attrs = {}

quest_found_bool = {}
#Add question/owner attributes to nodes:
for i in range(len(subset_questions_df.get('OwnerUserId'))):
    question_id = str(subset_questions_df.iloc[i].get('Id'))
    owner_id = str(int(subset_questions_df.iloc[i].get('OwnerUserId')))
    #If first time user is asking, create dict
    if not quest_found_bool.get(owner_id):
        quest_found_bool[owner_id] = True
        quest_attrs[owner_id] = {}
        quest_attrs[owner_id]['Scores'] = {}
    #owner_attrs[owner_id][question_id] = BeautifulSoup(subset_questions_df.iloc[i].get('Body'), 'html.parser').get_text()
    quest_attrs[owner_id]['Scores'][question_id] = (subset_questions_df.iloc[i].get('Score'))    
    #Should also save question title in graph - implement here:

    if i % 10000 == 0:
        print(i)
nx.set_node_attributes(G, quest_attrs, 'Questions')

0
10000
20000
30000


In [338]:
answers_attrs = {}
answers_found_bool = {}

collab_network = {}

#Add answer attributes to nodes & remember that edge is directed from OwnerUserId --> OwnerUserId (parent):
for i in range(len(subset_answers_df.get('OwnerUserId'))):
    answers_id = str(subset_answers_df.iloc[i].get('Id'))
    owner_id = str(int(subset_answers_df.iloc[i].get('OwnerUserId')))
    post_parent_id = str(int(subset_answers_df.iloc[i].get('ParentId')))
    owner_parent_id = str(int(subset_questions_df[subset_questions_df.Id == int(post_parent_id)].OwnerUserId))
    #if first time user is answering create dict
    if not answers_found_bool.get(owner_id):
        answers_found_bool[owner_id] = True
        answers_attrs[owner_id] = {}
        answers_attrs[owner_id]['Scores'] = {}
    #owner_attrs[owner_id][answers_id] = BeautifulSoup(clean_answers_df.iloc[i].get('Body'), 'html.parser').get_text()

    if not collab_network.get((owner_id, owner_parent_id)):
        collab_network[(owner_id, owner_parent_id)] = 1
    else:
        collab_network[(owner_id, owner_parent_id)] += 1

    #we want to find scores for each person for answers too.
    answers_attrs[owner_id]['Scores'][answers_id] = subset_answers_df.iloc[i].get('Score')
    if i % 10000 == 0:
        print(i)

nx.set_node_attributes(G, answers_attrs, 'Answers')

0
10000
20000
30000
40000
50000
60000
70000
80000


In [293]:

#Should be updated if calculate for best, avg. median...
# for owner_id in unique_users:
#     if owner_attrs.get(owner_id):
#         owner_attrs[owner_id]['BestScore'] = np.max(list(owner_attrs[owner_id]['Scores'].values()))
#         owner_attrs[owner_id]['AvgScore'] = np.mean(np.asarray(list(owner_attrs[owner_id]['scores'].values()),dtype=float))
#         owner_attrs[owner_id]['MedianScore'] = np.median(list(owner_attrs[owner_id]['scores'].values()))


# nx.set_node_attributes(G, owner_attrs, 'Questions')



In [339]:
#make edgelist and then edges in network.
edgelist = [(x[0][0], x[0][1], x[1]) for x in collab_network.items()]

G.add_weighted_edges_from(edgelist)


In [362]:
import pickle
with open('/work3/s204161/comp_social_science_data/stackoverflow_subset_network.pickle', 'wb') as f:
    pickle.dump(G, f, protocol=pickle.HIGHEST_PROTOCOL)

In [340]:
G.in_edges('404430')

InEdgeDataView([('31641', '4'), ('43219', '4'), ('40650', '4'), ('4', '4'), ('130812', '4')])

In [348]:
G.nodes['85162'] #some users are only active once a year, so subsetting by "time" will not encapsulate all posts made by a person who is active throughout many years. maybe do it by tag instead?

{'Answers': {'Scores': {'701489': 0, '701781': 3}}}

In [356]:
#I
questions_df[questions_df.OwnerUserId.astype(int).astype(str) == '85162']

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body,tokens
77691,3558350,85162.0,2010-08-24T15:52:41Z,,1,Cross domain script tag working in FF and Chro...,<p>We are providing a snippit of HTML that our...,"['providing', 'snippit', 'html', 'client', 'em..."
702614,24291060,85162.0,2014-06-18T16:48:51Z,,0,HQL join query to join to a single row of many...,<p>This HQL query has been driving me up a wal...,"['hql', 'query', 'driving', 'wall', 'hope', 's..."


In [357]:
answers_df[answers_df.OwnerUserId.astype(int).astype(str) == '85162']

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body,tokens
38133,701489,85162.0,2009-03-31T15:10:50Z,701200,0,<p>Try to get the object back from your extern...,"[try, get, object, back, external, interface, ..."
38156,701781,85162.0,2009-03-31T16:08:37Z,701200,3,"<p><a href=""http://osmanu.com/flex/EITest/EITe...","[working, sample, based, code, provided, right..."


In [355]:
answers_df.OwnerUserId

0               61.0
1               26.0
2               50.0
3               91.0
4               49.0
             ...    
2001311     333403.0
2001312     642706.0
2001313    2239781.0
2001314    6934347.0
2001315    4464432.0
Name: OwnerUserId, Length: 2001316, dtype: float64