In [5]:
import os
import re
import operator
# import simplejson
import numpy as np
import scipy.stats as scp
import pandas as pd
from lxml import objectify
from pandas import Series, DataFrame
from collections import Counter
import matplotlib.pyplot as plt

from xml.etree import ElementTree as ET


tags_xlxpath = "Tags.xlsx"
post_xlspath = "Posts.xlsx"
user_xlspath = "Users.xlsx"
cmnt_xlspath = "Comments.xlsx"
votes_xlspath = "Votes.xlsx"
badges_xlspath = "Badges.xlsx"
postlinks_path = "PostLinks.xlsx"

post_records = pd.read_excel(post_xlspath)
tags_records = pd.read_excel(tags_xlxpath, header=1)
user_records = pd.read_excel(user_xlspath, header=1)
vote_records = pd.read_excel(votes_xlspath, header=1)



# postlinks_rec = pd.read_excel(postlinks_path, header=1)
# bdges_records = pd.read_excel(badges_xlspath, header=1)
# comnt_records = pd.read_excel(cmnt_xlspath, header=1)



# Global function
def header_cleaner(strng):
    """ Clean column header for display"""
    match = re.search(r'@\w+', strng)
    if match:
        return match.group().replace('@','')
    else:
        return strng

## What fraction of posts contain the 5th most popular tag?

In [11]:
sorted_tags[:6]

[('r', 7244),
 ('regression', 5413),
 ('time-series', 2737),
 ('machine-learning', 2564),
 ('probability', 2081),
 ('hypothesis-testing', 2000)]

In [6]:

# Visualize table's structure and identify (tags of interest).

tags_records.columns = [header_cleaner(col) for col in tags_records.keys()]
print(tags_records.head(5))


tags_dict = pd.Series(tags_records.Count.values, index=tags_records.TagName.values).to_dict()
sorted_tags = sorted(tags_dict.items(), key=operator.itemgetter(1), reverse=True)
print("\n\n Top 5 Tags \n============\n", sorted_tags[:5])


# For length/numbers of posts. We do not care about post without tags.
total_posts = post_records.PostTypeId.count()
print("\n\nlength of posts : ", total_posts)
poptag = sorted_tags[5][1]/total_posts

print("\n\nFraction of post having 5th most popular tag:")
print("\n\n Freq. of 5th popular tag DIVIDED by Length of posts")
print("\n %s / %s = %.9f" %(sorted_tags[5][1], total_posts, poptag ))


   Count  ExcerptPostId  Id      TagName  WikiPostId
0   1342          20258   1     bayesian       20257
1    168          62158   2        prior       62157
2      6            NaN   3  elicitation         NaN
3    191          67815   4    normality       67814
4     13            NaN   5  open-source         NaN


 Top 5 Tags 
 [('r', 7244), ('regression', 5413), ('time-series', 2737), ('machine-learning', 2564), ('probability', 2081)]


length of posts :  91976


Fraction of post having 5th most popular tag:


 Freq. of 5th popular tag DIVIDED by Length of posts

 2000 / 91976 = 0.021744803


## How much higher is the average answer's score than the average question's?

In [3]:
# How much higher is the average answer's score than the average question's?
# Cleanup the post table for display
post_records.columns = [header_cleaner(col) for col in post_records.keys()]

# Sample display of a single record
post_records[:1]

print("Total of different post types \t where 1: Question \t 2: Answer\n")
print(post_records.PostTypeId.value_counts())


stk_posts = pd.DataFrame(post_records, columns=['PostTypeId', 'Score'])
questions = stk_posts[stk_posts["PostTypeId"]==1]
answers = stk_posts[stk_posts["PostTypeId"]==2]

avg_score_diff = answers.Score.mean() - questions.Score.mean()

print("\n\n Mean Answ Score - Mean Ques. Score : %.9f" %avg_score_diff)

Total of different post types 	 where 1: Question 	 2: Answer

2    47755
1    42921
5      640
4      640
6        9
3        6
7        5
dtype: int64


 Mean Answ Score - Mean Ques. Score : 0.646649100


## What is the Pearson's correlation between a user's reputation and total score from posts (for valid users)?

In [4]:
# What is the Pearson's correlation between a user's reputation and total score from posts (for valid users)?

user_records.columns = [header_cleaner(col) for col in user_records.keys()]

user_reput = Series(user_records.Reputation.values, index = user_records.AccountId.values).to_dict()
usr_post_scr = Series(post_records.Score.values, index = post_records.OwnerUserId.values).to_dict()

# Compute total score of valid users. 
valid_users = {key:0 for (key, value) in user_reput.items()}

for (key, value) in usr_post_scr.items():
    if key in valid_users.keys():
        valid_users[key] += value


        
user_reputation = np.array([rep for rep in user_reput.values()])

valid_users_score = np.array([value for value in valid_users.values()])

# # Pearson test using scipy module
prsnr_test = scp.stats.pearsonr(user_reputation, valid_users_score)

# #
print("(p-corr, p-signf) = ", prsnr_test)

print("\nPearson corr test : %.9f" %prsnr_test[0])



(p-corr, p-signf) =  (0.0032314629836997276, 0.51640661691877243)

Pearson corr test : 0.003231463


## How many more upvotes does the average answer receive than the average question?

In [5]:
# How many more upvotes does the average answer receive than the average question?

vote_records.columns = [header_cleaner(key) for key in vote_records.keys()]

# Extract differnet categories of posts.
vote_categs = pd.DataFrame(vote_records, columns=['PostId', 'VoteTypeId'])
print("Vote types - \t  1: Accepted \t 2: UPvotes \t 3: DOWNvotes \t 4: Offensive \t 5: FAVorites\n")

post_categ = DataFrame(post_records, columns=['Id', 'PostTypeId'])
post_categ.columns = ['PostId', 'PostTypeId']

tot_quetn = len(post_categ[post_categ['PostTypeId'] ==1])
tot_answer = len(post_categ[post_categ['PostTypeId'] ==2])


vote_post_type = pd.merge(post_categ, vote_categs, on='PostId', how='inner', sort=True)

is_upvote = vote_post_type['VoteTypeId'] == 2

upvoted_posts = vote_post_type[is_upvote]

upvoted_ques = len(upvoted_posts[upvoted_posts['PostTypeId']==1])

upvoted_answr = len(upvoted_posts[upvoted_posts['PostTypeId']==2])

ans = upvoted_answr/tot_answer
que = upvoted_ques/tot_quetn

print("Average Upvoted Answer: %.6f " %ans)

print("Avegrage Upvoted Question %.6f " %que)

print("\nDifference: %0.9f" %(ans-que))

Vote types - 	  1: Accepted 	 2: UPvotes 	 3: DOWNvotes 	 4: Offensive 	 5: FAVorites

Average Upvoted Answer: 3.192357 
Avegrage Upvoted Question 2.555043 

Difference: 0.637313835


## Subimission

In [187]:
import os
import re
import operator
import simplejson
import numpy as np
import scipy.stats as scp
import pandas as pd
from lxml import objectify
from pandas import Series, DataFrame
from collections import Counter
import matplotlib.pyplot as plt

from xml.etree import ElementTree as ET


tags_xlxpath = "Tags.xlsx"
post_xlspath = "Posts.xlsx"
user_xlspath = "Users.xlsx"
cmnt_xlspath = "Comments.xlsx"
votes_xlspath = "Votes.xlsx"
badges_xlspath = "Badges.xlsx"
postlinks_path = "PostLinks.xlsx"

post_records = pd.read_excel(post_xlspath)
tags_records = pd.read_excel(tags_xlxpath, header=1)
user_records = pd.read_excel(user_xlspath, header=1)
vote_records = pd.read_excel(votes_xlspath, header=1)

# Global function
def header_cleaner(strng):
    """ Clean column header for display"""
    match = re.search(r'@\w+', strng)
    if match:
        return match.group().replace('@','')
    else:
        return strng

# What fraction of posts contain the 5th most popular tag?

# Visualize table's structure and identify (tags of interest).

tags_records.columns = [header_cleaner(col) for col in tags_records.keys()]
print(tags_records.head(5))


tags_dict = pd.Series(tags_records.Count.values, index=tags_records.TagName.values).to_dict()
sorted_tags = sorted(tags_dict.items(), key=operator.itemgetter(1), reverse=True)
print("\n\n Top 5 Tags \n============\n", sorted_tags[:5])


# For length/numbers of posts. We do not care about post without tags.
total_posts = post_records.PostTypeId.count()
print("\n\nlength of posts : ", total_posts)
poptag = sorted_tags[5][1]/total_posts

print("\n\nFraction of post having 5th most popular tag:")
print("\n\n Freq. of 5th popular tag DIVIDED by Length of posts")
print("\n %s / %s = %.7f" %(sorted_tags[5][1], total_posts, poptag ))



# How much higher is the average answer's score than the average question's?
# Cleanup the post table for display
post_records.columns = [header_cleaner(col) for col in post_records.keys()]

# Sample display of a single record
post_records[:1]

print("Total of different post types \t where 1: Question \t 2: Answer\n")
print(post_records.PostTypeId.value_counts())


stk_posts = pd.DataFrame(post_records, columns=['PostTypeId', 'Score'])
questions = stk_posts[stk_posts["PostTypeId"]==1]
answers = stk_posts[stk_posts["PostTypeId"]==2]

avg_score_diff = answers.Score.mean() - questions.Score.mean()

print("\n\n Mean Answ Score - Mean Ques. Score : %.7f" %avg_score_diff)


# What is the Pearson's correlation between a user's reputation and total score from posts (for valid users)?

user_records.columns = [header_cleaner(col) for col in user_records.keys()]

user_reput = Series(user_records.Reputation.values, index = user_records.AccountId.values).to_dict()
usr_post_scr = Series(post_records.Score.values, index = post_records.OwnerUserId.values).to_dict()

# Compute total score of valid users. 
valid_users = {key:0 for (key, value) in user_reput.items()}

for (key, value) in usr_post_scr.items():
    if key in valid_users.keys():
        valid_users[key] += value


        
user_reputation = np.array([rep for rep in user_reput.values()])

valid_users_score = np.array([value for value in valid_users.values()])

# # Pearson test using scipy module
prsnr_test = scp.stats.pearsonr(user_reputation, valid_users_score)

# #
print("(p-corr, p-signf) = ", prsnr_test)

print("\nPearson corr test : %.8f" %prsnr_test[0])


# How many more upvotes does the average answer receive than the average question?

vote_records.columns = [header_cleaner(key) for key in vote_records.keys()]

# Extract differnet categories of posts.
vote_categs = pd.DataFrame(vote_records, columns=['PostId', 'VoteTypeId'])
print("Vote types - \t  1: Accepted \t 2: UPvotes \t 3: DOWNvotes \t 4: Offensive \t 5: FAVorites\n")

post_categ = DataFrame(post_records, columns=['Id', 'PostTypeId'])
post_categ.columns = ['PostId', 'PostTypeId']

tot_quetn = len(post_categ[post_categ['PostTypeId'] ==1])
tot_answer = len(post_categ[post_categ['PostTypeId'] ==2])


vote_post_type = pd.merge(post_categ, vote_categs, on='PostId', how='inner', sort=True)

is_upvote = vote_post_type['VoteTypeId'] == 2

upvoted_posts = vote_post_type[is_upvote]

upvoted_ques = len(upvoted_posts[upvoted_posts['PostTypeId']==1])

upvoted_answr = len(upvoted_posts[upvoted_posts['PostTypeId']==2])

ans = upvoted_answr/tot_answer
que = upvoted_ques/tot_quetn

print("Average Upvoted Answer: %.6f " %ans)

print("Avegrage Upvoted Question %.6f " %que)

print("\nDifference: %f" %(ans-que))

## WorkArea

In [83]:
print([header_cleaner(key) for key in post_records.keys()])

['Id', 'PostTypeId', 'AcceptedAnswerId', 'CreationDate', 'Score', 'ViewCount', 'Body', 'OwnerUserId', 'LastActivityDate', 'Title', 'Tags', 'AnswerCount', 'CommentCount', 'FavoriteCount', 'LastEditorUserId', 'LastEditDate', 'CommunityOwnedDate', 'ParentId', 'ClosedDate', 'OwnerDisplayName', 'LastEditorDisplayName']


In [113]:
print([header_cleaner(key) for key in bdges_records.keys()])

['Date', 'Id', 'Name', 'UserId']


In [104]:

[header_cleaner(key) for key in vote_records.keys()]


['BountyAmount', 'CreationDate', 'Id', 'PostId', 'UserId', 'VoteTypeId']

In [106]:

print([header_cleaner(str(key)) for key in comnt_records.keys()])


['CreationDate', 'Id', 'PostId', 'Score', 'Text', 'UserDisplayName', 'UserId']


In [107]:
print([header_cleaner(str(key)) for key in user_records.keys()])

['AboutMe', 'AccountId', 'Age', 'CreationDate', 'DisplayName', 'DownVotes', 'Id', 'LastAccessDate', 'Location', 'ProfileImageUrl', 'Reputation', 'UpVotes', 'Views', 'WebsiteUrl']


In [502]:
print([header_cleaner(str(key)) for key in postlinks_rec.keys()])

['CreationDate', 'Id', 'LinkTypeId', 'PostId', 'RelatedPostId']


In [504]:
post_records

Unnamed: 0,Id,PostTypeId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,...,AnswerCount,CommentCount,FavoriteCount,LastEditorUserId,LastEditDate,CommunityOwnedDate,ParentId,ClosedDate,OwnerDisplayName,LastEditorDisplayName
0,1,1,15,2010-07-19T19:12:12.510,23,1278,<p>How should I elicit prior distributions fro...,8,2010-09-15T21:08:26.077,Eliciting priors from experts,...,5,1,14,,NaT,NaT,,,,
1,2,1,59,2010-07-19T19:12:57.157,22,8198,<p>In many different statistical methods there...,24,2012-11-12T09:21:54.993,What is normality?,...,7,1,8,88,2010-08-07 17:56:44.800,NaT,,,,
2,3,1,5,2010-07-19T19:13:28.577,54,3613,<p>What are some valuable Statistical Analysis...,18,2013-05-27T14:48:36.927,What are some valuable Statistical Analysis op...,...,19,4,36,183,2011-02-12 05:50:03.667,2010-07-19 19:13:28.577,,,,
3,4,1,135,2010-07-19T19:13:31.617,13,5224,<p>I have two groups of data. Each with a dif...,23,2010-09-08T03:00:19.690,Assessing the significance of differences in d...,...,5,2,2,,NaT,NaT,,,,
4,5,2,,2010-07-19T19:14:43.050,81,,"<p>The R-project</p>\n\n<p><a href=""http://www...",23,2010-07-19T19:21:15.063,,...,,3,,23,2010-07-19 19:21:15.063,2010-07-19 19:14:43.050,3,,,
5,6,1,,2010-07-19T19:14:44.080,152,29229,"<p>Last year, I read a blog post from <a href=...",5,2014-05-29T03:54:31.943,The Two Cultures: statistics vs. machine learn...,...,15,5,137,22047,2013-06-07 06:38:10.327,2010-08-09 13:05:50.603,,,,
6,7,1,18,2010-07-19T19:15:59.303,76,5808,<p>I've been working on a new method for analy...,38,2013-12-28T06:53:10.860,Locating freely available data samples,...,24,3,79,253,2013-09-26 21:50:36.963,2010-07-20 20:50:48.483,,,,
7,8,1,,2010-07-19T19:16:21.737,0,288,"<p>Sorry, but the emptyness was a bit overwhel...",37,2010-10-18T07:57:31.170,So how many staticians *does* it take to screw...,...,1,2,,449,2010-10-18 07:57:31.170,NaT,,2010-07-19T20:19:46.577,,
8,9,2,,2010-07-19T19:16:27.553,13,,"<p><a href=""http://incanter.org/"">Incanter</a>...",50,2010-07-19T19:16:27.553,,...,,3,,,NaT,2010-07-19 19:16:27.553,3,,,
9,10,1,1887,2010-07-19T19:17:47.537,23,21925,<p>Many studies in the social sciences use Lik...,24,2012-10-23T17:33:41.907,Under what conditions should Likert scales be ...,...,4,4,12,919,2011-03-30 15:31:46.003,NaT,,,,
