# Introduction

This project will analyse a data set of Hacker News comments to find out: 
1. Do Ask HN or Show HN receive more comments on average?
2. Do posts created at a certain time receive more comments on average?

In [3]:
hn_open_file = open('HN_posts_year_to_Sep_26_2016.csv')
from csv import reader
hn_read_file = reader(hn_open_file)
hn = list(hn_read_file)

print(hn[:5])

[['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'], ['12579008', 'You have two days to comment if you want stem cells to be classified as your own', 'http://www.regulations.gov/document?D=FDA-2015-D-3719-0018', '1', '0', 'altstar', '9/26/2016 3:26'], ['12579005', 'SQLAR  the SQLite Archiver', 'https://www.sqlite.org/sqlar/doc/trunk/README.md', '1', '0', 'blacksqr', '9/26/2016 3:24'], ['12578997', 'What if we just printed a flatscreen television on the side of our boxes?', 'https://medium.com/vanmoof/our-secrets-out-f21c1f03fdc8#.ietxmez43', '1', '0', 'pavel_lishin', '9/26/2016 3:19'], ['12578989', 'algorithmic music', 'http://cacm.acm.org/magazines/2011/7/109891-algorithmic-composition/fulltext', '1', '0', 'poindontcare', '9/26/2016 3:16']]


In [26]:
headers = hn[0]
hn = hn[1:]
print(len(hn))

293113


In [14]:
ask_posts = []
show_posts = []
other_posts = []

for row in hn:
    title = row[1]
    if title.lower().startswith('ask hn'):
        ask_posts.append(row)
    elif title.lower().startswith('show hn'):
        show_posts.append(row)
    else:
        other_posts.append(row)
        
print('Ask Posts:', len(ask_posts), "   Percentage:", (len(ask_posts)/len(hn)))
print('Show Posts:', len(show_posts), "   Percentage:", (len(show_posts)/len(hn)))
print('Any Other Posts:', len(other_posts), "   Percentage:", (len(other_posts)/len(hn)))

Ask Posts: 9139    Percentage: 0.031178569722773764
Show Posts: 10158    Percentage: 0.034654985364256034
Any Other Posts: 273821    Percentage: 0.9341664449129702


From our data set, ~3.1% of the posts are 'Ask Hacker News' and ~3.5% of the posts are 'Show Hacker News'. The remaining ~93.4% are other posts. 

# Calculating Average Number of Comments

In [27]:
#Ask Posts
total_ask_comments = 0
for posts in ask_posts:
    n_comments = int(posts[4])
    total_ask_comments += n_comments

avg_ask_comments = total_ask_comments / len(ask_posts)
print('Average Ask HN comments:', avg_ask_comments)

#Show Posts
total_show_comments = 0
for posts in show_posts:
    n_comments = int(posts[4])
    total_show_comments += n_comments
    
avg_show_comments = total_show_comments / len(show_posts)
print('Average Show HN comments:', avg_show_comments)

Average Ask HN comments: 10.393478498741656
Average Show HN comments: 4.886099625910612


This code runs through the 'Ask HN' and 'Show HN' posts and generates the average number of comments per post. It shows that 'Ask HN' posts recieve ~5.5 more comments. 


# Analysing by Time Created

In [35]:
import datetime as dt

result_list = []
for posts in ask_posts:
    created = posts[6]
    n_comments = int(posts[4])
    result_list.append([created, n_comments])
    
print('Example data:')
for var in result_list[:3]:
      print(var, '\n')

Example data:
['9/26/2016 2:53', 7] 

['9/26/2016 1:17', 3] 

['9/25/2016 22:57', 0] 



In [48]:
counts_by_hour = {}
comments_by_hour = {}
date_format = "%m/%d/%Y %H:%M"

for row in result_list:
    time = row[0]
    comments = row[1]
    date_1 = dt.datetime.strptime(time, date_format)
    hour = dt.datetime.strftime(date_1, '%H')
    if hour not in counts_by_hour:
        counts_by_hour[hour] = 1
        comments_by_hour[hour] = comments
    else:
        counts_by_hour[hour] += 1
        comments_by_hour[hour] += comments
    
print(comments_by_hour)

{'02': 2996, '01': 2089, '22': 3372, '21': 4500, '19': 3954, '17': 5547, '15': 18525, '14': 4972, '13': 7245, '11': 2797, '10': 3013, '09': 1477, '07': 1585, '03': 2154, '23': 2297, '20': 4462, '16': 4466, '08': 2362, '00': 2277, '18': 4877, '12': 4234, '04': 2360, '06': 1587, '05': 1838}


In [52]:
avg_comments_by_hour = []
for hour in comments_by_hour:
    avg_comments_by_hour.append([hour, comments_by_hour[hour] / counts_by_hour[hour]])

avg_comments_by_hour

[['02', 11.137546468401487],
 ['01', 7.407801418439717],
 ['22', 8.804177545691905],
 ['21', 8.687258687258687],
 ['19', 7.163043478260869],
 ['17', 9.449744463373083],
 ['15', 28.676470588235293],
 ['14', 9.692007797270955],
 ['13', 16.31756756756757],
 ['11', 8.96474358974359],
 ['10', 10.684397163120567],
 ['09', 6.653153153153153],
 ['07', 7.013274336283186],
 ['03', 7.948339483394834],
 ['23', 6.696793002915452],
 ['20', 8.749019607843136],
 ['16', 7.713298791018998],
 ['08', 9.190661478599221],
 ['00', 7.5647840531561465],
 ['18', 7.94299674267101],
 ['12', 12.380116959064328],
 ['04', 9.7119341563786],
 ['06', 6.782051282051282],
 ['05', 8.794258373205741]]

In [61]:
swap_avg_by_hour = []
for row in avg_comments_by_hour:
    swap_avg_by_hour.append([row[1], row[0]])
    
sorted_swap = sorted(swap_avg_by_hour, reverse=True)

print('Top 5 Hours for Ask Posts Comments')

for avg, time in sorted_swap[0:4]:
    print('{}: {:.2f} average comments per post'.format(dt.datetime.strptime(time, '%H').strftime('%H:%M'), avg))

Top 5 Hours for Ask Posts Comments
15:00: 28.68 average comments per post
13:00: 16.32 average comments per post
12:00: 12.38 average comments per post
02:00: 11.14 average comments per post
