In [142]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark import SparkContext

conf = SparkConf()\
    .setMaster("local[*]")\
    .setAppName("GHTorrent Analytics")\
    .setExecutorEnv("spark.driver.memory","2g")\
    .setExecutorEnv("spark.executor.memory","4g")

spark = SparkSession\
    .builder\
    .config(conf=conf)\
    .getOrCreate()

sc = spark.sparkContext
sc

In [143]:
print('Default parallelism: {sc.defaultParallelism}')

Default parallelism: {sc.defaultParallelism}


In [1]:
#### data source https://ghtorrent.org/downloads.html

In [144]:
rdd = sc.textFile('ghtorrent-logs.txt.gz')

In [12]:
print(rdd.getNumPartitions())

1


In [145]:
# we need to increase number of partitions

rdd = rdd.repartition(8)

In [146]:
print(rdd.getNumPartitions())

8


### 1. Count the number of records and get 20 records randomly.

In [82]:
print(rdd.count())

9669788


In [18]:
rdd.takeSample(withReplacement=False, num=20, seed=42)

['DEBUG, 2017-03-23T09:20:00+00:00, ghtorrent-22 -- ghtorrent.rb: User astor76 exists',
 'DEBUG, 2017-03-24T12:52:34+00:00, ghtorrent-48 -- retriever.rb: Commit TelematicaTesteo6JK/ejercicio-1-cnunez9 -> 5b6b4fa034b0741a34774a4e55582bcb3529644d exists',
 'INFO, 2017-03-23T10:32:53+00:00, ghtorrent-9 -- ghtorrent.rb: Added issue_event HTBox/allReady -> 1929/HTBox/allReady -> 1929/1008256243',
 'DEBUG, 2017-03-23T11:24:39+00:00, ghtorrent-29 -- retriever.rb: Commit metakgp/mfqp -> 33b920c888347fb96ac144cf825620cd253e64e2 exists',
 'DEBUG, 2017-03-23T11:17:59+00:00, ghtorrent-6 -- ghtorrent.rb: Transaction committed (12 ms)',
 'INFO, 2017-03-23T13:16:00+00:00, ghtorrent-2 -- ghtorrent.rb: Added commit_assoc of 28ffe198bfab3761f97ce2f74ef479bafc5bbd99 with Nitrogen-CAF/android_frameworks_native',
 'DEBUG, 2017-03-23T09:23:47+00:00, ghtorrent-39 -- retriever.rb: Commit nvanbenschoten/cockroach -> eacc2ea06107956ab358ce5ebbb6275d50c7b862 exists',
 'INFO, 2017-03-23T10:29:16+00:00, ghtorrent-

## **GHTorrent data format**
Every line of this log file includes:
1.   Logging level, one of `DEBUG`, `INFO`, `WARN`, `ERROR`
2.   A timestamp
3.   The downloader id
4.   The logging stage including at least one of the following names:
       - `event_processing`
       - `ght_data_retrieval`
       - `api_client`
       - `retriever`
       - `ghtorrent`

### 2. Get the number of lines with both `Transaction` or `Repo` information.

In [148]:
import re

# \w returns a match where the string contains any word characters 
# (characters from a to Z, digits from 0-9, and the underscore _ character)
# + is one or more occurrences


def collect_words(line):
    return re.compile('\w+').findall(line.lower())

print(collect_words('Hello.worlD!, '))

['hello', 'world']


In [41]:
rdd_transaction = \
    rdd\
        .filter(lambda line: 'transaction' in collect_words(line))

rdd_repo= \
    rdd\
        .filter(lambda line: 'repo' in collect_words(line))

In [42]:
rdd_intersect = rdd_transaction.intersection(rdd_repo)

In [51]:
# rdd_intersect.take(4)

In [45]:
rdd_transaction_repo = \
    rdd\
        .filter(lambda line: 'transaction' in collect_words(line))\
        .filter(lambda line: 'repo' in collect_words(line))

rdd_transaction_repo.count()

43

In [46]:
rdd_transaction_repo.collect()

['DEBUG, 2017-03-23T09:13:17+00:00, ghtorrent-9 -- ghtorrent.rb: Repo xuminwlt/tcc-transaction exists',
 'DEBUG, 2017-03-23T09:26:01+00:00, ghtorrent-11 -- ghtorrent.rb: Repo jwpttcg66/redis-game-transaction exists',
 'DEBUG, 2017-03-23T09:26:01+00:00, ghtorrent-11 -- ghtorrent.rb: Repo jwpttcg66/redis-game-transaction exists',
 'DEBUG, 2017-03-23T09:13:16+00:00, ghtorrent-9 -- ghtorrent.rb: Repo xuminwlt/tcc-transaction exists',
 'DEBUG, 2017-03-23T09:26:01+00:00, ghtorrent-11 -- ghtorrent.rb: Repo jwpttcg66/redis-game-transaction exists',
 'DEBUG, 2017-03-23T09:13:28+00:00, ghtorrent-9 -- ghtorrent.rb: Repo xuminwlt/tcc-transaction exists',
 'DEBUG, 2017-03-23T09:13:27+00:00, ghtorrent-9 -- ghtorrent.rb: Repo xuminwlt/tcc-transaction exists',
 'DEBUG, 2017-03-23T09:26:01+00:00, ghtorrent-11 -- ghtorrent.rb: Repo jwpttcg66/redis-game-transaction exists',
 'DEBUG, 2017-03-23T10:28:03+00:00, ghtorrent-33 -- ghtorrent.rb: Repo jwpttcg66/redis-game-transaction exists',
 'DEBUG, 2017-03-23

### 3. Get the number of lines including `web link` for `WARN` logging levels.

In [147]:
def get_url(line):
    return re.findall(r'http[s]?://(?:[-\w.]|(?:%[\da-zA-z]{2,}))+', line)

o='jnljkj lklk lkm --- https://example.com.az.ho///bkjbk/lknlk7789'
get_url(o)

['https://example.com.az.ho']

In [114]:
warn_url_rdd = \
rdd\
    .filter(lambda line: line.split(',')[0] == 'WARN')\
    .filter(lambda line: len(get_url(line)) > 1)

print(warn_url_rdd.count())

57


In [115]:
warn_url_rdd.take(10)

['WARN, 2017-03-23T12:07:29+00:00, ghtorrent-18 -- api_client.rb: Failed request. URL: https://api.github.com/repos/wordpress-mobile/WordPress-iOS/labels/[Pri]%20Blocking, Exception: bad URI(is not URI?): https://api.github.com/repos/wordpress-mobile/WordPress-iOS/labels/[Pri]%20Blocking, Access: 2776f3ba0a5, IP: 0.0.0.0, Remaining: 1970',
 'WARN, 2017-03-22T20:14:51+00:00, ghtorrent-11 -- ghtorrent.rb: Extracted email(https://darmagedon@github.com) not valid for user darmagedon<https://darmagedon@github.com>',
 'WARN, 2017-03-23T13:00:25+00:00, ghtorrent-35 -- api_client.rb: Failed request. URL: https://api.github.com/repos/wordpress-mobile/WordPress-Android/labels/[Pri]%20Blocking, Exception: bad URI(is not URI?): https://api.github.com/repos/wordpress-mobile/WordPress-Android/labels/[Pri]%20Blocking, Access: 6d5ed163707, IP: 0.0.0.0, Remaining: 4997',
 'WARN, 2017-03-23T09:25:48+00:00, ghtorrent-6 -- api_client.rb: Failed request. URL: https://api.github.com/repos/wordpress-mobile/W

### 4. What is the most active `downloader id` for `Failed` connections?

In [129]:
rdd_failed = \
    rdd.filter(lambda line: 'failed' in collect_words(line))

rdd_active_downloader_rdd = \
    rdd_failed.map(lambda line: (line.replace('--',',')
                                    .split(',')[2]
                                    .split('-')[1]
                                    , 1))

rdd_active_downloader_rdd.take(5)

rdd_active_downloader_rdd \
    .reduceByKey(lambda x, y: x+y) \
    .sortBy(lambda x: x[1], ascending=False)\
    .take(10)

[('13 ', 79654),
 ('21 ', 2710),
 ('40 ', 1231),
 ('25 ', 505),
 ('9 ', 501),
 ('18 ', 495),
 ('11 ', 471),
 ('42 ', 465),
 ('6 ', 453),
 ('4 ', 452)]

### 5. What is the most active `repository`?

In [133]:
def collect_words2(line):
    return re.compile(' \w+ ').findall(line.lower())

rdd.filter(lambda line: " repo " in collect_words2(line)).take(5)

['DEBUG, 2017-03-23T11:15:38+00:00, ghtorrent-17 -- ghtorrent.rb: Repo roncodes/ember-intl-tel-input exists',
 'DEBUG, 2017-03-23T09:23:23+00:00, ghtorrent-39 -- ghtorrent.rb: Repo inabil/unix exists',
 'DEBUG, 2017-03-23T10:09:12+00:00, ghtorrent-26 -- ghtorrent.rb: Repo WangWei90/dolfin exists',
 'DEBUG, 2017-03-23T09:08:20+00:00, ghtorrent-17 -- ghtorrent.rb: Repo kidjp85/graphql-apollo-blog exists',
 'DEBUG, 2017-03-24T12:54:52+00:00, ghtorrent-46 -- ghtorrent.rb: Repo mgmagueye/magueyendiaye exists']

In [137]:
rdd.filter(lambda line: " repo " in collect_words(line))\
    .map(lambda line: line.lower().split('repo')[1].split(' ')[1])\
    .map(lambda x: (x, 1))\
    .reduceByKey(lambda x, y: x + y)\
    .sortBy(lambda x: x[1], ascending=False)\
    .take(10)

[('ovyx/hammerheadn', 22447),
 ('mithro/chromium-infra', 17033),
 ('lieblb/ilias', 14216),
 ('nitrogen-caf/android_system_core', 10186),
 ('winxdroid/android_build', 9535),
 ('gpkvt/ilias', 9408),
 ('nitrogen-caf/android_packages_apps_dialer', 8022),
 ('nitrogen-caf/android_system_extras', 7414),
 ('jorgeomarmh/edx', 6286),
 ('nitrogen-caf/android_frameworks_opt_telephony', 5873)]

### 6. Get the number of `Failed HTTP`requests per `hour`.

In [152]:
import re

# \w returns a match where the string contains any word characters 
# (characters from a to Z, digits from 0-9, and the underscore _ character)
# + is one or more occurrences


def collect_words(line):
    return re.compile('\w+').findall(line.lower())

rdd_failed = \
    rdd.filter(lambda line: 'failed' in collect_words(line))

rdd_failed\
    .map(lambda line: line.split(',')[1].split('T')[1].split('+')[0].split(':')[0])\
    .map(lambda x: (x, 1))\
    .reduceByKey(lambda x, y: x + y)\
    .sortBy(lambda x: x[1], ascending=False)\
    .collect()

[('11', 10567),
 ('10', 9991),
 ('12', 7500),
 ('20', 5772),
 ('13', 5712),
 ('09', 5325),
 ('14', 5144),
 ('22', 5092),
 ('19', 5092),
 ('21', 5091),
 ('16', 5090),
 ('18', 5090),
 ('17', 5090),
 ('15', 5090),
 ('00', 5088),
 ('23', 5088),
 ('01', 5088),
 ('08', 100),
 ('04', 100),
 ('05', 100),
 ('07', 100),
 ('02', 98),
 ('06', 98),
 ('03', 98)]