In [1]:
import sys
import os
import random
from operator import add, mul
from pyspark import SparkContext, SparkConf
from pyspark import SparkFiles
from pyspark.sql import SparkSession, SQLContext

In [2]:
# getOrCreate() hands back the already-running session when there is one,
# so re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName("M5-CA2-ApacheLogs-TGA")
    .getOrCreate()
)

### 1. Load file as a text file in spark

In [3]:
# Load access.clean.log into HDFS (one-time setup)
# !hdfs dfs -mkdir use_cases/Logs 
# !hdfs dfs -put access.clean.log  use_cases/Logs

%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"


127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)"

- 127.0.0.1 (%h): This is the IP address of the client (remote host) which made the request to the server. 
- (%l): The "hyphen" in the output indicates that the requested piece of information is not available. In this case, the information that is not available is the RFC 1413 identity of the client determined by identd on the client's machine. 
- frank (%u) - This is the userid of the person requesting the document as determined by HTTP authentication. If the document is not password protected, this entry will be "-" just like the previous one.
- [10/Oct/2000:13:55:36 -0700] (%t): The time that the server finished processing the request. The format is:[day/month/year:hour:minute:second zone]
- day = 2*digit, month = 3*letter, year = 4*digit, hour = 2*digit, minute = 2*digit, second = 2*digit,zone = (`+' | `-') 4*digit

- "GET /apache_pb.gif HTTP/1.0" (\"%r\"): The request line from the client is given in double quotes. The request line contains a great deal of useful information. First, the method used by the client is GET. Second, the client requested the resource /apache_pb.gif, and third, the client used the protocol HTTP/1.0. It is also possible to log one or more parts of the request line independently. 
- 200 (%>s) This is the status code that the server sends back to the client. This information is very valuable, because it reveals whether the request resulted in a successful response (codes beginning in 2), a redirection (codes beginning in 3), an error caused by the client (codes beginning in 4), or an error in the server (codes beginning in 5). The full list of possible status codes can be found in the HTTP specification (RFC2616 section 10).
- 2326 (%b): The last entry indicates the size of the object returned to the client, not including the response headers. If no content was returned to the client, this value will be "-". To log "0" for no content, use %B instead.
- "http://www.example.com/start.html" (\"%{Referer}i\")- The "Referer" (sic) HTTP request header. This gives the site that the client reports having been referred from. (This should be the page that links to or includes /apache_pb.gif).
- "Mozilla/4.08 [en] (Win98; I ;Nav)" (\"%{User-agent}i\")- The User-Agent HTTP request header. This is the identifying information that the client browser reports about itself.

Reference
https://httpd.apache.org/docs/1.3/logs.html

In [4]:
# test code
# import re
# pattern = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] (\S+)\s?(\S+)?\s?(\S+)? (\d{3}|-) (\d+|-)\s?([^"]*)\s?"?([^"]*)?$'
# logline = '109.169.248.247 - - [12/Dec/2015:18:25:11 +0100] POST /administrator/index.php HTTP/1.1 200 4494 http://almhuette-raith.at/administrator/ Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0'
# matchh = re.search(pattern, logline)

# for g in range(1, 12):
#     print(matchh.group(g))

In [5]:
# Path of the cleaned Apache access log.
location = "/user/edureka_672184/use_cases/Logs/access.clean.log"
# Read the file as an RDD of strings, one element per log line
# (this is an RDD, not yet a DataFrame).
raw = spark.sparkContext.textFile(name=location)

In [6]:
import re
from pyspark.sql.types import Row
# Compiled once when the cell runs instead of being rebuilt inside every call.
# Groups: 1 ip, 2 identd, 3 user, 4 "timestamp zone", 5 method, 6 endpoint,
# 7 protocol, 8 status, 9 size, 10 trailing text (referer + user agent).
_APACHE_LOG_PATTERN = re.compile(
    r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] (\S+)\s?(\S+)?\s?(\S+)? (\d{3}|-) (\d+|-)\s?([^"]*)\s?"?([^"]*)?$'
)


def parse_apache_log_line(logline):
    """Parse one line of the cleaned Apache access log into a Row.

    Args:
        logline: a single log line as a string.

    Returns:
        A Row with the fields ipAddress, clientIdentd, userId, dateTime,
        dateTimeZone, method, endpoint, protocol, responseCode, contentSize
        and referer.

        * If the line does not match the expected layout, a fixed placeholder
          Row is returned (ipAddress '0.0.0.0', referer 'Unknown', ...), so
          unparseable lines can be found later by filtering on those values.
        * responseCode is an int, or None when the log holds '-' in the
          status position (the pattern allows it, and int('-') would raise).
        * contentSize is kept as the raw string from the log ('-' included).
        * endpoint and protocol are None when the request part has fewer
          than three tokens.
        * referer is the first token of the trailing text when that text
          starts with "http", otherwise '-'.
    """
    match = _APACHE_LOG_PATTERN.search(logline)
    if match is None:
        # Placeholder for lines that do not look like an access-log entry.
        return Row(
            ipAddress='0.0.0.0',
            clientIdentd='-',
            userId='-',
            dateTime='01/Jan/1970:00:00:00',
            dateTimeZone='+0100',
            method='GET',
            endpoint='Unknown',
            protocol='HTTP/1.1',
            responseCode=200,
            contentSize='1000',
            referer='Unknown'
        )

    # Group 10 holds everything after the size field. Drop surrounding '-'
    # and blanks, then keep only the leading URL (if any) as the referer.
    trailing = match.group(10).strip("- ").strip()
    if trailing.startswith("http"):
        referer_ = trailing.split()[0]
    else:
        referer_ = "-"

    # The pattern guarantees exactly "<timestamp><whitespace><zone>" here.
    date_part, zone_part = match.group(4).split()

    # The status field may be '-'; map it to None instead of crashing the job.
    status = match.group(8)
    if status == '-':
        response_code = None
    else:
        response_code = int(status)

    return Row(
        ipAddress=match.group(1),
        clientIdentd=match.group(2),
        userId=match.group(3),
        dateTime=date_part,
        dateTimeZone=zone_part,
        method=match.group(5),
        endpoint=match.group(6),
        protocol=match.group(7),
        responseCode=response_code,
        contentSize=match.group(9),
        referer=referer_
    )

In [7]:
# Parse every raw line into a Row; the parsed RDD is marked for caching
# before it is converted into a DataFrame.
parsed_rows = raw.map(parse_apache_log_line).cache()
logs_df = parsed_rows.toDF()
logs_df.show()

+------------+-----------+--------------------+------------+--------------------+---------------+------+--------+--------------------+------------+------+
|clientIdentd|contentSize|            dateTime|dateTimeZone|            endpoint|      ipAddress|method|protocol|             referer|responseCode|userId|
+------------+-----------+--------------------+------------+--------------------+---------------+------+--------+--------------------+------------+------+
|           -|       4263|12/Dec/2015:18:25:11|       +0100|     /administrator/|109.169.248.247|   GET|HTTP/1.1|                   -|         200|     -|
|           -|       4494|12/Dec/2015:18:25:11|       +0100|/administrator/in...|109.169.248.247|  POST|HTTP/1.1|http://almhuette-...|         200|     -|
|           -|       4263|12/Dec/2015:18:31:08|       +0100|     /administrator/|    46.72.177.4|   GET|HTTP/1.1|                   -|         200|     -|
|           -|       4494|12/Dec/2015:18:31:08|       +0100|/administr

### 2. Find out how many 404 HTTP codes are in access logs.

In [8]:
# Keep only the requests that were answered with status 404, then count them.
is_not_found = logs_df.responseCode == 404
http404 = logs_df.where(is_not_found)
print(http404.count())

227089


### 3. Find out which URLs are broken.

In [19]:
# show() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line after the table.
# NOTE(review): this lists the distinct *referer* values of the 404 responses;
# the URL that was actually requested is in the "endpoint" column. Confirm
# which of the two is meant by "broken URLs".
http404.select("referer").distinct().show(20, False)

+---------------------------------------------------------------------------------------------------+
|referer                                                                                            |
+---------------------------------------------------------------------------------------------------+
|http://almhuette-raith.at/administrator/components/com_sef/views/logger/config.php                 |
|http://almhuette-raith.at/administrator/components/com_banners/web-infor.php                       |
|http://almhuette-raith.at/components/com_content/views/featured/tmpl/config.php                    |
|http://almhuette-raith.at/layouts/system_info.php.suspected                                        |
|http://almhuette-raith.at/images/legacy.cms.php.suspected                                          |
|http://almhuette-raith.at/_private/logo_img.php.suspected                                          |
|http://almhuette-raith.at/tmp/headers_img.php                                    

### 4. Verify there are no null columns in the original dataset.

I already checked null columns when parsing the apache log file

### 5. Replace null values with constants such as 0

I created a row with these values when a line could not be parsed (i.e. a null would otherwise be produced):

ipAddress = '0.0.0.0',
clientIdentd = '-',
userId = '-',
dateTime = '01/Jan/1970:00:00:00',
dateTimeZone = '+0100',
method  = 'GET',
endpoint = 'Unknown',
protocol = 'HTTP/1.1',
responseCode =  200,
contentSize  =  '1000',
referer = 'Unknown'

In [10]:
# Lines that could not be parsed were replaced by a placeholder row; it is
# recognised here by its ipAddress '0.0.0.0' together with referer 'Unknown'.
nulls = logs_df.filter((logs_df["ipAddress"] == '0.0.0.0') & (logs_df["referer"] == 'Unknown'))
# Build one string with format(): under Python 2 the multi-argument
# print(...) form prints a tuple such as ('There are ', 31, ...).
print("There are {} nulls in the data".format(nulls.count()))

('There are ', 31, ' nulls in the data')


### 6. Parse timestamp to readable date.

In [11]:
# Preview the raw timestamp strings (first 5 rows, values not truncated).
logs_df.select("datetime").show(n=5, truncate=False)

+--------------------+
|datetime            |
+--------------------+
|12/Dec/2015:18:25:11|
|12/Dec/2015:18:25:11|
|12/Dec/2015:18:31:08|
|12/Dec/2015:18:31:08|
|12/Dec/2015:18:31:25|
+--------------------+
only showing top 5 rows



Reference https://stackoverflow.com/questions/39088473/pyspark-dataframe-convert-unusual-string-format-to-timestamp

In [13]:
from pyspark.sql.functions import col
from pyspark.sql import Row
from pyspark.sql.functions import unix_timestamp
# "HH" is the 24-hour field (0-23), which is what the log uses (e.g. 18:25:11).
# The previous "hh" is the 12-hour clock field (1-12): it only handled these
# values through lenient parsing and reads an hour of 12 as 00.
# NOTE(review): only the "datetime" column is parsed; the offset stored in
# dateTimeZone is not applied here - confirm that is intended.
logs_df2 = logs_df.withColumn(
    "dateTimeParsed",
    unix_timestamp("datetime", "dd/MMM/yyyy:HH:mm:ss").cast("double").cast("timestamp")
)
logs_df2.select("dateTimeParsed").show(5, False)

+---------------------+
|dateTimeParsed       |
+---------------------+
|2015-12-12 18:25:11.0|
|2015-12-12 18:25:11.0|
|2015-12-12 18:31:08.0|
|2015-12-12 18:31:08.0|
|2015-12-12 18:31:25.0|
+---------------------+
only showing top 5 rows



### 7. Describe which HTTP status values appear in data and how many.

In [16]:
# Number of requests per HTTP status code, ordered by that number.
status_counts = logs_df2.groupBy("responseCode").count()
status_counts.orderBy('count').show()

+------------+-------+
|responseCode|  count|
+------------+-------+
|         412|     19|
|         400|     23|
|         406|     53|
|         405|     83|
|         401|    135|
|         501|    143|
|         303|    247|
|         301|    619|
|         403|   2222|
|         500|   3252|
|         304|   6330|
|         404| 227089|
|         206| 939929|
|         200|1157862|
+------------+-------+



### 8. Display as chart the above stat in chart in Zeppelin notebook

In [15]:
# We didn't learn Zeppelin and it is not installed in our environment

### 9. How many unique hosts are there in the entire log and their average request

In [21]:
# Requests per client address; every distinct ipAddress is treated as one host.
host_counts = logs_df2.groupBy("ipAddress").count()
host_counts.orderBy('count').show(5)

# The table above alone does not answer the question, so also report the
# number of unique hosts and the average number of requests per host.
# NOTE(review): the placeholder address '0.0.0.0' used for unparseable lines
# is counted as a host here - filter it out first if that is not wanted.
unique_hosts = host_counts.count()
print("Unique hosts: {}".format(unique_hosts))
host_counts.agg({"count": "avg"}).show()

+---------------+-----+
|      ipAddress|count|
+---------------+-----+
|     45.61.46.3|    1|
|139.162.150.131|    1|
|     5.83.104.2|    1|
|  192.3.195.106|    1|
| 179.215.122.32|    1|
+---------------+-----+
only showing top 5 rows



### 10.Create a spark-submit application for the same and print the findings in the log

In [None]:
# Shell escape: run the standalone script m5ca2q10.py through spark2-submit.
# The log lines captured below are the output of this command.
!spark2-submit /mnt/home/edureka_672184/m5ca2q10.py

19/07/23 07:19:38 INFO spark.SparkContext: Running Spark version 2.1.0.cloudera2
19/07/23 07:19:39 INFO spark.SecurityManager: Changing view acls to: edureka_672184
19/07/23 07:19:39 INFO spark.SecurityManager: Changing modify acls to: edureka_672184
19/07/23 07:19:39 INFO spark.SecurityManager: Changing view acls groups to: 
19/07/23 07:19:39 INFO spark.SecurityManager: Changing modify acls groups to: 
19/07/23 07:19:39 INFO spark.SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users  with view permissions: Set(edureka_672184); groups with view permissions: Set(); users  with modify permissions: Set(edureka_672184); groups with modify permissions: Set()
19/07/23 07:19:40 INFO util.Utils: Successfully started service 'sparkDriver' on port 46740.
19/07/23 07:19:40 INFO spark.SparkEnv: Registering MapOutputTracker
19/07/23 07:19:40 INFO spark.SparkEnv: Registering BlockManagerMaster
19/07/23 07:19:40 INFO storage.BlockManagerMasterEndpoint: Using org.apache.s

19/07/23 07:19:53 INFO hive.HiveUtils: Initializing HiveMetastoreConnection version 1.1.0 using file:/opt/cloudera/parcels/CDH-5.11.1-1.cdh5.11.1.p0.4/lib/hadoop/../hive/lib/commons-logging-1.1.3.jar:file:/opt/cloudera/parcels/CDH-5.11.1-1.cdh5.11.1.p0.4/lib/hadoop/../hive/lib/hive-exec-1.1.0-cdh5.11.1.jar:file:/opt/cloudera/parcels/CDH-5.11.1-1.cdh5.11.1.p0.4/lib/hadoop/../hive/lib/hive-exec.jar:file:/opt/cloudera/parcels/CDH-5.11.1-1.cdh5.11.1.p0.4/lib/hadoop/../hive/lib/hive-jdbc-1.1.0-cdh5.11.1-standalone.jar:file:/opt/cloudera/parcels/CDH-5.11.1-1.cdh5.11.1.p0.4/lib/hadoop/../hive/lib/hive-jdbc-1.1.0-cdh5.11.1.jar:file:/opt/cloudera/parcels/CDH-5.11.1-1.cdh5.11.1.p0.4/lib/hadoop/../hive/lib/hive-jdbc-standalone.jar:file:/opt/cloudera/parcels/CDH-5.11.1-1.cdh5.11.1.p0.4/lib/hadoop/../hive/lib/hive-jdbc.jar:file:/opt/cloudera/parcels/CDH-5.11.1-1.cdh5.11.1.p0.4/lib/hadoop/../hive/lib/hive-metastore-1.1.0-cdh5.11.1.jar:file:/opt/cloudera/parcels/CDH-5.11.1-1.cdh5.11.1.p0.4/lib/hadoop

19/07/23 07:19:55 INFO session.SessionState: Created local directory: /tmp/881edcc5-7354-4eae-b129-7459f1616338_resources
19/07/23 07:19:55 INFO session.SessionState: Created HDFS directory: /tmp/hive/edureka_672184/881edcc5-7354-4eae-b129-7459f1616338
19/07/23 07:19:55 INFO session.SessionState: Created local directory: /tmp/edureka_672184/881edcc5-7354-4eae-b129-7459f1616338
19/07/23 07:19:55 INFO session.SessionState: Created HDFS directory: /tmp/hive/edureka_672184/881edcc5-7354-4eae-b129-7459f1616338/_tmp_space.db
19/07/23 07:19:55 INFO session.SessionState: No Tez session required at this point. hive.execution.engine=mr.
19/07/23 07:19:55 INFO client.HiveClientImpl: Warehouse location for Hive client (version 1.1.0) is /user/hive/warehouse
19/07/23 07:19:55 INFO hive.metastore: Trying to connect to metastore with URI thrift://ip-20-0-21-161.ec2.internal:9083
19/07/23 07:19:55 INFO hive.metastore: Opened a connection to metastore, current connections: 1
19/07/23 07:19:55 INFO hive

19/07/23 07:20:37 INFO scheduler.TaskSetManager: Finished task 0.0 in stage 1.0 (TID 1) in 7639 ms on ip-20-0-31-210.ec2.internal (executor 1) (1/4)
19/07/23 07:20:48 INFO storage.BlockManagerInfo: Added rdd_2_1 in memory on ip-20-0-31-210.ec2.internal:33647 (size: 46.4 MB, free: 319.8 MB)
19/07/23 07:20:50 INFO storage.BlockManagerInfo: Added rdd_2_2 in memory on ip-20-0-31-210.ec2.internal:44741 (size: 36.5 MB, free: 282.1 MB)
19/07/23 07:20:55 INFO scheduler.TaskSetManager: Starting task 3.0 in stage 1.0 (TID 4, ip-20-0-31-210.ec2.internal, executor 2, partition 3, NODE_LOCAL, 6051 bytes)
19/07/23 07:20:55 INFO scheduler.TaskSetManager: Finished task 1.0 in stage 1.0 (TID 2) in 26229 ms on ip-20-0-31-210.ec2.internal (executor 2) (2/4)
19/07/23 07:20:56 INFO scheduler.TaskSetManager: Finished task 2.0 in stage 1.0 (TID 3) in 19044 ms on ip-20-0-31-210.ec2.internal (executor 1) (3/4)
19/07/23 07:21:05 INFO storage.BlockManagerInfo: Added rdd_2_3 in memory on ip-20-0-31-210.ec2.intern