# Práctica NASA

In [1]:
from pyspark.sql import SparkSession

spark = (SparkSession
        .builder
        .appName("Nasa")
        .getOrCreate())

In [2]:
# nasa_file = "C:/Users/xabier.jimenez/Downloads/NASA logs project/NASA_access_log_Jul95"
nasa_file = "C:/Users/xabier.jimenez/Downloads/NASA logs project/NASA_access_log_*"
df = (spark.read.format("text")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(nasa_file))
df.show()

+--------------------+
|               value|
+--------------------+
|in24.inetnebr.com...|
|uplherc.upl.com -...|
|uplherc.upl.com -...|
|uplherc.upl.com -...|
|uplherc.upl.com -...|
|ix-esc-ca2-07.ix....|
|uplherc.upl.com -...|
|slppp6.intermind....|
|piweba4y.prodigy....|
|slppp6.intermind....|
|slppp6.intermind....|
|ix-esc-ca2-07.ix....|
|slppp6.intermind....|
|uplherc.upl.com -...|
|133.43.96.45 - - ...|
|kgtyk4.kj.yamagat...|
|kgtyk4.kj.yamagat...|
|d0ucr6.fnal.gov -...|
|ix-esc-ca2-07.ix....|
|d0ucr6.fnal.gov -...|
+--------------------+
only showing top 20 rows



In [3]:
# nasa_file_2 = "C:/Users/xabier.jimenez/Downloads/NASA logs project/NASA_access_log_Aug95 (1)"

# df2 = (spark.read.format("text")
#         .option("header", "true")
#         .option("inferSchema", "true")
#         .load(nasa_file_2))

In [4]:
# df3=df.union(df2)

In [5]:
sample_logs = [item['value'] for item in df.collect()]
sample_logs

['in24.inetnebr.com - - [01/Aug/1995:00:00:01 -0400] "GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt HTTP/1.0" 200 1839',
 'uplherc.upl.com - - [01/Aug/1995:00:00:07 -0400] "GET / HTTP/1.0" 304 0',
 'uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] "GET /images/ksclogo-medium.gif HTTP/1.0" 304 0',
 'uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] "GET /images/MOSAIC-logosmall.gif HTTP/1.0" 304 0',
 'uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] "GET /images/USA-logosmall.gif HTTP/1.0" 304 0',
 'ix-esc-ca2-07.ix.netcom.com - - [01/Aug/1995:00:00:09 -0400] "GET /images/launch-logo.gif HTTP/1.0" 200 1713',
 'uplherc.upl.com - - [01/Aug/1995:00:00:10 -0400] "GET /images/WORLD-logosmall.gif HTTP/1.0" 304 0',
 'slppp6.intermind.net - - [01/Aug/1995:00:00:10 -0400] "GET /history/skylab/skylab.html HTTP/1.0" 200 1687',
 'piweba4y.prodigy.com - - [01/Aug/1995:00:00:10 -0400] "GET /images/launchmedium.gif HTTP/1.0" 200 11853',
 'slppp6.intermind.net - - [01/Aug/1995:00:00:11 -0400] "GET

In [6]:
import re
host_pattern = r'(^\S+\.[\S+\.]+\S+)\s'
hosts = [re.search(host_pattern, item).group(1)
           if re.search(host_pattern, item)
           else 'no match'
           for item in sample_logs]

In [7]:
ts_pattern = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]'
timestamps = [re.search(ts_pattern, item).group(1)
                if re.search(ts_pattern, item)
                else 'no match'
                for item in sample_logs]

In [8]:
method_uri_protocol_pattern = r'\"(\S+)\s(\S+)\s*(\S*)\"'
method_uri_protocol = [re.search(method_uri_protocol_pattern, item).groups()
               if re.search(method_uri_protocol_pattern, item)
               else 'no match'
              for item in sample_logs]

In [9]:
status_pattern = r'\s(\d{3})\s'
status = [re.search(status_pattern, item).group(1) 
            if re.search(status_pattern, item)
            else 'no match'
            for item in sample_logs]

In [10]:
content_size_pattern = r'\s(\d+|-)$'
content_size = [re.search(content_size_pattern, item).group(1)
            if re.search(content_size_pattern, item)
            else 'no match'
            for item in sample_logs]

In [11]:
from pyspark.sql.functions import regexp_extract

logs_df = df.select(regexp_extract('value', host_pattern, 1).alias('host'),
                         regexp_extract('value', ts_pattern, 1).alias('timestamp'),
                         regexp_extract('value', method_uri_protocol_pattern, 1).alias('method'),
                         regexp_extract('value', method_uri_protocol_pattern, 2).alias('endpoint'),
                         regexp_extract('value', method_uri_protocol_pattern, 3).alias('protocol'),
                         regexp_extract('value', status_pattern, 1).cast('integer').alias('status'),
                         regexp_extract('value', content_size_pattern, 1).alias('content_size'))
logs_df.show(20, truncate=True)
print((logs_df.count(), len(logs_df.columns)))

+--------------------+--------------------+------+--------------------+--------+------+------------+
|                host|           timestamp|method|            endpoint|protocol|status|content_size|
+--------------------+--------------------+------+--------------------+--------+------+------------+
|   in24.inetnebr.com|01/Aug/1995:00:00...|   GET|/shuttle/missions...|HTTP/1.0|   200|        1839|
|     uplherc.upl.com|01/Aug/1995:00:00...|   GET|                   /|HTTP/1.0|   304|           0|
|     uplherc.upl.com|01/Aug/1995:00:00...|   GET|/images/ksclogo-m...|HTTP/1.0|   304|           0|
|     uplherc.upl.com|01/Aug/1995:00:00...|   GET|/images/MOSAIC-lo...|HTTP/1.0|   304|           0|
|     uplherc.upl.com|01/Aug/1995:00:00...|   GET|/images/USA-logos...|HTTP/1.0|   304|           0|
|ix-esc-ca2-07.ix....|01/Aug/1995:00:00...|   GET|/images/launch-lo...|HTTP/1.0|   200|        1713|
|     uplherc.upl.com|01/Aug/1995:00:00...|   GET|/images/WORLD-log...|HTTP/1.0|   304|    

In [13]:
from pyspark.sql import functions as F
logs_df_completo = df.select(regexp_extract('value', host_pattern, 1).alias('host'),
                         regexp_extract('value', ts_pattern, 1).alias('timestamp'),
                         regexp_extract('value', method_uri_protocol_pattern, 1).alias('method'),
                         regexp_extract('value', method_uri_protocol_pattern, 2).alias('endpoint'),
                         regexp_extract('value', method_uri_protocol_pattern, 3).alias('protocol'),
                         regexp_extract('value', status_pattern, 1).cast('integer').alias('status'),
                         regexp_extract('value', content_size_pattern, 1).alias('content_size'),
                         F.col("value"))

In [14]:
(logs_df.write.format("parquet")
 .mode("overwrite")
 .option("compression", "snappy")
 .save("/tmp/data/parquet/df_parquet_nasa"))

In [15]:
file = """/tmp/data/parquet/df_parquet_nasa"""
logs_df_parquet = spark.read.format("parquet").load(file)

¿Cuáles son los distintos protocolos web utilizados? Agrúpalos.

In [16]:

preg1 = logs_df_parquet.groupBy(F.col('Protocol')).count().show()

+-------------+-------+
|     Protocol|  count|
+-------------+-------+
|       HTTP/*|     13|
|            a|      1|
|             |   6600|
|    HTTP/V1.0|    279|
|     HTTP/1.0|3454716|
|STS-69</a><p>|      4|
+-------------+-------+



In [17]:
# logs_df_parquet_id = logs_df_parquet.withColumn("id", F.monotonically_increasing_id())
# df_id = df.withColumn("id", F.monotonically_increasing_id())

# dfjoin=logs_df_parquet_id.join(df_id, df_id.id== logs_df_parquet_id.id)

# dfjoin.filter(~F.col('Protocol').isin('HTTP/*','HTTP/V1.0','HTTP/1.0','STS-69</a><p>'))\
#                         .select("Protocol","value").show(7000,truncate=False)

In [18]:
logs_df_completo.filter(~F.col('Protocol').isin('HTTP/*','HTTP/V1.0','HTTP/1.0','STS-69</a><p>'))\
                         .select("Protocol","value").show(7000,truncate=False)

+--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Protocol|value                                                                                                                                                                                                                                                                                                                                                                                                |
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

¿Cuáles son los códigos de estado más comunes en la web? Agrúpalos y ordénalos para ver cuál es el más común.

In [19]:
preg2 = logs_df_parquet.groupBy(F.col('status')).count().orderBy(F.col('count').desc()).show()

+------+-------+
|status|  count|
+------+-------+
|   200|3100524|
|   304| 266773|
|   302|  73070|
|   404|  20899|
|   403|    225|
|   500|     65|
|   501|     41|
|   400|     15|
|  null|      1|
+------+-------+



In [20]:
logs_df_completo.filter(F.col('status').isNull())\
                        .select("status","value").show(truncate=False)

+------+--------+
|status|value   |
+------+--------+
|null  |alyssa.p|
+------+--------+



¿Y los métodos de petición (verbos) más utilizados?

In [21]:
preg3 = logs_df_parquet.groupBy(F.col('method')).count().orderBy(F.col('count').desc()).show()

+---------------+-------+
|         method|  count|
+---------------+-------+
|            GET|3451720|
|           HEAD|   7915|
|               |   1754|
|           POST|    222|
|�|t�9ð'À|u|      2|
+---------------+-------+



In [22]:
logs_df_completo.filter(~F.col('method').isin('GET','HEAD','POST',''))\
                         .select("method","value").show(2000,truncate=False)

+---------------+----------------------------------------------------------------------------+
|method         |value                                                                       |
+---------------+----------------------------------------------------------------------------+
|�|t�9ð'À|u|163.206.42.13 - - [31/Aug/1995:11:04:42 -0400] "�|t�9ð'À|u&�G" 400 -|
|�|t�9ð'À|u|163.206.42.13 - - [31/Aug/1995:11:04:49 -0400] "�|t�9ð'À|u&�G" 400 -|
+---------------+----------------------------------------------------------------------------+



¿Qué recurso tuvo la mayor transferencia de bytes de la página web?

In [23]:
preg4 = logs_df_parquet.groupBy(F.col('endpoint')).agg(F.sum(F.col('content_size')).alias('content_size'))\
                                        .orderBy(F.col('content_size').desc()).show(truncate=False)

+------------------------------------------------------------+-------------+
|endpoint                                                    |content_size |
+------------------------------------------------------------+-------------+
|/shuttle/missions/sts-71/movies/sts-71-launch.mpg           |3.195286412E9|
|/shuttle/missions/sts-71/movies/sts-71-mir-dock.mpg         |1.409035595E9|
|/shuttle/missions/sts-71/movies/sts-71-tcdt-crew-walkout.mpg|1.137114616E9|
|/shuttle/missions/sts-70/movies/sts-70-launch.mpg           |1.098853893E9|
|/shuttle/technology/sts-newsref/stsref-toc.html             |1.061238918E9|
|/shuttle/missions/sts-53/movies/sts-53-launch.mpg           |1.034715432E9|
|/shuttle/missions/sts-69/count69.gif                        |1.005927794E9|
|/shuttle/countdown/video/livevideo2.gif                     |9.81889941E8 |
|/shuttle/countdown/count70.gif                              |9.20583544E8 |
|/shuttle/countdown/count.gif                                |8.2991008E8  |

Además, queremos saber que recurso de nuestra web es el que más tráfico recibe. Es decir, el recurso con más registros en nuestro log.

In [24]:
preg5 = logs_df_parquet.groupBy(F.col('endpoint')).agg(F.count(F.col('endpoint')).alias('count'))\
                                        .orderBy(F.col('count').desc()).show(truncate=False)

+-----------------------------------------------+------+
|endpoint                                       |count |
+-----------------------------------------------+------+
|/images/NASA-logosmall.gif                     |208714|
|/images/KSC-logosmall.gif                      |164970|
|/images/MOSAIC-logosmall.gif                   |127908|
|/images/USA-logosmall.gif                      |127074|
|/images/WORLD-logosmall.gif                    |125925|
|/images/ksclogo-medium.gif                     |121572|
|/ksc.html                                      |83909 |
|/images/launch-logo.gif                        |76006 |
|/history/apollo/images/apollo-logo1.gif        |68896 |
|/shuttle/countdown/                            |64736 |
|/                                              |63171 |
|/images/ksclogosmall.gif                       |61393 |
|/shuttle/missions/missions.html                |47315 |
|/images/launchmedium.gif                       |40687 |
|/htbin/cdt_main.pl            

¿Qué días la web recibió más tráfico?

In [25]:
logs_df2_parquet = logs_df_parquet.withColumn('dia',F.substring_index(F.col("timestamp"), ":", 1))

preg6 = logs_df2_parquet.groupBy(F.col('dia')).agg(F.count(F.col('endpoint')).alias('count'))\
                                        .orderBy(F.col('count').desc()).show(truncate=False)

+-----------+------+
|dia        |count |
+-----------+------+
|13/Jul/1995|134203|
|06/Jul/1995|100960|
|05/Jul/1995|94575 |
|12/Jul/1995|92536 |
|31/Aug/1995|90125 |
|03/Jul/1995|89584 |
|07/Jul/1995|87233 |
|14/Jul/1995|84103 |
|30/Aug/1995|80641 |
|11/Jul/1995|80407 |
|17/Jul/1995|74981 |
|10/Jul/1995|72860 |
|19/Jul/1995|72738 |
|04/Jul/1995|70452 |
|29/Aug/1995|67988 |
|20/Jul/1995|66593 |
|01/Jul/1995|64714 |
|21/Jul/1995|64629 |
|18/Jul/1995|64282 |
|24/Jul/1995|64259 |
+-----------+------+
only showing top 20 rows



¿Cuáles son los hosts son los más frecuentes?

In [26]:
preg7 = logs_df_parquet.groupBy(F.col('host')).count()\
                                        .orderBy(F.col('count').desc()).show(truncate=False)

+--------------------+-----+
|host                |count|
+--------------------+-----+
|piweba3y.prodigy.com|21988|
|piweba4y.prodigy.com|16437|
|piweba1y.prodigy.com|12825|
|edams.ksc.nasa.gov  |11964|
|163.206.89.4        |9697 |
|news.ti.com         |8161 |
|www-d1.proxy.aol.com|8047 |
|alyssa.prodigy.com  |8037 |
|                    |7661 |
|siltb10.orl.mmc.com |7573 |
|www-a2.proxy.aol.com|7516 |
|www-b2.proxy.aol.com|7266 |
|piweba2y.prodigy.com|7246 |
|www-b3.proxy.aol.com|7218 |
|www-d4.proxy.aol.com|7211 |
|www-b5.proxy.aol.com|7080 |
|www-d2.proxy.aol.com|6984 |
|www-b4.proxy.aol.com|6972 |
|www-d3.proxy.aol.com|6895 |
|webgate1.mot.com    |6749 |
+--------------------+-----+
only showing top 20 rows



In [27]:
logs_df_completo.filter(F.col('host').isin(''))\
                         .select("host","value").show(2000,truncate=False)

+----+--------------------------------------------------------------------------------------------------------------------------------+
|host|value                                                                                                                           |
+----+--------------------------------------------------------------------------------------------------------------------------------+
|    |triton - - [01/Aug/1995:09:56:30 -0400] "GET /ksc.html HTTP/1.0" 200 7280                                                       |
|    |triton - - [01/Aug/1995:09:56:31 -0400] "GET /images/MOSAIC-logosmall.gif HTTP/1.0" 200 363                                     |
|    |triton - - [01/Aug/1995:09:56:31 -0400] "GET /images/NASA-logosmall.gif HTTP/1.0" 200 786                                       |
|    |triton - - [01/Aug/1995:09:56:31 -0400] "GET /images/ksclogo-medium.gif HTTP/1.0" 200 5866                                      |
|    |triton - - [01/Aug/1995:09:56:31 -0400] "G

¿A qué horas se produce el mayor número de tráfico en la web?

In [28]:
logs_df3_parquet = logs_df_parquet.withColumn('hora',F.substring(F.col("timestamp"), 13,2))

preg8 = logs_df3_parquet.groupBy(F.col('hora')).agg(F.count(F.col('endpoint')).alias('count'))\
                                        .orderBy(F.col('count').desc()).show(truncate=False)

+----+------+
|hora|count |
+----+------+
|15  |230665|
|12  |227228|
|13  |225350|
|14  |223873|
|16  |217564|
|11  |211064|
|10  |193816|
|09  |178664|
|17  |178443|
|08  |149193|
|18  |146091|
|22  |131432|
|19  |131091|
|21  |129907|
|20  |129753|
|23  |123932|
|00  |110312|
|07  |101403|
|01  |91597 |
|02  |77805 |
+----+------+
only showing top 20 rows



¿Cuál es el número de errores 404 que ha habido cada día?

In [29]:
preg9 = logs_df2_parquet.where("status == '404'").groupBy(F.col('dia')).count()\
                                        .show(truncate=False)

+-----------+-----+
|dia        |count|
+-----------+-----+
|02/Jul/1995|291  |
|21/Aug/1995|305  |
|06/Aug/1995|373  |
|16/Jul/1995|257  |
|07/Aug/1995|537  |
|11/Aug/1995|263  |
|27/Jul/1995|336  |
|07/Jul/1995|570  |
|17/Jul/1995|406  |
|15/Jul/1995|254  |
|18/Jul/1995|465  |
|26/Jul/1995|336  |
|03/Aug/1995|304  |
|18/Aug/1995|256  |
|17/Aug/1995|271  |
|14/Aug/1995|287  |
|10/Jul/1995|398  |
|04/Jul/1995|359  |
|20/Aug/1995|312  |
|20/Jul/1995|428  |
+-----------+-----+
only showing top 20 rows

