## To see if all works correctly

In [1]:
spark

<pyspark.sql.session.SparkSession at 0x7fdf05027910>

## Load modules

In [4]:
from pyspark.sql import Row
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import udf
from pyspark.sql.functions import col
from pyspark.sql.types import *
import matplotlib.pyplot as plt
import numpy as np
import pylab as P
import pandas as pd
plt.rcdefaults()

### Read data from csv file

In [3]:
folder="/home/herimanitra/Téléchargements/apache2015*"
# reading option 1:
#dcsv = spark.read.csv(folder, header=True, inferSchema=False)
# reading option 2:
dcsv=sqlContext.read.format('com.databricks.spark.csv').options(header='true',inferschema='true').option("escape", ":").option("delimiter", ",").load(folder)

## How many rows:

In [4]:
dcsv.count()

139261

### Printing schema

In [9]:
dcsv.printSchema()

root
 |-- content: string (nullable = true)
 |-- path: string (nullable = true)
 |-- repo_name: string (nullable = true)
 |-- type: string (nullable = true)
 |-- org_id: integer (nullable = true)
 |-- created_at: string (nullable = true)
 |-- PRcreated_at: string (nullable = true)
 |-- action: string (nullable = true)
 |-- prbody: string (nullable = true)
 |-- prtitle: string (nullable = true)



## SPLITTING BY LOC without nulling other columns:

In [5]:
#dcsv.registerTempTable("dcsv")
#dcsv1=sqlContext.sql(""" SELECT  split(content, 'pointvirgule')[0] as line,content,path,repo_name,type,org_id,created_at,PRcreated_at,action,prbody,prtitle FROM dcsv  """)
from pyspark.sql.functions import split, explode
dcsv1=dcsv.select(explode(split(col("content"), "pointvirgule")).alias("line"),col("content"),
             col("path"),col("repo_name"),col("type"),col("org_id"),col("created_at"),
             col("PRcreated_at"),col("action"),col("prbody"),col("prtitle"))


In [6]:
dcsv1.select(['line','content','repo_name']).show(150)

+--------------------+--------------------+--------------------+
|                line|             content|           repo_name|
+--------------------+--------------------+--------------------+
|/**  * Licensed t...|/**  * Licensed t...|apache/incubator-...|
|  you may not use...|/**  * Licensed t...|apache/incubator-...|
|   import java.ne...|/**  * Licensed t...|apache/incubator-...|
|  import java.uti...|/**  * Licensed t...|apache/incubator-...|
|   import com.goo...|/**  * Licensed t...|apache/incubator-...|
|   import org.cod...|/**  * Licensed t...|apache/incubator-...|
|  import org.juni...|/**  * Licensed t...|apache/incubator-...|
|  import org.juni...|/**  * Licensed t...|apache/incubator-...|
|  import org.juni...|/**  * Licensed t...|apache/incubator-...|
|  import org.juni...|/**  * Licensed t...|apache/incubator-...|
|   import com.dat...|/**  * Licensed t...|apache/incubator-...|
|   /**  * Test fo...|/**  * Licensed t...|apache/incubator-...|
|      Map<Object ...|/**

## Modifying existing  column by creating new DF

In [7]:
dcsv2 = dcsv1.withColumn("id", monotonically_increasing_id())
dcsv2.select(['id','content','repo_name']).show()

+---+--------------------+--------------------+
| id|             content|           repo_name|
+---+--------------------+--------------------+
|  0|/**  * Licensed t...|apache/incubator-...|
|  1|/**  * Licensed t...|apache/incubator-...|
|  2|/**  * Licensed t...|apache/incubator-...|
|  3|/**  * Licensed t...|apache/incubator-...|
|  4|/**  * Licensed t...|apache/incubator-...|
|  5|/**  * Licensed t...|apache/incubator-...|
|  6|/**  * Licensed t...|apache/incubator-...|
|  7|/**  * Licensed t...|apache/incubator-...|
|  8|/**  * Licensed t...|apache/incubator-...|
|  9|/**  * Licensed t...|apache/incubator-...|
| 10|/**  * Licensed t...|apache/incubator-...|
| 11|/**  * Licensed t...|apache/incubator-...|
| 12|/**  * Licensed t...|apache/incubator-...|
| 13|/**  * Licensed t...|apache/incubator-...|
| 14|/**  * Licensed t...|apache/incubator-...|
| 15|/**  * Licensed t...|apache/incubator-...|
| 16|/**  * Licensed t...|apache/incubator-...|
| 17|/**  * Licensed t...|apache/incubat

## remove rows with non code things

In [9]:
def removeNonCode(colval):
    import re
    if  re.match(r'^//|/[*]|/[**]',colval) is not None:
        return 0
    else:
        return 1
from pyspark.sql.functions import udf    
changeColUDF = udf(removeNonCode,IntegerType())
dcsv4 = dcsv3.withColumn("iscode", changeColUDF("line"))
#& col('repo_name').isNotNull()
dcsv5= dcsv4.where( (col('iscode')==1)  )
dcsv5.select(['id','line','repo_name']).show()

+---+--------------------+--------------------+
| id|                line|           repo_name|
+---+--------------------+--------------------+
|  1|you may not use t...|apache/incubator-...|
|  2|import java.net.M...|apache/incubator-...|
|  3|import java.util.Map|apache/incubator-...|
|  4|import com.google...|apache/incubator-...|
|  5|import org.codeha...|apache/incubator-...|
|  6|import org.junit....|apache/incubator-...|
|  7|import org.junit....|apache/incubator-...|
|  8|import org.junit....|apache/incubator-...|
|  9|import org.junit....|apache/incubator-...|
| 10|import com.datato...|apache/incubator-...|
| 12|Map<Object virgul...|apache/incubator-...|
| 13|tuple.put( quote ...|apache/incubator-...|
| 14|tuple.put( quote ...|apache/incubator-...|
| 15|tuple.put( quote ...|apache/incubator-...|
| 16|MapBasedCouchDbOu...|apache/incubator-...|
| 17|CouchDbStore stor...|apache/incubator-...|
| 18|store.setDbName(C...|apache/incubator-...|
| 19|dbOutputOper.setS...|apache/incubat

# How many LOC per repo do we have in apache project for this particular csv file?

In [35]:
dcsv5.select(['line','repo_name']).groupBy('repo_name').count()

DataFrame[repo_name: string, count: bigint]

### Adding new columns based on existing one

In [10]:
def changeCol(colval):
    import re
    matchObj = re.search( 'if', colval)
    if  matchObj: 
        return '1'
    else:
        return '0'
from pyspark.sql.functions import udf
changeColUDF = udf(changeCol,StringType())
dfmodified1 = dcsv5.withColumn("ifstatement", changeColUDF("line"))
dfmodified1.select(['id','ifstatement','repo_name','line']).show()

+---+-----------+--------------------+--------------------+
| id|ifstatement|           repo_name|                line|
+---+-----------+--------------------+--------------------+
|  1|          1|apache/incubator-...|you may not use t...|
|  2|          0|apache/incubator-...|import java.net.M...|
|  3|          0|apache/incubator-...|import java.util.Map|
|  4|          0|apache/incubator-...|import com.google...|
|  5|          0|apache/incubator-...|import org.codeha...|
|  6|          0|apache/incubator-...|import org.junit....|
|  7|          0|apache/incubator-...|import org.junit....|
|  8|          0|apache/incubator-...|import org.junit....|
|  9|          0|apache/incubator-...|import org.junit....|
| 10|          0|apache/incubator-...|import com.datato...|
| 12|          0|apache/incubator-...|Map<Object virgul...|
| 13|          0|apache/incubator-...|tuple.put( quote ...|
| 14|          0|apache/incubator-...|tuple.put( quote ...|
| 15|          0|apache/incubator-...|tu

## Filter selection from a DataFrame

In [11]:
dfmodified1.select(["id","ifstatement","repo_name","line"]).where(dfmodified1["ifstatement"]=='1').show()

+---+-----------+--------------------+--------------------+
| id|ifstatement|           repo_name|                line|
+---+-----------+--------------------+--------------------+
|  1|          1|apache/incubator-...|you may not use t...|
| 98|          1|apache/incubator-...|you may not use t...|
|110|          1|apache/incubator-...|unifier.aggregati...|
|111|          1|apache/incubator-...|unifier.beginWind...|
|132|          1|apache/incubator-...|unifier.process(t...|
|133|          1|apache/incubator-...|unifier.process(t...|
|134|          1|apache/incubator-...|unifier.process(t...|
|135|          1|apache/incubator-...|unifier.process(t...|
|136|          1|apache/incubator-...|unifier.process(t...|
|137|          1|apache/incubator-...| unifier.endWindow()|
|142|          1|apache/incubator-...|if (key.equals( q...|
|143|          1|apache/incubator-...|} if (key.equals(...|
|144|          1|apache/incubator-...|} if (key.equals(...|
|147|          1|    apache/metamodel|yo

### insert new column to indicate start of a bloc code { in java

In [12]:
def changeCol(colval):
    import re
    matchObj = re.search( '{', colval)
    if  matchObj: 
        return '1'
    else:
        return '0'
from pyspark.sql.functions import udf
changeColUDF = udf(changeCol,StringType())
dfmodified2 = dfmodified1.withColumn("startblock", changeColUDF("line"))
dfmodified2.select(['id','ifstatement','startblock','repo_name','line']).where(dfmodified2["startblock"]=='1').show()

+---+-----------+----------+--------------------+--------------------+
| id|ifstatement|startblock|           repo_name|                line|
+---+-----------+----------+--------------------+--------------------+
| 33|          0|         1|apache/incubator-...|} @Test public vo...|
| 64|          0|         1|apache/incubator-...|} public class Te...|
| 72|          0|         1|apache/incubator-...|public float getF...|
| 73|          0|         1|apache/incubator-...|} public void set...|
| 74|          0|         1|apache/incubator-...|} public char get...|
| 75|          0|         1|apache/incubator-...|} public void set...|
| 76|          0|         1|apache/incubator-...|} public TestPOJO...|
| 77|          0|         1|apache/incubator-...|} public void set...|
| 78|          0|         1|apache/incubator-...|} public String g...|
| 79|          0|         1|apache/incubator-...|} public void set...|
| 80|          0|         1|apache/incubator-...|} public String g...|
| 81| 

In [13]:
def changeCol(colval):
    import re
    matchObj = re.search( '}', colval)
    if  matchObj: 
        return '1'
    else:
        return '0'
changeColUDF = udf(changeCol,StringType())
dfmodified3 = dfmodified2.withColumn("endblock", changeColUDF("line"))
dfmodified3.select(['id','ifstatement','startblock','endblock','repo_name','line']).where(col("endblock")=='1').show()

+---+-----------+----------+--------+--------------------+--------------------+
| id|ifstatement|startblock|endblock|           repo_name|                line|
+---+-----------+----------+--------+--------------------+--------------------+
| 33|          0|         1|       1|apache/incubator-...|} @Test public vo...|
| 64|          0|         1|       1|apache/incubator-...|} public class Te...|
| 73|          0|         1|       1|apache/incubator-...|} public void set...|
| 74|          0|         1|       1|apache/incubator-...|} public char get...|
| 75|          0|         1|       1|apache/incubator-...|} public void set...|
| 76|          0|         1|       1|apache/incubator-...|} public TestPOJO...|
| 77|          0|         1|       1|apache/incubator-...|} public void set...|
| 78|          0|         1|       1|apache/incubator-...|} public String g...|
| 79|          0|         1|       1|apache/incubator-...|} public void set...|
| 80|          0|         1|       1|apa

## While statement:

In [14]:
def changeCol(colval):
    import re
    matchObj = re.search( '^while', colval)
    if  matchObj: 
        return '1'
    else:
        return '0'
from pyspark.sql.functions import udf
changeColUDF = udf(changeCol,StringType())
dfmodified4 = dfmodified3.withColumn("whilestatement", changeColUDF("line"))
dfmodified4.select(['id','startblock','endblock','whilestatement','repo_name','line']).where(col("whilestatement")=='1').show()

+----+----------+--------+--------------+--------------------+--------------------+
|  id|startblock|endblock|whilestatement|           repo_name|                line|
+----+----------+--------+--------------+--------------------+--------------------+
| 597|         1|       0|             1|apache/incubator-...|while (!stop) { c...|
|1029|         1|       0|             1|         apache/oodt|while (configMetK...|
|1270|         1|       0|             1|         apache/oodt|while (rs.next())...|
|1328|         1|       0|             1|         apache/oodt|while (rs.next())...|
|1347|         1|       0|             1|         apache/oodt|while (rs.next())...|
|1367|         1|       0|             1|         apache/oodt|while (rs.next())...|
|1533|         1|       0|             1|apache/incubator-...|while (this.isRun...|
|2454|         1|       0|             1|        apache/camel|while (verbit1.ha...|
|2460|         1|       0|             1|        apache/camel|while (it.hasN

### For statement

In [15]:
def changeCol(colval):
    import re
    matchObj = re.search( '^for [(]', colval)
    if  matchObj: 
        return '1'
    else:
        return '0'
from pyspark.sql.functions import udf
changeColUDF = udf(changeCol,StringType())
dfmodified5 = dfmodified4.withColumn("forstatement", changeColUDF("line"))
dfmodified5.select(['id','endblock','whilestatement','forstatement','repo_name','line']).where(col("forstatement")=='1').show()

+----+--------+--------------+------------+--------------------+--------------------+
|  id|endblock|whilestatement|forstatement|           repo_name|                line|
+----+--------+--------------+------------+--------------------+--------------------+
| 140|       0|             0|           1|apache/incubator-...|for (Map<String v...|
| 399|       0|             0|           1|        apache/camel|for (Map<String v...|
| 530|       0|             0|           1|        apache/camel|for (Route route ...|
| 546|       0|             0|           1|        apache/camel|for (RevCommit co...|
| 598|       0|             0|           1|apache/incubator-...|        for (int j=0|
| 914|       0|             0|           1|         apache/oodt|      for (int i = 0|
| 952|       0|             0|           1|         apache/oodt|for (Object heade...|
| 993|       0|             0|           1|         apache/oodt|for (Map.Entry<Ob...|
|1424|       0|             0|           1|apache/incu

### Switch statement

In [16]:
def changeCol(colval):
    import re
    matchObj = re.search( '^switch', colval)
    if  matchObj: 
        return '1'
    else:
        return '0'
from pyspark.sql.functions import udf
from pyspark.sql.functions import col
changeColUDF = udf(changeCol,StringType())
dfmodified6 = dfmodified5.withColumn("switchstmt", changeColUDF("line"))
mycols=['id','whilestatement','forstatement','switchstmt','repo_name','line']
dfmodified6.select(mycols).where(col("switchstmt")=='1').show()

+-----+--------------+------------+----------+--------------------+--------------------+
|   id|whilestatement|forstatement|switchstmt|           repo_name|                line|
+-----+--------------+------------+----------+--------------------+--------------------+
| 6590|             0|           0|         1|apache/incubator-...|switch (callType)...|
| 9637|             0|           0|         1|        apache/camel|switch (operation...|
|10352|             0|           0|         1|apache/jackrabbit...|switch(timeframe)...|
|11607|             0|           0|         1|        apache/camel|switch (operation...|
|12007|             0|           0|         1|        apache/camel|switch (operation...|
|13248|             0|           0|         1|apache/incubator-...|switch (path.size...|
|20029|             0|           0|         1|        apache/camel|switchToDefaultCh...|
|22110|             0|           0|         1|apache/incubator-...|switch (path.size...|
|22168|             0

### affectation equals =

In [17]:
def changeCol(colval):
    import re
    matchObj = re.search( '=', colval)
    if  matchObj: 
        return '1'
    else:
        return '0'
from pyspark.sql.functions import udf
changeColUDF = udf(changeCol,StringType())
dfmodified7 = dfmodified6.withColumn("affectation", changeColUDF("line"))
mycols=['id','forstatement','switchstmt','affectation','repo_name','line']
dfmodified7.select(mycols).where(col("affectation")=='1').show()

+---+------------+----------+-----------+--------------------+--------------------+
| id|forstatement|switchstmt|affectation|           repo_name|                line|
+---+------------+----------+-----------+--------------------+--------------------+
| 12|           0|         0|          1|apache/incubator-...|Map<Object virgul...|
| 16|           0|         0|          1|apache/incubator-...|MapBasedCouchDbOu...|
| 17|           0|         0|          1|apache/incubator-...|CouchDbStore stor...|
| 33|           0|         0|          1|apache/incubator-...|} @Test public vo...|
| 34|           0|         0|          1|apache/incubator-...|TestPOJO tuple = ...|
| 41|           0|         0|          1|apache/incubator-...|Address address =...|
| 45|           0|         0|          1|apache/incubator-...|CouchDBPOJOOutput...|
| 46|           0|         0|          1|apache/incubator-...|CouchDbStore stor...|
| 49|           0|         0|          1|apache/incubator-...|String express

## isEqual ==

In [18]:
def changeCol(colval):
    import re
    matchObj = re.search( '==', colval)
    if  matchObj: 
        return '1'
    else:
        return '0'
from pyspark.sql.functions import udf
changeColUDF = udf(changeCol,StringType())
dfmodified8 = dfmodified7.withColumn("equality", changeColUDF("line"))
mycols=['id','switchstmt','affectation','equality','repo_name','line']
dfmodified8.select(mycols).where(col("equality")=='1').show()

+----+----------+-----------+--------+----------------+--------------------+
|  id|switchstmt|affectation|equality|       repo_name|                line|
+----+----------+-----------+--------+----------------+--------------------+
| 171|         0|          1|       1|apache/metamodel|@Override public ...|
| 172|         0|          1|       1|apache/metamodel|} if (max == null...|
| 214|         0|          1|       1|    apache/camel|from( quote cxf:b...|
| 486|         0|          1|       1|    apache/camel|} @Override publi...|
| 512|         0|          1|       1|    apache/camel|} public CamelLog...|
| 517|         0|          1|       1|    apache/camel|} private int get...|
| 524|         0|          1|       1|    apache/camel|if (scope == Thro...|
| 528|         0|          1|       1|    apache/camel|if (scope == Thro...|
| 786|         0|          1|       1|     apache/oodt|} /* * (non-Javad...|
| 796|         0|          1|       1|     apache/oodt|if (finalFilePath...|

### Logical operation

In [19]:
def changeCol(colval):
    import re
    matchObj = re.search( '!=|==', colval)
    if  matchObj: 
        return '1'
    else:
        return '0'
from pyspark.sql.functions import udf
changeColUDF = udf(changeCol,StringType())
dfmodified9 = dfmodified8.withColumn("logicOp", changeColUDF("line"))
mycols=['id','switchstmt','affectation','equality','logicOp','repo_name','line']
dfmodified9.select(mycols).where(col("logicOp")=='1').show()

+---+----------+-----------+--------+-------+----------------+--------------------+
| id|switchstmt|affectation|equality|logicOp|       repo_name|                line|
+---+----------+-----------+--------+-------+----------------+--------------------+
|171|         0|          1|       1|      1|apache/metamodel|@Override public ...|
|172|         0|          1|       1|      1|apache/metamodel|} if (max == null...|
|214|         0|          1|       1|      1|    apache/camel|from( quote cxf:b...|
|486|         0|          1|       1|      1|    apache/camel|} @Override publi...|
|512|         0|          1|       1|      1|    apache/camel|} public CamelLog...|
|517|         0|          1|       1|      1|    apache/camel|} private int get...|
|524|         0|          1|       1|      1|    apache/camel|if (scope == Thro...|
|528|         0|          1|       1|      1|    apache/camel|if (scope == Thro...|
|668|         0|          1|       0|      1|    apache/camel|} @Override pr

## Arithmetic [+]|-|%|/|[*] 

In [20]:
def changeCol(colval):
    import re
    matchObj = re.search( '[+]|-|%|/|[*]', colval)
    if  matchObj: 
        return '1'
    else:
        return '0'
from pyspark.sql.functions import udf
changeColUDF = udf(changeCol,StringType())
dfmodified10= dfmodified9.withColumn("arithmetic", changeColUDF("line"))
dfmodified10.select(['id','equality','logicOp','arithmetic','repo_name','line']).where(col("arithmetic")=='1').show()

+---+--------+-------+----------+--------------------+--------------------+
| id|equality|logicOp|arithmetic|           repo_name|                line|
+---+--------+-------+----------+--------------------+--------------------+
|  1|       0|      0|         1|apache/incubator-...|you may not use t...|
| 24|       0|      0|         1|apache/incubator-...|tuple.put( quote ...|
| 32|       0|      0|         1|apache/incubator-...|Assert.assertEqua...|
| 61|       0|      0|         1|apache/incubator-...|Assert.assertEqua...|
| 98|       0|      0|         1|apache/incubator-...|you may not use t...|
|142|       0|      0|         1|apache/incubator-...|if (key.equals( q...|
|143|       0|      0|         1|apache/incubator-...|} if (key.equals(...|
|144|       0|      0|         1|apache/incubator-...|} if (key.equals(...|
|147|       0|      0|         1|    apache/metamodel|you may not use t...|
|167|       0|      0|         1|    apache/metamodel|you may not use t...|
|178|       

## relations : ">|<|>=|<="

In [21]:
def changeCol(colval):
    import re
    matchObj = re.search( '[>]|[<]|[>=]|[<=]', colval)
    if  matchObj: 
        return '1'
    else:
        return '0'
from pyspark.sql.functions import udf
changeColUDF = udf(changeCol,StringType())
dfmodified11= dfmodified10.withColumn("relation", changeColUDF("line"))
mycols=['id','logicOp','arithmetic','relation','repo_name','line']
dfmodified11.select(mycols).where(col("relation")=='1').show()

+---+-------+----------+--------+--------------------+--------------------+
| id|logicOp|arithmetic|relation|           repo_name|                line|
+---+-------+----------+--------+--------------------+--------------------+
| 12|      0|         0|       1|apache/incubator-...|Map<Object virgul...|
| 16|      0|         0|       1|apache/incubator-...|MapBasedCouchDbOu...|
| 17|      0|         0|       1|apache/incubator-...|CouchDbStore stor...|
| 33|      0|         0|       1|apache/incubator-...|} @Test public vo...|
| 34|      0|         0|       1|apache/incubator-...|TestPOJO tuple = ...|
| 41|      0|         0|       1|apache/incubator-...|Address address =...|
| 45|      0|         0|       1|apache/incubator-...|CouchDBPOJOOutput...|
| 46|      0|         0|       1|apache/incubator-...|CouchDbStore stor...|
| 49|      0|         0|       1|apache/incubator-...|String expression...|
| 73|      0|         0|       1|apache/incubator-...|} public void set...|
| 75|      0

## EXTRACT class name:

In [23]:
def changeCol(colval):
    import re
    matchObj = re.search("^public class (\w+)", colval)
    if  matchObj is not None: 
        return matchObj.group(1)
    else:
        return ''
from pyspark.sql.functions import udf
changeColUDF = udf(changeCol,StringType())
dfmodified13= dfmodified12.withColumn("className", changeColUDF("line"))
dfmodified13.select(['class','className','path','line']).where(dfmodified13["className"]!='').show()

+-----+--------------------+--------------------+--------------------+
|class|           className|                path|                line|
+-----+--------------------+--------------------+--------------------+
|    1|SqlGeneratedKeysTest|components/camel-...|public class SqlG...|
|    1|   GitCommitConsumer|components/camel-...|public class GitC...|
|    1|        StatusThread|core/conn/trafci/...|public class Stat...|
|    1|NettyHttpHeadersTest|components/camel-...|public class Nett...|
|    1|     ObjectContracts|core/applib/src/m...|public class Obje...|
|    1|         XsltDTDTest|camel-core/src/te...|public class Xslt...|
|    1|       CamelOperator|components/camel-...|public class Came...|
|    1|    SjmsEndpointTest|components/camel-...|public class Sjms...|
|    1|     ScanCommandTest|components/camel-...|public class Scan...|
|    1|JasyptPropertiesP...|components/camel-...|public class Jasy...|
|    1|JacksonIncludeDef...|components/camel-...|public class Jack...|
|    1

### Look at importation

In [25]:
def changeCol(colval):
    import re
    matchObj = re.search("^import", colval)
    if  matchObj is not None: 
        return '1'
    else:
        return '0'
from pyspark.sql.functions import udf
changeColUDF = udf(changeCol,StringType())
dfmodified14= dfmodified13.withColumn("importation", changeColUDF("line"))
dfmodified14.select(['class','className','importation','line']).where(dfmodified14["importation"]=='1').show()

+-----+---------+-----------+--------------------+
|class|className|importation|                line|
+-----+---------+-----------+--------------------+
|    0|         |          1|import java.net.M...|
|    0|         |          1|import java.util.Map|
|    0|         |          1|import com.google...|
|    0|         |          1|import org.codeha...|
|    0|         |          1|import org.junit....|
|    0|         |          1|import org.junit....|
|    0|         |          1|import org.junit....|
|    0|         |          1|import org.junit....|
|    0|         |          1|import com.datato...|
|    0|         |          1|import java.util....|
|    0|         |          1|import java.util....|
|    0|         |          1|import java.util.Map|
|    0|         |          1|import java.util....|
|    0|         |          1|import org.junit....|
|    0|         |          1|import org.junit....|
|    0|         |          1|import org.apache...|
|    0|         |          1|im

## New instance *= new | this.

In [27]:
def changeCol(colval):
    import re
    matchObj = re.search("this[.](\w+)|(\w+)[.]", colval)
    if  matchObj is not None: 
        return matchObj.group(1)
    else:
        return ''
from pyspark.sql.functions import udf
changeColUDF = udf(changeCol,StringType())
dfmodified15= dfmodified14.withColumn("className2", changeColUDF("line"))
dfmodified15.select(['class','className','className2','line']).where(dfmodified15["className2"]!='').show()

+-----+---------+------------------+--------------------+
|class|className|        className2|                line|
+-----+---------+------------------+--------------------+
|    0|         |                fl|} public void set...|
|    0|         |                ch|} public void set...|
|    0|         |           address|} public void set...|
|    0|         |          revision|} public void set...|
|    0|         |       output_type|} public void set...|
|    0|         |               _id|} public void set...|
|    0|         |              name|} public void set...|
|    0|         |              type|} public void set...|
|    0|         |       housenumber|} public void set...|
|    0|         |              city|} public void set...|
|    0|         |         instances|public FlexUpClus...|
|    0|         |           profile|this.profile = pr...|
|    0|         |       constraints|this.constraints ...|
|    0|         |         instances|} public void set...|
|    0|       

In [29]:
def changeCol(colval):
    import re
    matchObj = re.search("new (\w+)", colval)
    if  matchObj is not None: 
        return matchObj.group(1)
    else:
        return ''
from pyspark.sql.functions import udf
changeColUDF = udf(changeCol,StringType())
dfmodified16= dfmodified15.withColumn("className3", changeColUDF("line"))
dfmodified16.select(['className','className2','className3','line']).where(dfmodified16["className3"]!='').show()

+---------+----------+--------------------+--------------------+
|className|className2|          className3|                line|
+---------+----------+--------------------+--------------------+
|         |          |MapBasedCouchDbOu...|MapBasedCouchDbOu...|
|         |          |        CouchDbStore|CouchDbStore stor...|
|         |      null|OperatorContextTe...|dbOutputOper.setu...|
|         |          |            TestPOJO|TestPOJO tuple = ...|
|         |          |             Address|Address address =...|
|         |          |CouchDBPOJOOutput...|CouchDBPOJOOutput...|
|         |          |        CouchDbStore|CouchDbStore stor...|
|         |      null|OperatorContextTe...|dbOutputOper.setu...|
|         |          |   CollectorTestSink|CollectorTestSink...|
|         |          |             HashMap|Map<String virgul...|
|         |      null|     DimensionObject|tuple1.put( quote...|
|         |      null|     DimensionObject|tuple1.put( quote...|
|         |      null|   

## Another class Name

In [31]:
def changeCol(colval):
    import re
    matchObj = re.search("^(\w+) = new", colval)
    if  matchObj is not None: 
        return matchObj.group(1)
    else:
        return ''
from pyspark.sql.functions import udf
changeColUDF = udf(changeCol,StringType())
dfmodified17= dfmodified16.withColumn("className4", changeColUDF("line"))
dfmodified17.select(['className','className2','className3','className4','line']).where(dfmodified17["className4"]!='').show()

+---------+----------+--------------------+--------------------+--------------------+
|className|className2|          className3|          className4|                line|
+---------+----------+--------------------+--------------------+--------------------+
|         |          |           ArrayList|              values|values = new Arra...|
|         |          |              Vector|        productTypes|productTypes = ne...|
|         |          |CamelContextFacto...|             factory|factory = new Cam...|
|         |          |    DdbConfiguration|       configuration|configuration = n...|
|         |          |     DefaultExchange|            exchange|exchange = new De...|
|         |          |         ScanCommand|             command|command = new Sca...|
|         |          |         JmsTemplate|                 jms|jms = new JmsTemp...|
|         |          |             HashMap|  namespacePrefixMap|namespacePrefixMa...|
|         |      null|  SoapJaxbDataFormat|       soap

# Is main entry program?

In [32]:
def changeCol(colval):
    import re
    matchObj = re.search("public static void main", colval)
    if  matchObj is not None: 
        return '1'
    else:
        return '0'
from pyspark.sql.functions import udf
changeColUDF = udf(changeCol,StringType())
dfmodified18= dfmodified17.withColumn("main", changeColUDF("line"))
dfmodified18.select(['className2','className3','main','line']).where(dfmodified18["main"]=='1').show()

+----------+--------------------+----+--------------------+
|className2|          className3|main|                line|
+----------+--------------------+----+--------------------+
|          | MesosExecutorDriver|   1|} public static v...|
|          |TikaCmdLineMetExt...|   1|} } public static...|
|      null|                    |   1|} /** * Emits a o...|
|          |                    |   1|public static voi...|
|      null|                    |   1|} public static v...|
|          |                    |   1|} } public static...|
|          |                 URI|   1|} public static v...|
|          |                    |   1|} public static v...|
|          |                    |   1|public class Prep...|
|          |                    |   1|public static voi...|
|          |             Options|   1|} public static v...|
|          |                    |   1|public class Stat...|
|      null|                    |   1|} } /** For debug...|
|      null|           StatsTest|   1|} 

In [33]:
dfmodified18.count()

1744048

## Is test?

# Turn a DataFrame into an Resilient Distributed Data (RDD)

In [52]:
myrdd= dfmodified3.rdd
myrdd

MapPartitionsRDD[143] at javaToPython at NativeMethodAccessorImpl.java:0

## How to remove a column

In [None]:
dfmodified3 = dfmodified2.drop('newCOl')

### Read data from json file

In [2]:
df = sqlContext.read.format('json').load("/home/herimanitra/Téléchargements/apachajava2016000000000000.json")

In [3]:
df = spark.read.json("/home/herimanitra/Téléchargements/apachajava2016000000000000.json")

In [9]:
def countLOC (row):
    return row.split()

## filter

In [44]:
dp = sc.textFile('/home/herimanitra/Téléchargements/apachajava2016000000000000.json')

In [16]:
#dp.map(countLOC).take(2)

## Another way:

In [14]:
df.createOrReplaceTempView("df")

In [None]:
df = df.dropDuplicates()

In [None]:
spark.sql("select  action,PRcreated_at,prtitle,prbody ,repo_name,content from df  ").show(5) 
#.collect()

In [None]:
import pyspark.sql.functions as fn

df.agg(
    fn.count('prbody').alias('prbodyCount'),
    fn.countDistinct('repo_name').alias('reponameCount')
).show()

In [93]:
def splitMe (row):
    x=row.split(";")
    return len(x)

In [7]:
df.select('content').rdd.flatMap( lambda line: line) .map(splitMe).take(2)

In [67]:
df

DataFrame[PRcreated_at: string, action: string, content: string, created_at: string, org_id: string, org_url: string, path: string, prbody: string, prtitle: string, repo_name: string, type: string]

In [45]:
splitLines= dp.map(lambda l: l.split(";"))

In [46]:
X= splitLines.map(lambda col: Row(PRcreated_at=col[0], 
                                  action=col[1],
                                  content=col[2] ,
                                  created_at=col[3],
                                  org_id=col[4],
                                  org_url=col[5],path=col[6],
                                  prbody=col[7],prtitle=col[8],repo_name=col[9],type=col[10]) )

## Convert to dataframe

In [47]:
Xdf= spark.createDataFrame(X)

## One hot encoding

In [1]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

In [2]:
df = spark.createDataFrame([
    (0, "a"),
    (1, "b"),
    (2, "c"),
    (3, "a"),
    (4, "a"),
    (5, "c")
], ["id", "category"])

In [3]:
df.show()

+---+--------+
| id|category|
+---+--------+
|  0|       a|
|  1|       b|
|  2|       c|
|  3|       a|
|  4|       a|
|  5|       c|
+---+--------+



In [4]:
stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = stringIndexer.fit(df)
indexed = model.transform(df)
encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
encoded = encoder.transform(indexed)
encoded.show()

+---+--------+-------------+-------------+
| id|category|categoryIndex|  categoryVec|
+---+--------+-------------+-------------+
|  0|       a|          0.0|(2,[0],[1.0])|
|  1|       b|          2.0|    (2,[],[])|
|  2|       c|          1.0|(2,[1],[1.0])|
|  3|       a|          0.0|(2,[0],[1.0])|
|  4|       a|          0.0|(2,[0],[1.0])|
|  5|       c|          1.0|(2,[1],[1.0])|
+---+--------+-------------+-------------+



# New datasets from APACHE FOUNDATION

## Read csv.gz


In [1]:
folder="/home/herimanitra/Téléchargements/data-master/apache20*.csv.gz"
mydata = spark.read.csv(folder, header=True, inferSchema=True)
mydata.select(["content","sample_repo_name","avg_commiter_timestamp","sample_path"]).show()

+--------------------+--------------------+----------------------+--------------------+
|             content|    sample_repo_name|avg_commiter_timestamp|         sample_path|
+--------------------+--------------------+----------------------+--------------------+
|/ etoile   etoile...|apache/incubator-...|  2015-10-30 18:42:...|lang/java/reef-wa...|
|/ etoile   etoile...|lburgazzoli/apach...|  2015-08-10 14:19:...|artemis-server/sr...|
|/ etoile   etoile...|apache/activemq-a...|  2015-08-10 14:19:...|artemis-core-clie...|
|/ etoile   etoile...|         apache/jena|  2015-10-26 02:55:...|jena-cmds/src/mai...|
|/ etoile   etoile...|         apache/jena|  2015-08-12 22:50:...|jena-core/src/mai...|
|/ etoile   etoile...|         apache/jena|  2015-08-11 16:16:...|jena-arq/src/main...|
|/ etoile   etoile...|         apache/jena|  2015-08-21 18:28:...|jena-core/src/mai...|
|/ etoile   etoile...|       apache/ignite|  2015-10-13 21:19:...|modules/core/src/...|
|/ etoile   etoile...|      apac

### print schema of contents file

In [5]:
mydata.printSchema()

root
 |-- content: string (nullable = true)
 |-- sample_repo_name: string (nullable = true)
 |-- sample_path: string (nullable = true)
 |-- min_commiter_timestamp: string (nullable = true)
 |-- min_author_timestamp: string (nullable = true)
 |-- min_committer_time: integer (nullable = true)
 |-- avg_commiter_timestamp: string (nullable = true)
 |-- avg_committer_time: integer (nullable = true)



In [2]:
projects=mydata.select(["sample_repo_name"]).toPandas()['sample_repo_name'].unique()
projects

array([u'apache/incubator-reef', u'lburgazzoli/apache-activemq-artemis',
       u'apache/activemq-artemis', u'apache/jena', u'apache/ignite',
       u'apache/stratos', u'apache/reef', u'apache/maven-plugins',
       u'apache/incubator-tinkerpop', u'gridgain/apache-ignite',
       u'apache/activemq-openwire', u'apacheignite/ignite',
       u'apache/flex-blazeds', u'apache/santuario-java',
       u'jonpspri/apache-commons-net', u'zouzias/apache-nutch-2.3',
       u'apache/juddi', u'apache/tapestry-5', u'apache/wss4j',
       u'apache/airavata', u'apache/portals-pluto',
       u'apache/incubator-freemarker', u'apache/ode',
       u'apache/commons-lang', u'apache/zest-qi4j', u'dash-/apache-openaz',
       u'snorden/parquet-mr-apache-parquet-1.7.0', u'apache/qpid-java',
       u'apache/incubator-streams', u'apache/incubator-rya',
       u'apache/velocity-engine', u'apache/incubator-openaz',
       u'apache/commons-jexl', u'apache/sis',
       u'lburgazzoli/apache-logging-log4j2',
       u'a

In [5]:
mydata.where((col('sample_repo_name')=='apache/activemq-artemis') |
             (col('sample_repo_name')=='apache/activemq-openwire') ).show()

+--------------------+--------------------+--------------------+----------------------+--------------------+------------------+----------------------+------------------+
|             content|    sample_repo_name|         sample_path|min_commiter_timestamp|min_author_timestamp|min_committer_time|avg_commiter_timestamp|avg_committer_time|
+--------------------+--------------------+--------------------+----------------------+--------------------+------------------+----------------------+------------------+
|/ etoile   etoile...|apache/activemq-a...|artemis-core-clie...|  2015-08-10 13:26:...|2015-08-10 13:26:...|        1439213202|  2015-08-10 14:19:...|        1439216357|
|/ etoile  etoile ...|apache/activemq-o...|openwire-core/src...|  2015-12-01 19:16:...|2015-12-01 19:16:...|        1448997392|  2015-12-01 19:16:...|        1448997392|
|/ etoile  etoile ...|apache/activemq-o...|openwire-legacy/s...|  2015-10-24 23:39:...|2015-10-24 23:39:...|        1445729973|  2015-11-12 21:28:...|

## remove white space

In [7]:
name = 'line'
udf = udf(lambda x: ' '.join(x.split()), StringType())
X = mydata.where((col('sample_repo_name')=='apache/activemq-artemis') |
             (col('sample_repo_name')=='apache/activemq-openwire') ).select( *[udf(column).alias(name) if column == name else column for column in mydata.columns])
X.select(['sample_repo_name','content','avg_commiter_timestamp']).show()

+--------------------+--------------------+----------------------+
|    sample_repo_name|             content|avg_commiter_timestamp|
+--------------------+--------------------+----------------------+
|apache/activemq-a...|/ etoile   etoile...|  2015-08-10 14:19:...|
|apache/activemq-o...|/ etoile  etoile ...|  2015-12-01 19:16:...|
|apache/activemq-o...|/ etoile  etoile ...|  2015-11-12 21:28:...|
|apache/activemq-a...|/ etoile   etoile...|  2015-12-06 21:03:...|
|apache/activemq-a...|/ etoile  etoile ...|  2015-12-07 22:32:...|
|apache/activemq-a...|/ etoile   etoile...|  2015-12-22 03:01:...|
|apache/activemq-a...|/ etoile  etoile ...|  2015-12-06 21:03:...|
|apache/activemq-a...|/ etoile  etoile ...|  2015-12-06 21:03:...|
|apache/activemq-o...|/ etoile  etoile ...|  2015-11-12 21:28:...|
|apache/activemq-a...|/ etoile  etoile ...|  2015-12-06 21:03:...|
|apache/activemq-a...|/ etoile   etoile...|  2015-12-06 21:03:...|
|apache/activemq-a...|/ etoile  etoile ...|  2015-12-07 22:32:

## count number of LOC java files per 2014->2016

In [8]:
from pyspark.sql.functions import split, explode
X2=X.select(explode(split(col("content"), "pointvirgule")).alias("line"),
                          col("sample_repo_name"),col("sample_path"),col("avg_committer_time")
           )

In [9]:
X2.show()

+--------------------+--------------------+--------------------+------------------+
|                line|    sample_repo_name|         sample_path|avg_committer_time|
+--------------------+--------------------+--------------------+------------------+
|/ etoile   etoile...|apache/activemq-a...|artemis-core-clie...|        1439216357|
|  you may not use...|apache/activemq-a...|artemis-core-clie...|        1439216357|
| import org.apach...|apache/activemq-a...|artemis-core-clie...|        1439216357|
| / etoile  etoile...|apache/activemq-a...|artemis-core-clie...|        1439216357|
|                   }|apache/activemq-a...|artemis-core-clie...|        1439216357|
|/ etoile  etoile ...|apache/activemq-o...|openwire-core/src...|        1448997392|
|  you may not use...|apache/activemq-o...|openwire-core/src...|        1448997392|
| import org.apach...|apache/activemq-o...|openwire-core/src...|        1448997392|
| import org.apach...|apache/activemq-o...|openwire-core/src...|        1448

In [10]:
X2.groupBy('sample_repo_name').count().show()

+--------------------+-----+
|    sample_repo_name|count|
+--------------------+-----+
|apache/activemq-o...|35552|
|apache/activemq-a...|28281|
+--------------------+-----+



## public class (method)

In [10]:
def changeCol(colval):
    import re
    matchObj = re.search( 'public class', colval)
    if  matchObj: 
        return '1'
    else:
        return '0'
from pyspark.sql.functions import udf
from pyspark.sql.functions import col
changeColUDF = udf(changeCol,StringType())
commonText2= X2.withColumn("class", changeColUDF("line"))
mycols=['sample_repo_name','sample_path','class','line']
commonText2.select(mycols).where(col("class")=='1').show(10)

+--------------------+--------------------+-----+--------------------+
|    sample_repo_name|         sample_path|class|                line|
+--------------------+--------------------+-----+--------------------+
|apache/activemq-o...|openwire-core/src...|    1| / etoile  etoile...|
|apache/activemq-o...|openwire-legacy/s...|    1| public class Ope...|
|apache/activemq-a...|tests/integration...|    1| / etoile  etoile...|
|apache/activemq-a...|tests/activemq5-u...|    1| public class NIO...|
|apache/activemq-a...|tests/activemq5-u...|    1| / etoile  etoile...|
|apache/activemq-o...|openwire-legacy/s...|    1| public class Ope...|
|apache/activemq-a...|tests/activemq5-u...|    1| / etoile  etoile...|
|apache/activemq-a...|tests/activemq5-u...|    1| / etoile  etoile...|
|apache/activemq-a...|tests/activemq5-u...|    1| / etoile  etoile...|
|apache/activemq-a...|tests/activemq5-u...|    1| / etoile  etoile...|
+--------------------+--------------------+-----+--------------------+
only s

In [11]:
mycols=['line','sample_repo_name','sample_path','avg_committer_time','class']
pivot = commonText2.groupBy(mycols).pivot("class").agg({"class": "count"})
pivot.show()

+--------------------+--------------------+--------------------+------------------+-----+---+----+
|                line|    sample_repo_name|         sample_path|avg_committer_time|class|  0|   1|
+--------------------+--------------------+--------------------+------------------+-----+---+----+
|     @OpenWirePro...|apache/activemq-o...|openwire-core/src...|        1448997392|    0|  1|null|
|       info.setVa...|apache/activemq-a...|tests/activemq5-u...|        1449435785|    0|  1|null|
|         copy.bro...|apache/activemq-o...|openwire-core/src...|        1448997392|    0|  1|null|
|         looseMar...|apache/activemq-o...|openwire-legacy/s...|        1447363682|    0|  1|null|
|/ etoile   etoile...|apache/activemq-o...|openwire-core/src...|        1448997392|    0|  1|null|
|     }    / etoil...|apache/activemq-o...|openwire-legacy/s...|        1447363682|    0|  1|null|
|          Connect...|apache/activemq-a...|tests/activemq5-u...|        1439216357|    0|  1|null|
|         

# Commits in apache java

In [15]:
folder="/home/herimanitra/Téléchargements/data-master/apachecommits*.csv.gz"
commit = spark.read.csv(folder, header=True, inferSchema=False)
commit.show(5)

+--------------------+--------------------+--------------------+--------------------+----------+
|             subject|              commit|        author_email|         author_date| repo_name|
+--------------------+--------------------+--------------------+--------------------+----------+
|TableRowIterator:...|6ca3779c9f82aed1e...|02ee1f888690a9586...|2007-03-28 14:49:...|apache/fop|
|Do not issue a wa...|71e26a23cca94642a...|02ee1f888690a9586...|2010-08-31 14:20:...|apache/fop|
|Ignore new FindBu...|786bb34ba120d158e...|02ee1f888690a9586...|2012-06-18 14:07:...|apache/fop|
|Fed up with all t...|efec6345486dcbde3...|02ee1f888690a9586...|2008-07-24 09:35:...|apache/fop|
|Simplification in...|7a41e0678ab61d5a2...|02ee1f888690a9586...|2007-12-18 16:48:...|apache/fop|
+--------------------+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [9]:
commit.count()

799994

# All PR from Apache foundation in 2014->2016

In [10]:
folder="/home/herimanitra/Téléchargements/data/PR20*.csv.gz"
z = spark.read.csv(folder, header=True, inferSchema=False)
z.select(['type','repo_name','PRcreated_at','prtitle','prbody']).show(350)

+----------------+---------------+--------------------+--------------------+--------------------+
|            type|      repo_name|        PRcreated_at|             prtitle|              prbody|
+----------------+---------------+--------------------+--------------------+--------------------+
|PullRequestEvent|     apache/cxf|2016-12-23 15:44:...|[CXF-7183] Fix sy...|                null|
|PullRequestEvent|     apache/cxf|2016-09-07 08:29:...|[CXF-7043] JAX-RS...|See: https://issu...|
|PullRequestEvent|     apache/cxf|2016-09-08 14:26:...|[CXF-7045] Update...|https://issues po...|
|PullRequestEvent|     apache/orc|2016-11-30 19:56:...|Fix sample code f...|` point schema()`...|
|PullRequestEvent|     apache/orc|2016-02-23 06:16:...|ORC-10 point  Cor...|This patch:  * Cr...|
|PullRequestEvent|     apache/orc|2016-06-02 15:22:...|ORC-61 point  Upd...|        … pom point |
|PullRequestEvent|     apache/orc|2016-10-10 16:53:...|ORC-105 point  Fi...|Signed-off-by: Ow...|
|PullRequestEvent|  

In [11]:
z.printSchema()

root
 |-- type: string (nullable = true)
 |-- org_id: string (nullable = true)
 |-- repo_name: string (nullable = true)
 |-- repo_url: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- PRcreated_at: string (nullable = true)
 |-- action: string (nullable = true)
 |-- prbody: string (nullable = true)
 |-- prtitle: string (nullable = true)



In [12]:
z.groupBy(['repo_name']).count().show()

+--------------------+-----+
|           repo_name|count|
+--------------------+-----+
|apache/incubator-...|  359|
|apache/couchdb-jq...|    2|
|apache/cordova-pl...|    4|
|apache/struts-exa...|    2|
|        apache/karaf|  307|
|     apache/brooklyn|   17|
| apache/commons-text|   41|
|apache/couchdb-co...|   54|
|apache/incubator-...|    2|
|apache/maven-doxi...|    1|
|         apache/solr|    2|
|apache/incubator-...|  312|
|apache/cordova-pl...|  142|
|  apache/maven-wagon|   22|
|     apache/airavata|  131|
|apache/cordova-pl...|   17|
|apache/cordova-ap...|    9|
|apache/cordova-pl...|    2|
|       apache/ignite| 1845|
|apache/brooklyn-dist|  140|
+--------------------+-----+
only showing top 20 rows



## convert spark column into python List

In [13]:
prdate=[i.PRcreated_at  for i in z.select('PRcreated_at').collect() if i.PRcreated_at is not None] 
len(prdate)

76992

## Pivot

In [14]:
from pyspark.sql.functions import first
mycol= ["type","org_id","repo_name","repo_url","created_at","PRcreated_at", "action","prbody","prtitle"]
pivot = z.where((col('repo_name')=='apache/ignite')).groupBy(mycol).pivot("PRcreated_at").agg({"PRcreated_at": "count"})
pivot.show()

+----------------+------+-------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+---------

In [15]:
pivot.count()

1845

### conversion Date

In [17]:
z1=z.withColumn("Date", (col("PRcreated_at").cast("date")))

In [18]:
z1.select(['PRcreated_at','Date']).show()

+--------------------+----------+
|        PRcreated_at|      Date|
+--------------------+----------+
|2016-12-23 15:44:...|2016-12-23|
|2016-09-07 08:29:...|2016-09-07|
|2016-09-08 14:26:...|2016-09-08|
|2016-11-30 19:56:...|2016-11-30|
|2016-02-23 06:16:...|2016-02-23|
|2016-06-02 15:22:...|2016-06-02|
|2016-10-10 16:53:...|2016-10-10|
|2016-03-28 18:12:...|2016-03-28|
|2016-08-11 19:32:...|2016-08-11|
|2016-03-15 23:16:...|2016-03-15|
|2016-09-29 21:37:...|2016-09-29|
|2016-04-06 22:31:...|2016-04-06|
|2016-05-06 21:41:...|2016-05-06|
|2016-08-13 03:56:...|2016-08-13|
|2016-12-22 21:01:...|2016-12-22|
|2016-07-15 21:41:...|2016-07-15|
|2016-05-09 15:51:...|2016-05-09|
|2016-10-26 17:00:...|2016-10-26|
|2016-03-12 16:23:...|2016-03-12|
|2016-10-10 16:50:...|2016-10-10|
+--------------------+----------+
only showing top 20 rows

