In [1]:
from pyspark import SparkContext, SparkConf
import pandas as pd

In [2]:
import pandas as pd
p_df = pd.DataFrame([("foo", 1), ("bar", 2), ("foo", 3)], columns=("k", "v"))
p_df['k'].unique()

array(['foo', 'bar'], dtype=object)

In [3]:
df_pandas = pd.read_csv("Dati/doc1-2015100810.csv")
df_pandas.head(5)

Unnamed: 0,CODLINHA,NOMELINHA,CODVEICULO,NUMEROCARTAO,DATAUTILIZACAO
0,280,N. SRA.DE NAZARÉ,BC911,1430250,07/10/15 07:37:02
1,280,N. SRA.DE NAZARÉ,BC911,2470195,07/10/15 07:51:25
2,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:49
3,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:51
4,0,OPER S/LINHA,08047,771305,07/10/15 16:47:16


# Getting Started

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

### how to create a Dataframe in pyspark

In [5]:
s_df = spark.createDataFrame([("foo", 1), ("bar", 2), ("foo", 3)], ('k', 'v'))

### Read csv (as a dataframe) and show it

In [6]:
df = spark.read.csv("Dati/doc1-2015100810.csv", header = True)

In [7]:
df.show()

+--------+----------------+----------+------------+-----------------+
|CODLINHA|       NOMELINHA|CODVEICULO|NUMEROCARTAO|   DATAUTILIZACAO|
+--------+----------------+----------+------------+-----------------+
|     280|N. SRA.DE NAZARÉ|     BC911|  0001430250|07/10/15 07:37:02|
|     280|N. SRA.DE NAZARÉ|     BC911|  0002470195|07/10/15 07:51:25|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|07/10/15 18:49:49|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|07/10/15 18:49:51|
|     000|    OPER S/LINHA|     08047|  0000771305|07/10/15 16:47:16|
|     000|    OPER S/LINHA|     08047|  0000856665|07/10/15 16:50:18|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002115218|07/10/15 13:18:26|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002178679|07/10/15 14:18:59|
|     629|  ALTO BOQUEIRÃO|     KA603|  0000849493|07/10/15 14:40:56|
|     629|  ALTO BOQUEIRÃO|     KA603|  0000849493|07/10/15 14:41:05|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002202962|07/10/15 16:48:12|
|     629|  ALTO BOQ

In [8]:
# spark, df are from the previous example
# Print the schema in a tree format
df.printSchema()

root
 |-- CODLINHA: string (nullable = true)
 |-- NOMELINHA: string (nullable = true)
 |-- CODVEICULO: string (nullable = true)
 |-- NUMEROCARTAO: string (nullable = true)
 |-- DATAUTILIZACAO: string (nullable = true)



In [9]:
# Select only the "codlinha" column
df.select("codlinha").show()

+--------+
|codlinha|
+--------+
|     280|
|     280|
|     280|
|     280|
|     000|
|     000|
|     629|
|     629|
|     629|
|     629|
|     629|
|     629|
|     814|
|     814|
|     814|
|     814|
|     629|
|     629|
|     653|
|     653|
+--------+
only showing top 20 rows



In [10]:
df.select(df['codlinha'], df['nomelinha']).show()

+--------+----------------+
|codlinha|       nomelinha|
+--------+----------------+
|     280|N. SRA.DE NAZARÉ|
|     280|N. SRA.DE NAZARÉ|
|     280|N. SRA.DE NAZARÉ|
|     280|N. SRA.DE NAZARÉ|
|     000|    OPER S/LINHA|
|     000|    OPER S/LINHA|
|     629|  ALTO BOQUEIRÃO|
|     629|  ALTO BOQUEIRÃO|
|     629|  ALTO BOQUEIRÃO|
|     629|  ALTO BOQUEIRÃO|
|     629|  ALTO BOQUEIRÃO|
|     629|  ALTO BOQUEIRÃO|
|     814|       MOSSUNGUÊ|
|     814|       MOSSUNGUÊ|
|     814|       MOSSUNGUÊ|
|     814|       MOSSUNGUÊ|
|     629|  ALTO BOQUEIRÃO|
|     629|  ALTO BOQUEIRÃO|
|     653|          SABARÁ|
|     653|          SABARÁ|
+--------+----------------+
only showing top 20 rows



In [11]:
df.filter(df['codlinha'] == "280").show()

+--------+----------------+----------+------------+-----------------+
|CODLINHA|       NOMELINHA|CODVEICULO|NUMEROCARTAO|   DATAUTILIZACAO|
+--------+----------------+----------+------------+-----------------+
|     280|N. SRA.DE NAZARÉ|     BC911|  0001430250|07/10/15 07:37:02|
|     280|N. SRA.DE NAZARÉ|     BC911|  0002470195|07/10/15 07:51:25|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|07/10/15 18:49:49|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|07/10/15 18:49:51|
|     280|N. SRA.DE NAZARÉ|     BC947|  0001426341|07/10/15 17:55:07|
|     280|N. SRA.DE NAZARÉ|     BN602|  0001282115|07/10/15 06:29:09|
|     280|N. SRA.DE NAZARÉ|     BN602|  0002883432|07/10/15 15:19:38|
|     280|N. SRA.DE NAZARÉ|     BN602|  0002883432|07/10/15 15:19:43|
|     280|N. SRA.DE NAZARÉ|     BN602|  0003530435|07/10/15 19:22:40|
|     280|N. SRA.DE NAZARÉ|     BN602|  0002025345|07/10/15 22:58:12|
|     280|N. SRA.DE NAZARÉ|     BN602|  0003253117|07/10/15 16:44:16|
|     280|N. SRA.DE 

In [12]:
df.groupBy("codlinha").count().show()

+--------+-----+
|codlinha|count|
+--------+-----+
|     467|  608|
|     829| 1098|
|     870| 2228|
|     666|  664|
|     TXA|  711|
|     475| 1390|
|     718|  398|
|     030| 4757|
|     205| 1711|
|     169| 1071|
|     334|  374|
|     TSP|  904|
|     462| 1935|
|     711|  769|
|     272|  864|
|     470|  477|
|     232|  805|
|     635|  365|
|     714|  344|
|     ARA| 8965|
+--------+-----+
only showing top 20 rows



In [13]:
df.groupBy("codlinha").count().orderBy('count', ascending=False).show()

+--------+------+
|codlinha| count|
+--------+------+
|     000|229253|
|     OPC| 36660|
|     ARA|  8965|
|     040|  7288|
|     021|  6109|
|     020|  4839|
|     216|  4808|
|     030|  4757|
|     050|  4420|
|     541|  3083|
|     650|  2957|
|     703|  2906|
|     TPH|  2659|
|     860|  2566|
|     338|  2525|
|     684|  2507|
|     658|  2448|
|     545|  2232|
|     876|  2231|
|     870|  2228|
+--------+------+
only showing top 20 rows



### Running SQL Queries Programmatically

In [14]:
# Register the DataFrame as a SQL temporary view
df.createOrReplaceTempView("people")

sqlDF = spark.sql("SELECT * FROM people")
sqlDF.show()

+--------+----------------+----------+------------+-----------------+
|CODLINHA|       NOMELINHA|CODVEICULO|NUMEROCARTAO|   DATAUTILIZACAO|
+--------+----------------+----------+------------+-----------------+
|     280|N. SRA.DE NAZARÉ|     BC911|  0001430250|07/10/15 07:37:02|
|     280|N. SRA.DE NAZARÉ|     BC911|  0002470195|07/10/15 07:51:25|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|07/10/15 18:49:49|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|07/10/15 18:49:51|
|     000|    OPER S/LINHA|     08047|  0000771305|07/10/15 16:47:16|
|     000|    OPER S/LINHA|     08047|  0000856665|07/10/15 16:50:18|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002115218|07/10/15 13:18:26|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002178679|07/10/15 14:18:59|
|     629|  ALTO BOQUEIRÃO|     KA603|  0000849493|07/10/15 14:40:56|
|     629|  ALTO BOQUEIRÃO|     KA603|  0000849493|07/10/15 14:41:05|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002202962|07/10/15 16:48:12|
|     629|  ALTO BOQ

## SQL Timestamp

In [15]:
from pyspark.sql.functions import col, unix_timestamp, to_date, to_timestamp

df_timestamp = df.withColumn('datautilizacao', to_timestamp(unix_timestamp(col('datautilizacao'), 'dd/MM/yy HH:mm:ss').cast("timestamp")))

df_timestamp.show()

df_timestamp.createOrReplaceTempView("people_timestamp")

sqlDF = spark.sql("SELECT * FROM people_timestamp WHERE datautilizacao == '2015-10-07 16:47:16'")
sqlDF.show()

+--------+----------------+----------+------------+-------------------+
|CODLINHA|       NOMELINHA|CODVEICULO|NUMEROCARTAO|     datautilizacao|
+--------+----------------+----------+------------+-------------------+
|     280|N. SRA.DE NAZARÉ|     BC911|  0001430250|2015-10-07 07:37:02|
|     280|N. SRA.DE NAZARÉ|     BC911|  0002470195|2015-10-07 07:51:25|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|2015-10-07 18:49:49|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|2015-10-07 18:49:51|
|     000|    OPER S/LINHA|     08047|  0000771305|2015-10-07 16:47:16|
|     000|    OPER S/LINHA|     08047|  0000856665|2015-10-07 16:50:18|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002115218|2015-10-07 13:18:26|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002178679|2015-10-07 14:18:59|
|     629|  ALTO BOQUEIRÃO|     KA603|  0000849493|2015-10-07 14:40:56|
|     629|  ALTO BOQUEIRÃO|     KA603|  0000849493|2015-10-07 14:41:05|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002202962|2015-10-07 16

### Global Temporary View

Temporary views in Spark SQL are session-scoped and will disappear if the session that creates it terminates. If you want to have a temporary view that is shared among all sessions and keep alive until the Spark application terminates, you can create a global temporary view. Global temporary view is tied to a system preserved database global_temp, and we must use the qualified name to refer it, e.g. SELECT * FROM global_temp.view1.

In [16]:
# Register the DataFrame as a global temporary view
df.createGlobalTempView("people")

# Global temporary view is tied to a system preserved database `global_temp`
spark.sql("SELECT * FROM global_temp.people").show()

+--------+----------------+----------+------------+-----------------+
|CODLINHA|       NOMELINHA|CODVEICULO|NUMEROCARTAO|   DATAUTILIZACAO|
+--------+----------------+----------+------------+-----------------+
|     280|N. SRA.DE NAZARÉ|     BC911|  0001430250|07/10/15 07:37:02|
|     280|N. SRA.DE NAZARÉ|     BC911|  0002470195|07/10/15 07:51:25|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|07/10/15 18:49:49|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|07/10/15 18:49:51|
|     000|    OPER S/LINHA|     08047|  0000771305|07/10/15 16:47:16|
|     000|    OPER S/LINHA|     08047|  0000856665|07/10/15 16:50:18|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002115218|07/10/15 13:18:26|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002178679|07/10/15 14:18:59|
|     629|  ALTO BOQUEIRÃO|     KA603|  0000849493|07/10/15 14:40:56|
|     629|  ALTO BOQUEIRÃO|     KA603|  0000849493|07/10/15 14:41:05|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002202962|07/10/15 16:48:12|
|     629|  ALTO BOQ

In [17]:
# Global temporary view is cross-session
spark.newSession().sql("SELECT * FROM global_temp.people").show()

+--------+----------------+----------+------------+-----------------+
|CODLINHA|       NOMELINHA|CODVEICULO|NUMEROCARTAO|   DATAUTILIZACAO|
+--------+----------------+----------+------------+-----------------+
|     280|N. SRA.DE NAZARÉ|     BC911|  0001430250|07/10/15 07:37:02|
|     280|N. SRA.DE NAZARÉ|     BC911|  0002470195|07/10/15 07:51:25|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|07/10/15 18:49:49|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|07/10/15 18:49:51|
|     000|    OPER S/LINHA|     08047|  0000771305|07/10/15 16:47:16|
|     000|    OPER S/LINHA|     08047|  0000856665|07/10/15 16:50:18|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002115218|07/10/15 13:18:26|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002178679|07/10/15 14:18:59|
|     629|  ALTO BOQUEIRÃO|     KA603|  0000849493|07/10/15 14:40:56|
|     629|  ALTO BOQUEIRÃO|     KA603|  0000849493|07/10/15 14:41:05|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002202962|07/10/15 16:48:12|
|     629|  ALTO BOQ

## Interoperating with RDDs
Spark SQL supports two different methods for converting existing RDDs into Datasets. The first method uses reflection to infer the schema of an RDD that contains specific types of objects. This reflection based approach leads to more concise code and works well when you already know the schema while writing your Spark application.

The second method for creating Datasets is through a programmatic interface that allows you to construct a schema and then apply it to an existing RDD. While this method is more verbose, it allows you to construct Datasets when the columns and their types are not known until runtime.

### Inferring the Schema Using Reflection (DOESN'T WORK AND I DON'T KNOW WHY, BUT THIS IS NOT IMPORTANT)

## TEST WITH THE LAST DATETIME COLUMN DIVIDED

I divided the last column in two (one for date and other for time)

In [18]:
df_test = spark.read.csv("Dati/test.csv", header = True)
df_test.show()
df_test.printSchema()

+--------+----------------+----------+------------+--------------+-------------+
|CODLINHA|       NOMELINHA|CODVEICULO|NUMEROCARTAO|DATAUTILIZACAO|ORAUTILIZACAO|
+--------+----------------+----------+------------+--------------+-------------+
|     280|N. SRA.DE NAZARÉ|     BC911|  0001430250|      07/10/15|     07:37:02|
|     280|N. SRA.DE NAZARÉ|     BC911|  0002470195|      07/10/15|     07:51:25|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|      07/10/15|     18:49:49|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|      07/10/15|     18:49:51|
|     000|    OPER S/LINHA|     08047|  0000771305|      07/10/15|     16:47:16|
|     000|    OPER S/LINHA|     08047|  0000856665|      07/10/15|     16:50:18|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002115218|      07/10/15|     13:18:26|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002178679|      07/10/15|     14:18:59|
|     629|  ALTO BOQUEIRÃO|     KA603|  0000849493|      07/10/15|     14:40:56|
|     629|  ALTO BOQUEIRÃO| 

## Convert column String to Datetime into the Dataframe

In [19]:


df_datetime = df_test.withColumn('datautilizacao', to_date(unix_timestamp(col('datautilizacao'), 'dd/MM/yy').cast("timestamp")))

df_datetime.show()
df_datetime.printSchema()

+--------+----------------+----------+------------+--------------+-------------+
|CODLINHA|       NOMELINHA|CODVEICULO|NUMEROCARTAO|datautilizacao|ORAUTILIZACAO|
+--------+----------------+----------+------------+--------------+-------------+
|     280|N. SRA.DE NAZARÉ|     BC911|  0001430250|    2015-10-07|     07:37:02|
|     280|N. SRA.DE NAZARÉ|     BC911|  0002470195|    2015-10-07|     07:51:25|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|    2015-10-07|     18:49:49|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|    2015-10-07|     18:49:51|
|     000|    OPER S/LINHA|     08047|  0000771305|    2015-10-07|     16:47:16|
|     000|    OPER S/LINHA|     08047|  0000856665|    2015-10-07|     16:50:18|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002115218|    2015-10-07|     13:18:26|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002178679|    2015-10-07|     14:18:59|
|     629|  ALTO BOQUEIRÃO|     KA603|  0000849493|    2015-10-07|     14:40:56|
|     629|  ALTO BOQUEIRÃO| 

## Convert String to Datetime (but the result is another column)

In [20]:
from pyspark.sql.types import *

#df_test = df_test.withColumn("datautilizacao", df_test["datautilizacao"].cast(StringType()))

from pyspark.sql.functions import to_date

df_test3 = df_test.select(to_date('datautilizacao', 'dd/MM/yy').alias('datautilizacao'))
#df_test.select(to_date(df_test.datautilizacao, 'dd/MM/yy').alias('datautilizacao'))

df_test3.show()
df_test3.printSchema()

+--------------+
|datautilizacao|
+--------------+
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
|    2015-10-07|
+--------------+
only showing top 20 rows

root
 |-- datautilizacao: date (nullable = true)



## SQL with Datatime

In [21]:
df_datetime.createOrReplaceTempView("people_test")

sqlDF = spark.sql("SELECT codlinha FROM people_test WHERE datautilizacao == '2015-10-07'")
sqlDF.show()

df_datetime.show()

+--------+
|codlinha|
+--------+
|     280|
|     280|
|     280|
|     280|
|     000|
|     000|
|     629|
|     629|
|     629|
|     629|
|     629|
|     629|
|     814|
|     814|
|     814|
|     814|
|     629|
|     629|
|     653|
|     653|
+--------+
only showing top 20 rows

+--------+----------------+----------+------------+--------------+-------------+
|CODLINHA|       NOMELINHA|CODVEICULO|NUMEROCARTAO|datautilizacao|ORAUTILIZACAO|
+--------+----------------+----------+------------+--------------+-------------+
|     280|N. SRA.DE NAZARÉ|     BC911|  0001430250|    2015-10-07|     07:37:02|
|     280|N. SRA.DE NAZARÉ|     BC911|  0002470195|    2015-10-07|     07:51:25|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|    2015-10-07|     18:49:49|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|    2015-10-07|     18:49:51|
|     000|    OPER S/LINHA|     08047|  0000771305|    2015-10-07|     16:47:16|
|     000|    OPER S/LINHA|     08047|  0000856665|    2015-10

## Distinct values multi-column

In [22]:
df.show()

+--------+----------------+----------+------------+-----------------+
|CODLINHA|       NOMELINHA|CODVEICULO|NUMEROCARTAO|   DATAUTILIZACAO|
+--------+----------------+----------+------------+-----------------+
|     280|N. SRA.DE NAZARÉ|     BC911|  0001430250|07/10/15 07:37:02|
|     280|N. SRA.DE NAZARÉ|     BC911|  0002470195|07/10/15 07:51:25|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|07/10/15 18:49:49|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|07/10/15 18:49:51|
|     000|    OPER S/LINHA|     08047|  0000771305|07/10/15 16:47:16|
|     000|    OPER S/LINHA|     08047|  0000856665|07/10/15 16:50:18|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002115218|07/10/15 13:18:26|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002178679|07/10/15 14:18:59|
|     629|  ALTO BOQUEIRÃO|     KA603|  0000849493|07/10/15 14:40:56|
|     629|  ALTO BOQUEIRÃO|     KA603|  0000849493|07/10/15 14:41:05|
|     629|  ALTO BOQUEIRÃO|     KA603|  0002202962|07/10/15 16:48:12|
|     629|  ALTO BOQ

In [23]:
df_distinct = df.dropDuplicates(['CODLINHA', 'NOMELINHA'])
df_distinct.show()

+--------+----------------+----------+------------+-----------------+
|CODLINHA|       NOMELINHA|CODVEICULO|NUMEROCARTAO|   DATAUTILIZACAO|
+--------+----------------+----------+------------+-----------------+
|     180|AVERDE/ABRANCHES|     HC302|  0010064208|07/10/15 08:07:59|
|     271|     LARANJEIRAS|     BC944|  0002801400|07/10/15 05:58:41|
|     331|        MERCÚRIO|     DA850|  0003471943|07/10/15 06:47:17|
|     673|         FORMOSA|     HN610|  0000378712|07/10/15 07:31:49|
|     184|        V. SUIÇA|     BN621|  0003154177|07/10/15 06:40:48|
|     212|           SOLAR|     BN406|  0002208699|07/10/15 06:22:11|
|     650|S.RITA/PINHEIRIN|     HA287|  0002345121|07/10/15 14:53:32|
|     674|  N. SRA. DA LUZ|     LC022|  0002288037|07/10/15 06:58:16|
|     680|         RURBANA|     HA299|  0003559777|07/10/15 13:30:54|
|     789|      MAD. CAIUÁ|     JC001|  0002297351|07/10/15 06:41:14|
|     913|    BUTIATUVINHA|     MN401|  0003204194|07/10/15 06:43:53|
|     182|       ABR

In [24]:
df_distinct.groupBy("codlinha").count().orderBy('count', ascending=False).show()

+--------+-----+
|codlinha|count|
+--------+-----+
|     829|    1|
|     467|    1|
|     TXA|    1|
|     870|    1|
|     666|    1|
|     232|    1|
|     635|    1|
|     714|    1|
|     475|    1|
|     718|    1|
|     462|    1|
|     470|    1|
|     272|    1|
|     711|    1|
|     ARA|    1|
|     521|    1|
|     625|    1|
|     169|    1|
|     030|    1|
|     TSP|    1|
+--------+-----+
only showing top 20 rows



In [25]:
df.count()

512013

In [26]:
df_distinct.count()

257

In [27]:
df_distinct.select(df_distinct["codlinha"], df_distinct["nomelinha"]).show()

+--------+----------------+
|codlinha|       nomelinha|
+--------+----------------+
|     180|AVERDE/ABRANCHES|
|     271|     LARANJEIRAS|
|     673|         FORMOSA|
|     331|        MERCÚRIO|
|     650|S.RITA/PINHEIRIN|
|     674|  N. SRA. DA LUZ|
|     680|         RURBANA|
|     184|        V. SUIÇA|
|     212|           SOLAR|
|     789|      MAD. CAIUÁ|
|     913|    BUTIATUVINHA|
|     182|       ABRANCHES|
|     X24|REFORÇO SAMBAQUI|
|     614|  FAZENDINHA/PUC|
|     464|A.MUN/J.BOTÂNICO|
|     720|FAZEN/C.COMPRIDO|
|     260|M.HERMES/STA.EFI|
|     060| INTERBAIRROS VI|
|     393|C.IMB./P.BARIGUI|
|     541|   BAIRRO NOVO A|
+--------+----------------+
only showing top 20 rows



In [28]:
df_distinct.select(df_distinct["codlinha"], df_distinct["nomelinha"]).count()

257

## Write CSV with dictionary of codlinha-nomelinha

In [29]:
dictionary = df_distinct.select(df_distinct["codlinha"], df_distinct["nomelinha"])
###dictionary.write.csv('dictionary.csv')  NON USARE
#dictionary.toPandas().to_csv('dictionary.csv', index=False)

## Pyspark Profiling

In [30]:
import spark_df_profiling

#report = spark_df_profiling.ProfileReport(df)


In [31]:
#report.to_file("testing.html")

## Diagnosis Dataframe

In [32]:
summary_df = df.describe('CODLINHA','NOMELINHA','CODVEICULO','NUMEROCARTAO','DATAUTILIZACAO')
summary_df.show()

+-------+-----------------+----------------+-----------------+------------------+-----------------+
|summary|         CODLINHA|       NOMELINHA|       CODVEICULO|      NUMEROCARTAO|   DATAUTILIZACAO|
+-------+-----------------+----------------+-----------------+------------------+-----------------+
|  count|           512013|          512013|           512013|            512013|           512013|
|   mean|235.2516363444996|            null|4333.230022916709|2776320.8005109243|             null|
| stddev|304.6590005314995|            null| 3040.95734919013|1164166.3394630125|             null|
|    min|              000|A.MUN/J.BOTÂNICO|            00001|        0000116745|07/10/15 00:01:24|
|    max|              Z03|       ZOOLÓGICO|            MR106|        0010066452|08/10/15 00:58:34|
+-------+-----------------+----------------+-----------------+------------------+-----------------+



In [33]:
summary_df = df.summary()
summary_df.show()

+-------+-----------------+----------------+-----------------+------------------+-----------------+
|summary|         CODLINHA|       NOMELINHA|       CODVEICULO|      NUMEROCARTAO|   DATAUTILIZACAO|
+-------+-----------------+----------------+-----------------+------------------+-----------------+
|  count|           512013|          512013|           512013|            512013|           512013|
|   mean|235.2516363444996|            null|4333.230022916709|2776320.8005109243|             null|
| stddev|304.6590005314995|            null| 3040.95734919013|1164166.3394630125|             null|
|    min|              000|A.MUN/J.BOTÂNICO|            00001|        0000116745|07/10/15 00:01:24|
|    25%|              0.0|            null|           2083.0|         2237296.0|             null|
|    50%|              0.0|            null|           3068.0|         2868707.0|             null|
|    75%|            532.0|            null|           7020.0|         3353338.0|             null|


In [34]:
df.dropna().count()
#se uguale a 512013 allora non ci sono missing values

512013

In [35]:
df.groupby('codlinha').agg({'codlinha': 'max'}).show(5)
#al posto di 'max' ci può andare un espressione come 'avg', 'min' o anche altre, ma non ne trovo altre

+--------+-------------+
|codlinha|max(codlinha)|
+--------+-------------+
|     467|          467|
|     829|          829|
|     666|          666|
|     870|          870|
|     TXA|          TXA|
+--------+-------------+
only showing top 5 rows



## Adding a new column

In [36]:
df.withColumn('Purchase_new', col('codlinha')).select('codlinha','Purchase_new').show(5)

+--------+------------+
|codlinha|Purchase_new|
+--------+------------+
|     280|         280|
|     280|         280|
|     280|         280|
|     280|         280|
|     000|         000|
+--------+------------+
only showing top 5 rows



In [37]:
df_new_column=df.withColumn('Purchase_new', col('codlinha'))
df_new_column.show(5)

+--------+----------------+----------+------------+-----------------+------------+
|CODLINHA|       NOMELINHA|CODVEICULO|NUMEROCARTAO|   DATAUTILIZACAO|Purchase_new|
+--------+----------------+----------+------------+-----------------+------------+
|     280|N. SRA.DE NAZARÉ|     BC911|  0001430250|07/10/15 07:37:02|         280|
|     280|N. SRA.DE NAZARÉ|     BC911|  0002470195|07/10/15 07:51:25|         280|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|07/10/15 18:49:49|         280|
|     280|N. SRA.DE NAZARÉ|     BC911|  0003234514|07/10/15 18:49:51|         280|
|     000|    OPER S/LINHA|     08047|  0000771305|07/10/15 16:47:16|         000|
+--------+----------------+----------+------------+-----------------+------------+
only showing top 5 rows



## Non-null values for row (Better with pandas)

In [38]:
#Pandas
#df_pandas.apply(lambda x: x.count(), axis=1)
#RITORNA IL NUMERO DI VALORI NON NULLI DI OGNI RIGA

In [39]:
#%timeit df_pandas.apply(lambda x: x.count(), axis=1)
#RISULTATO:14.4 s ± 286 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [40]:
df_pandas.isnull().sum(axis=1)

0         0
1         0
2         0
3         0
4         0
5         0
6         0
7         0
8         0
9         0
10        0
11        0
12        0
13        0
14        0
15        0
16        0
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        0
25        0
26        0
27        0
28        0
29        0
         ..
511983    0
511984    0
511985    0
511986    0
511987    0
511988    0
511989    0
511990    0
511991    0
511992    0
511993    0
511994    0
511995    0
511996    0
511997    0
511998    0
511999    0
512000    0
512001    0
512002    0
512003    0
512004    0
512005    0
512006    0
512007    0
512008    0
512009    0
512010    0
512011    0
512012    0
Length: 512013, dtype: int64

In [41]:
%timeit df_pandas.isnull().sum(axis=1)

85 ms ± 1.71 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Completeness with pandas (Raw method)

In [42]:
import numpy as np

test_pd_df = df_pandas
test_pd_df['null-values'] = test_pd_df.isnull().sum(axis=1)


conditions = [
    test_pd_df['null-values'] == 0,
    test_pd_df['null-values'] == 1,
    test_pd_df['null-values'] == 2,
    test_pd_df['null-values'] == 3,
    test_pd_df['null-values'] == 4]
choices = [100, 80, 60, 40, 20]
test_pd_df['completeness'] = np.select(conditions, choices, default=0)

test_pd_df.head(5)

Unnamed: 0,CODLINHA,NOMELINHA,CODVEICULO,NUMEROCARTAO,DATAUTILIZACAO,null-values,completeness
0,280,N. SRA.DE NAZARÉ,BC911,1430250,07/10/15 07:37:02,0,100
1,280,N. SRA.DE NAZARÉ,BC911,2470195,07/10/15 07:51:25,0,100
2,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:49,0,100
3,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:51,0,100
4,0,OPER S/LINHA,08047,771305,07/10/15 16:47:16,0,100


## Null values for column with pandas

In [43]:
df_pandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 512013 entries, 0 to 512012
Data columns (total 7 columns):
CODLINHA          512013 non-null object
NOMELINHA         512013 non-null object
CODVEICULO        512013 non-null object
NUMEROCARTAO      512013 non-null int64
DATAUTILIZACAO    512013 non-null object
null-values       512013 non-null int64
completeness      512013 non-null int64
dtypes: int64(3), object(4)
memory usage: 27.3+ MB


## Null values for column with pyspark

In [44]:
import pyspark.sql.functions as F

df_agg = df.agg(*[F.count(F.when(F.isnull(c), c)).alias(c) for c in df.columns])

df_agg.show()

+--------+---------+----------+------------+--------------+
|CODLINHA|NOMELINHA|CODVEICULO|NUMEROCARTAO|DATAUTILIZACAO|
+--------+---------+----------+------------+--------------+
|       0|        0|         0|           0|             0|
+--------+---------+----------+------------+--------------+



In [45]:
from functools import reduce
df_agg_col = reduce(
    lambda a, b: a.union(b),
    (
        df_agg.select(F.lit(c).alias("Column_Name"), F.col(c).alias("NULL_Count")) 
        for c in df_agg.columns
    )
)
df_agg_col.show()

+--------------+----------+
|   Column_Name|NULL_Count|
+--------------+----------+
|      CODLINHA|         0|
|     NOMELINHA|         0|
|    CODVEICULO|         0|
|  NUMEROCARTAO|         0|
|DATAUTILIZACAO|         0|
+--------------+----------+



## Count Null, NaN and empty of a column

In [46]:
from pyspark.sql.functions import isnan

df.filter((df["codlinha"] == "") | df["codlinha"].isNull() | isnan(df["codlinha"])).count()

0

# Consistency

In [47]:
test_dict = df_pandas
dict_pandas = pd.read_csv("dictionary.csv")
test_dict.info()
dict_pandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 512013 entries, 0 to 512012
Data columns (total 7 columns):
CODLINHA          512013 non-null object
NOMELINHA         512013 non-null object
CODVEICULO        512013 non-null object
NUMEROCARTAO      512013 non-null int64
DATAUTILIZACAO    512013 non-null object
null-values       512013 non-null int64
completeness      512013 non-null int64
dtypes: int64(3), object(4)
memory usage: 27.3+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257 entries, 0 to 256
Data columns (total 2 columns):
codlinha     257 non-null object
nomelinha    257 non-null object
dtypes: object(2)
memory usage: 4.1+ KB


In [48]:
dict_pandas2 = pd.read_csv("dictionary.csv", index_col="codlinha")
row_dict = dict_pandas2.loc["180"] 
row_dict["nomelinha"]

'AVERDE/ABRANCHES'

In [49]:
#Ci mette un pochino, ma tanto va fatta una volta sola
for i,row in test_dict.iterrows():
    row_dict = dict_pandas2.loc[row["CODLINHA"]]
    if row_dict["nomelinha"] == row["NOMELINHA"]:
        test_dict.at[i, 'DICTIONARY'] = 1
        
test_dict.head(5)

Unnamed: 0,CODLINHA,NOMELINHA,CODVEICULO,NUMEROCARTAO,DATAUTILIZACAO,null-values,completeness,DICTIONARY
0,280,N. SRA.DE NAZARÉ,BC911,1430250,07/10/15 07:37:02,0,100,1.0
1,280,N. SRA.DE NAZARÉ,BC911,2470195,07/10/15 07:51:25,0,100,1.0
2,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:49,0,100,1.0
3,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:51,0,100,1.0
4,0,OPER S/LINHA,08047,771305,07/10/15 16:47:16,0,100,1.0


In [50]:
#subsetDataFrame = dfObj[dfObj['Product'].isin(['Mangos', 'Grapes']) ]

In [51]:
#a = test_dict.apply(lambda y: 1 if ((y['CODLINHA'] == dict_pandas['codlinha']),(y['NOMELINHA'] == dict_pandas['nomelinha'])) else 0, axis=1)
#a


#def check_dict(x):
#    batch_df = df_pandas.apply(lambda y: 1 if (y['codlinha'] == x['codlinha'] and y['nomelinha'] == x['nomelinha']) else 0, axis=1)
#    return batch_df

#test_dict['dictionary'] = dict_pandas.apply(check_dict, axis=1)
#test_dict

## Faccio lista veicoli

In [52]:
df_distinct = df.dropDuplicates(["CODVEICULO"])
df_distinct.groupBy("CODVEICULO").count().orderBy('count', ascending=False).show()
list_vehicle = df_distinct.select(df_distinct["CODVEICULO"])
list_vehicle.toPandas().to_csv('list_vehicle.csv', index=False)

+----------+-----+
|CODVEICULO|count|
+----------+-----+
|     16S45|    1|
|     JC316|    1|
|     GR404|    1|
|     EA179|    1|
|     HA602|    1|
|     AC282|    1|
|     06065|    1|
|     GA143|    1|
|     GA165|    1|
|     GN600|    1|
|     AC019|    1|
|     05049|    1|
|     LA009|    1|
|     HA283|    1|
|     09028|    1|
|     08023|    1|
|     BC948|    1|
|     GC001|    1|
|     AC003|    1|
|     HA924|    1|
+----------+-----+
only showing top 20 rows



In [53]:
df_distinct.count()

1422

In [54]:
test_df = df_pandas
#vehicle_pandas = pd.read_csv("list_vehicle.csv", index_col="CODVEICULO")
vehicle_pandas = pd.read_csv("list_vehicle.csv")
test_df.info()
vehicle_pandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 512013 entries, 0 to 512012
Data columns (total 8 columns):
CODLINHA          512013 non-null object
NOMELINHA         512013 non-null object
CODVEICULO        512013 non-null object
NUMEROCARTAO      512013 non-null int64
DATAUTILIZACAO    512013 non-null object
null-values       512013 non-null int64
completeness      512013 non-null int64
DICTIONARY        512013 non-null float64
dtypes: float64(1), int64(3), object(4)
memory usage: 31.3+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1422 entries, 0 to 1421
Data columns (total 1 columns):
CODVEICULO    1422 non-null object
dtypes: object(1)
memory usage: 11.2+ KB


In [55]:
vehicle_pandas.head(5)

Unnamed: 0,CODVEICULO
0,HA241
1,JC604
2,EC303
3,MN404
4,AB304


In [56]:
test_df["CHECKVEICULO"] = test_df["CODVEICULO"].isin(vehicle_pandas["CODVEICULO"])
test_df.head(5)

Unnamed: 0,CODLINHA,NOMELINHA,CODVEICULO,NUMEROCARTAO,DATAUTILIZACAO,null-values,completeness,DICTIONARY,CHECKVEICULO
0,280,N. SRA.DE NAZARÉ,BC911,1430250,07/10/15 07:37:02,0,100,1.0,True
1,280,N. SRA.DE NAZARÉ,BC911,2470195,07/10/15 07:51:25,0,100,1.0,True
2,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:49,0,100,1.0,True
3,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:51,0,100,1.0,True
4,0,OPER S/LINHA,08047,771305,07/10/15 16:47:16,0,100,1.0,True


In [57]:
for i,row in test_df.iterrows():
    if row["CHECKVEICULO"] == True:
        test_df.at[i, 'CHECKVEHICLE'] = 1
    else:
        test_df.at[i, 'CHECKVEHICLE'] = 0
        
test_df.head(5)

Unnamed: 0,CODLINHA,NOMELINHA,CODVEICULO,NUMEROCARTAO,DATAUTILIZACAO,null-values,completeness,DICTIONARY,CHECKVEICULO,CHECKVEHICLE
0,280,N. SRA.DE NAZARÉ,BC911,1430250,07/10/15 07:37:02,0,100,1.0,True,1.0
1,280,N. SRA.DE NAZARÉ,BC911,2470195,07/10/15 07:51:25,0,100,1.0,True,1.0
2,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:49,0,100,1.0,True,1.0
3,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:51,0,100,1.0,True,1.0
4,0,OPER S/LINHA,08047,771305,07/10/15 16:47:16,0,100,1.0,True,1.0


In [58]:
test_df=test_df.drop(['CHECKVEICULO'], axis=1)
test_df.head(5)

Unnamed: 0,CODLINHA,NOMELINHA,CODVEICULO,NUMEROCARTAO,DATAUTILIZACAO,null-values,completeness,DICTIONARY,CHECKVEHICLE
0,280,N. SRA.DE NAZARÉ,BC911,1430250,07/10/15 07:37:02,0,100,1.0,1.0
1,280,N. SRA.DE NAZARÉ,BC911,2470195,07/10/15 07:51:25,0,100,1.0,1.0
2,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:49,0,100,1.0,1.0
3,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:51,0,100,1.0,1.0
4,0,OPER S/LINHA,08047,771305,07/10/15 16:47:16,0,100,1.0,1.0


In [59]:
#test_df["CHECKVEHICLE"].astype('int')
#test_df.head(5)

## Check numerocartao

In [60]:
for i,row in test_df.iterrows():
    #row["NUMEROCARTAO"].dtype == np.int64 
    if isinstance(row["NUMEROCARTAO"], int):
        test_df.at[i, 'CHECKCARTAO'] = 1
    else:
        test_df.at[i, 'CHECKCARTAO'] = 0
        
test_df.head(5)

Unnamed: 0,CODLINHA,NOMELINHA,CODVEICULO,NUMEROCARTAO,DATAUTILIZACAO,null-values,completeness,DICTIONARY,CHECKVEHICLE,CHECKCARTAO
0,280,N. SRA.DE NAZARÉ,BC911,1430250,07/10/15 07:37:02,0,100,1.0,1.0,1.0
1,280,N. SRA.DE NAZARÉ,BC911,2470195,07/10/15 07:51:25,0,100,1.0,1.0,1.0
2,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:49,0,100,1.0,1.0,1.0
3,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:51,0,100,1.0,1.0,1.0
4,0,OPER S/LINHA,08047,771305,07/10/15 16:47:16,0,100,1.0,1.0,1.0


## Check Datetime & Conformity

In [61]:
# senza UTC per togliere il fuso orario
#test_df["DATAUTILIZACAO"] = pd.to_datetime(test_df["DATAUTILIZACAO"], utc=True)
#test_df.head(5)

In [102]:
import re

def is_datetime(str_datetime):
    return re.search("^\d\d\d\d-(0?[1-9]|1[0-2])-(0?[1-9]|[12][0-9]|3[01]) (00|0?[0-9]|1[0-9]|2[0-3]):(0?[0-9]|[0-5][0-9]):(0?[0-9]|[0-5][0-9])\+00:00$", str_datetime)

def is_strdate(str_datetime): #conformity datetime
    return re.search("^(0?[1-9]|[12][0-9]|3[01])/(0?[1-9]|1[0-2])/\d\d (00|0?[0-9]|1[0-9]|2[0-3]):(0?[0-9]|[0-5][0-9]):(0?[0-9]|[0-5][0-9])$", str_datetime)
    
def is_sameday(str_datetime): #consistency datetime
    return re.search("^07/10/15 (00|0?[0-9]|1[0-9]|2[0-3]):(0?[0-9]|[0-5][0-9]):(0?[0-9]|[0-5][0-9])$", str_datetime)

def is_codlinha(str_datetime):
    return re.search("^(\d|[A-Z]){3}$", str_datetime)

def is_codveiculo(str_datetime):
    return re.search("^(\d\d|[A-Z][A-Z])(\d){3}$", str_datetime)

def is_codicecartao(str_datetime):
    return re.search("^(\d){1,10}$", str_datetime)
    
    
#check = is_datetime('2015-07-10 16:47:16+00:00')
#check = is_strdate('07/10/15 18:49:51')
#check = is_sameday('07/10/15 18:49:51')
#check = is_codlinha('BC8')
#check = is_codveiculo('AS602')
check = is_codicecartao('1430250')
check

<re.Match object; span=(0, 7), match='1430250'>

In [73]:
if check:
    a = 5
else:
    a = 3
    
a

5

## Check conformity datetime

In [64]:
for i,row in test_df.iterrows():
    #row["NUMEROCARTAO"].dtype == np.int64 
    check = is_strdate(row["DATAUTILIZACAO"])
    if check:
        test_df.at[i, 'CHECKDATE'] = 1
    else:
        test_df.at[i, 'CHECKDATE'] = 0
        
test_df.head(5)

Unnamed: 0,CODLINHA,NOMELINHA,CODVEICULO,NUMEROCARTAO,DATAUTILIZACAO,null-values,completeness,DICTIONARY,CHECKVEHICLE,CHECKCARTAO,CHECKDATE
0,280,N. SRA.DE NAZARÉ,BC911,1430250,07/10/15 07:37:02,0,100,1.0,1.0,1.0,1.0
1,280,N. SRA.DE NAZARÉ,BC911,2470195,07/10/15 07:51:25,0,100,1.0,1.0,1.0,1.0
2,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:49,0,100,1.0,1.0,1.0,1.0
3,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:51,0,100,1.0,1.0,1.0,1.0
4,0,OPER S/LINHA,08047,771305,07/10/15 16:47:16,0,100,1.0,1.0,1.0,1.0


## Consistency

In [65]:
for i,row in test_df.iterrows():
    test_df.at[i, 'TOT'] = row["DICTIONARY"] + row["CHECKVEHICLE"] + row["CHECKCARTAO"] + row["CHECKDATE"]
    
test_df.head(5)

Unnamed: 0,CODLINHA,NOMELINHA,CODVEICULO,NUMEROCARTAO,DATAUTILIZACAO,null-values,completeness,DICTIONARY,CHECKVEHICLE,CHECKCARTAO,CHECKDATE,TOT
0,280,N. SRA.DE NAZARÉ,BC911,1430250,07/10/15 07:37:02,0,100,1.0,1.0,1.0,1.0,4.0
1,280,N. SRA.DE NAZARÉ,BC911,2470195,07/10/15 07:51:25,0,100,1.0,1.0,1.0,1.0,4.0
2,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:49,0,100,1.0,1.0,1.0,1.0,4.0
3,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:51,0,100,1.0,1.0,1.0,1.0,4.0
4,0,OPER S/LINHA,08047,771305,07/10/15 16:47:16,0,100,1.0,1.0,1.0,1.0,4.0


In [67]:
conditions = [
    test_df['TOT'] == 4.0,
    test_df['TOT'] == 3.0,
    test_df['TOT'] == 2.0,
    test_df['TOT'] == 1.0]
choices = [100, 75, 50, 25]
test_df['consistency'] = np.select(conditions, choices, default=0)

test_df.head(5)

Unnamed: 0,CODLINHA,NOMELINHA,CODVEICULO,NUMEROCARTAO,DATAUTILIZACAO,null-values,completeness,DICTIONARY,CHECKVEHICLE,CHECKCARTAO,CHECKDATE,TOT,consistency
0,280,N. SRA.DE NAZARÉ,BC911,1430250,07/10/15 07:37:02,0,100,1.0,1.0,1.0,1.0,4.0,100
1,280,N. SRA.DE NAZARÉ,BC911,2470195,07/10/15 07:51:25,0,100,1.0,1.0,1.0,1.0,4.0,100
2,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:49,0,100,1.0,1.0,1.0,1.0,4.0,100
3,280,N. SRA.DE NAZARÉ,BC911,3234514,07/10/15 18:49:51,0,100,1.0,1.0,1.0,1.0,4.0,100
4,0,OPER S/LINHA,08047,771305,07/10/15 16:47:16,0,100,1.0,1.0,1.0,1.0,4.0,100


## Dirty completeness

In [126]:
df2=test_df.copy()

df2 = df2.mask(np.random.choice([True, False], size=df2.shape, p=[.9,.1]))
df2.head(10)

Unnamed: 0,CODLINHA,NOMELINHA,CODVEICULO,NUMEROCARTAO,DATAUTILIZACAO,null-values,completeness,DICTIONARY,CHECKVEHICLE,CHECKCARTAO,CHECKDATE,TOT,consistency
0,,N. SRA.DE NAZARÉ,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,0.0,,,,,,4.0,
3,,,,,,0.0,,,,,,,
4,,,,771305.0,,,,,,,,,
5,,,,,,,100.0,,,,,,
6,629.0,,,,,0.0,,,,,,,
7,,,,,,,,,,,,4.0,
8,,,,,,0.0,,1.0,,,,,
9,,ALTO BOQUEIRÃO,,,07/10/15 14:41:05,,,,,1.0,,,
