In [5]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql import functions as F

sc = SparkContext.getOrCreate()
spark = SQLContext(sc)

customSchema = StructType([ \
    StructField("K", IntegerType(), True), \
    StructField("V", StringType(), True), \
    StructField("C", StringType(), True)])

# post_df = spark.read.csv("test.csv", inferSchema=True, header=True)
post_df = spark.read.csv("test.csv", schema=customSchema, header=True)
# post_df = post_df.withColumn("name", F.col("path_splitted").getItem(F.size("path_splitted")-1))
# post_df = post_df.withColumn("name1", os.path.basename(input_file_name()))
post_df = post_df.withColumn("input_file_name", F.input_file_name())
# post_df = post_df.withColumn("path_splitted", F.split("path","/"))
# post_df = post_df.withColumn("input_file_name", F.input_file_name().split("path","/"))


spark.udf.register("get_only_file_name", lambda full_path: full_path.split("/")[-1])
post_df = post_df.withColumn("basename", get_only_file_name(input_file_name()))



# df = spark.load(source="com.databricks.spark.csv", header='true', schema=customSchema, path='test.csv')


In [23]:
# post_df.show(2, truncate=True)
post_df.show(2)

+-----------------+---------+---------+------------+----------------+----------+----------+--------+--------------------+
|        post_text|post_like|post_view|post_comment|         post_id| timestamp|post_share|username|                 tag|
+-----------------+---------+---------+------------+----------------+----------+----------+--------+--------------------+
|this is test text|     1074|      326|           0|2716120005077509|1560837600|         0|   K14vn|cat person hat ha...|
|this is test text|       30|      408|           1|2716118345077675|1560836706|         0|   K14vn|person table comp...|
+-----------------+---------+---------+------------+----------------+----------+----------+--------+--------------------+
only showing top 2 rows



In [8]:
post_df.printSchema()

root
 |-- post_text: string (nullable = true)
 |-- post_like: integer (nullable = true)
 |-- post_view: integer (nullable = true)
 |-- post_comment: integer (nullable = true)
 |-- post_id: long (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- post_share: integer (nullable = true)
 |-- username: string (nullable = true)
 |-- tag: string (nullable = true)



In [9]:
post_df.columns

['post_text',
 'post_like',
 'post_view',
 'post_comment',
 'post_id',
 'timestamp',
 'post_share',
 'username',
 'tag']

In [10]:
len(post_df.columns)

9

In [11]:
post_df.count()

460

In [12]:
post_df.describe('tag').show()

+-------+--------------------+
|summary|                 tag|
+-------+--------------------+
|  count|                 433|
|   mean|                null|
| stddev|                null|
|    min|              animal|
|    max|tree tree person ...|
+-------+--------------------+



In [13]:
post_df.describe('post_like').show()

+-------+------------------+
|summary|         post_like|
+-------+------------------+
|  count|               460|
|   mean|1616.0934782608695|
| stddev|  4818.84173315485|
|    min|                 9|
|    max|             42187|
+-------+------------------+



In [14]:
post_df.describe('username').show()

+-------+--------+
|summary|username|
+-------+--------+
|  count|     460|
|   mean|    null|
| stddev|    null|
|    min|   K14vn|
|    max|   K14vn|
+-------+--------+



In [15]:
post_df.select('tag', 'post_like').show(5)

+--------------------+---------+
|                 tag|post_like|
+--------------------+---------+
|cat person hat ha...|     1074|
|person table comp...|       30|
|               house|     2087|
|table music river...|     3005|
|    cat person river|    17917|
+--------------------+---------+
only showing top 5 rows



In [16]:
post_df.select('tag', 'post_like').distinct().show()

+--------------------+---------+
|                 tag|post_like|
+--------------------+---------+
|tree paper car ta...|       16|
|                tree|      657|
|car music car tre...|     1658|
|person cat moto r...|      830|
|river furniture c...|       34|
|tree car hat tree...|      703|
|furniture house a...|     1360|
|dog animal car mo...|     2668|
|table furniture c...|      206|
|      music hat moto|      613|
|music person comp...|      127|
|music paper hat m...|      179|
|person tree river...|     2514|
|river car car cat...|      167|
|computer tree com...|      291|
|furniture hat pap...|      238|
|dog moto river mu...|       48|
|hat computer musi...|       27|
|music table furni...|     3632|
|car moto cat tree...|    14536|
+--------------------+---------+
only showing top 20 rows



In [17]:
post_df.select('tag').distinct().count()

408

In [18]:
post_df.crosstab('timestamp', 'post_like').show()

+-------------------+---+---+---+----+----+----+---+----+---+----+----+---+---+---+----+----+---+----+---+----+----+----+---+---+----+---+----+---+----+---+-----+----+----+---+----+-----+---+----+----+---+---+----+----+----+----+----+----+---+---+----+----+-----+----+---+----+----+---+---+---+---+----+----+---+----+---+----+----+---+----+---+-----+---+---+---+---+---+----+---+---+-----+----+---+----+---+----+----+----+---+-----+---+---+----+---+---+----+---+---+---+----+---+----+---+---+----+---+----+---+---+---+---+---+---+---+----+---+---+---+----+----+---+----+---+----+---+----+----+----+---+----+---+-----+---+---+---+-----+---+---+----+---+---+---+---+----+----+---+---+---+---+----+---+---+----+---+----+----+----+---+----+---+----+---+---+----+----+---+---+---+----+----+---+---+---+---+----+---+----+---+----+----+---+---+---+---+-----+---+---+----+---+---+---+----+----+----+---+---+---+---+---+---+-----+---+---+---+----+---+---+---+---+---+----+----+---+---+---+---+---+----+---+---

In [19]:
post_df.describe('username').show()

+-------+--------+
|summary|username|
+-------+--------+
|  count|     460|
|   mean|    null|
| stddev|    null|
|    min|   K14vn|
|    max|   K14vn|
+-------+--------+



In [20]:
post_df.filter(post_df.post_like > 150).count()

360