In [2]:
import pyspark as ps

# Obtain the notebook's SparkSession: local master with 4 threads, registered
# under the application name "df lecture". getOrCreate() hands back an already
# running session if there is one instead of building a second.
spark = (
    ps.sql.SparkSession.builder
    .master("local[4]")
    .appName("df lecture")
    .getOrCreate()
)

In [3]:
# Load the patent/citation CSV into a Spark DataFrame.
#   header=True       -> the first line supplies the column names
#   sep=","           -> fields are comma separated
#   inferSchema=True  -> let Spark infer each column's type from the data
df = (
    spark.read
    .options(header=True, sep=",", inferSchema=True)
    .csv('data/patent_class_citation.csv')
)

In [5]:
# Print the inferred column names and types as a tree.
df.printSchema()

# Render the first 20 rows as a formatted text table.
df.show(20)

root
 |-- _c0: integer (nullable = true)
 |-- Patent: integer (nullable = true)
 |-- primary_class: double (nullable = true)
 |-- primary_subclass: string (nullable = true)
 |-- Citation: string (nullable = true)
 |-- GDate: timestamp (nullable = true)
 |-- GYear: integer (nullable = true)

+---+-------+-------------+----------------+--------+--------------------+-----+
|_c0| Patent|primary_class|primary_subclass|Citation|               GDate|GYear|
+---+-------+-------------+----------------+--------+--------------------+-----+
|  0|3930270|        360.0|          130.24| 3778560|1975-12-30 00:00:...| 1975|
|  1|3930270|        360.0|          130.24| 3840895|1975-12-30 00:00:...| 1975|
|  2|3930269|        360.0|            96.3| 3448940|1975-12-30 00:00:...| 1975|
|  3|3930269|        360.0|            96.3| 3488058|1975-12-30 00:00:...| 1975|
|  4|3930269|        360.0|            96.3| 3533633|1975-12-30 00:00:...| 1975|
|  5|3930269|        360.0|            96.3| 3610553|1975-12

In [11]:
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col


In [25]:
# Count the non-null Citation values within each primary_class. The aggregate
# column keeps Spark's default name, "count(Citation)".
citation_count = F.count("Citation")
df_grouped_class = df.groupBy("primary_class").agg(citation_count)

# Print up to 403 rows of the per-class counts.
df_grouped_class.show(403)

+-------------+---------------+
|primary_class|count(Citation)|
+-------------+---------------+
|        558.0|          18206|
|        305.0|           5953|
|        299.0|          14751|
|        184.0|          10627|
|        147.0|             28|
|        720.0|          10500|
|        169.0|           9428|
|        160.0|          30287|
|        379.0|         124857|
|          8.0|          39609|
|         70.0|          52540|
|        702.0|          82522|
|        168.0|           1364|
|        524.0|         158417|
|        206.0|         144533|
|         69.0|            592|
|        365.0|         122859|
|          7.0|           4534|
|        249.0|          13254|
|        401.0|          23325|
|        142.0|            357|
|        191.0|           3027|
|        329.0|           3166|
|        112.0|          29745|
|        451.0|          79148|
|        708.0|          38192|
|        521.0|          48740|
|        232.0|           5310|
|       

In [29]:
# Find the primary_class with the largest citation count.
# NOTE: "count(Citation)" tallies rows with a non-null Citation per class, and
# the preview rows repeat the same Patent across several rows, so this ranks
# classes by number of citation rows, not by number of distinct patents.
largest_class = df_grouped_class.agg({"count(Citation)": "max"}).collect()[0]

# Bind the maximum to a name once; previously it was evaluated as a bare
# mid-cell expression (which displays nothing) and then looked up again.
max_citation_count = largest_class["max(count(Citation))"]

# Keep every class whose count equals the maximum (ties included). The list of
# matching Rows is the cell's displayed result.
df_grouped_class[df_grouped_class['count(Citation)'] == max_citation_count].collect()

422076

In [34]:
# Project the frame down to the two columns that pair each Patent with a
# Citation value, then preview the first 10 rows.
citations = df.select('Patent', 'Citation')
citations.show(n=10)

+-------+--------+
| Patent|Citation|
+-------+--------+
|3930270| 3778560|
|3930270| 3840895|
|3930269| 3448940|
|3930269| 3488058|
|3930269| 3533633|
|3930269| 3610553|
|3930269| 3620479|
|3930269| 3664672|
|3930269| 3667701|
|3930269| 3670992|
+-------+--------+
only showing top 10 rows



In [2]:
# NOTE(review): the recorded run of this cell failed with
# "ImportError: No module named pygraphframes", i.e. graphframes was not
# importable in that kernel. Presumably the graphframes package has to be made
# available to the Spark driver's Python environment first — TODO confirm setup.
# The wildcard import also hides which names later cells rely on.
from graphframes import *

ImportError: No module named pygraphframes