In [1]:
import pyspark as ps

spark = ps.sql.SparkSession.builder \
            .master("local[4]") \
            .appName("df lecture") \
            .getOrCreate()

In [2]:
# read patent CSV
df = spark.read.csv('data/patent_class_citation.csv',
                         header=True,       # use headers or not
#                         quote='"',         # char for quotes
                         sep=",",           # char for separation
                         inferSchema=True)

In [3]:
# prints the schema
df.printSchema()


# show the table in a oh-so-nice format
df.show(n=20)

root
 |-- _c0: integer (nullable = true)
 |-- Patent: integer (nullable = true)
 |-- primary_class: double (nullable = true)
 |-- primary_subclass: string (nullable = true)
 |-- Citation: string (nullable = true)
 |-- GDate: timestamp (nullable = true)
 |-- GYear: integer (nullable = true)

+---+-------+-------------+----------------+--------+--------------------+-----+
|_c0| Patent|primary_class|primary_subclass|Citation|               GDate|GYear|
+---+-------+-------------+----------------+--------+--------------------+-----+
|  0|3930270|        360.0|          130.24| 3778560|1975-12-30 00:00:...| 1975|
|  1|3930270|        360.0|          130.24| 3840895|1975-12-30 00:00:...| 1975|
|  2|3930269|        360.0|            96.3| 3448940|1975-12-30 00:00:...| 1975|
|  3|3930269|        360.0|            96.3| 3488058|1975-12-30 00:00:...| 1975|
|  4|3930269|        360.0|            96.3| 3533633|1975-12-30 00:00:...| 1975|
|  5|3930269|        360.0|            96.3| 3610553|1975-12

In [4]:
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col


In [5]:
df_grouped_class = df.groupBy(col("primary_class")).agg(F.count(col("Citation")))
df_grouped_class.show(n=403)

+-------------+---------------+
|primary_class|count(Citation)|
+-------------+---------------+
|        558.0|          18206|
|        305.0|           5953|
|        299.0|          14751|
|        184.0|          10627|
|        147.0|             28|
|        720.0|          10500|
|        169.0|           9428|
|        160.0|          30287|
|        379.0|         124857|
|          8.0|          39609|
|         70.0|          52540|
|        702.0|          82522|
|        168.0|           1364|
|        524.0|         158417|
|        206.0|         144533|
|         69.0|            592|
|        365.0|         122859|
|          7.0|           4534|
|        249.0|          13254|
|        401.0|          23325|
|        142.0|            357|
|        191.0|           3027|
|        329.0|           3166|
|        112.0|          29745|
|        451.0|          79148|
|        708.0|          38192|
|        521.0|          48740|
|        232.0|           5310|
|       

In [6]:
#figure out which class has most patents 
largest_class = df_grouped_class.agg({"count(Citation)": "max"}).collect()[0]
largest_class["max(count(Citation))"]
df_grouped_class[df_grouped_class['count(Citation)'] == largest_class["max(count(Citation))"]].collect()

[Row(primary_class=428.0, count(Citation)=422076)]

In [7]:
citations = df[['Patent', 'Citation']]
citations.show(10)

+-------+--------+
| Patent|Citation|
+-------+--------+
|3930270| 3778560|
|3930270| 3840895|
|3930269| 3448940|
|3930269| 3488058|
|3930269| 3533633|
|3930269| 3610553|
|3930269| 3620479|
|3930269| 3664672|
|3930269| 3667701|
|3930269| 3670992|
+-------+--------+
only showing top 10 rows



In [10]:
citations.write.csv('citations_clean.csv')

In [None]:
citations.repartition(1).write.csv('citations_clean_single.csv')

In [2]:
import pandas as pd

In [3]:
citations = pd.read_csv('citations_clean_single.csv/part-00000-d2192292-c6b6-4fc1-a573-e26f33477eba.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [16]:
citations.columns = ["FromNodeId", "ToNodeId"]

In [17]:
citations.head()

Unnamed: 0,FromNodeId,ToNodeId
0,3930270,3840895
1,3930269,3448940
2,3930269,3488058
3,3930269,3533633
4,3930269,3610553


In [18]:
citations.head('~/citations.txt', sep='\t', index=False)

TypeError: head() got an unexpected keyword argument 'sep'

In [21]:
citations.to_csv('~/citations.txt', sep='\t', index=False, header=False)

In [15]:
!head /Users/zoerichards/citations.txt

	FromNodeId	ToNodeId
0	3930270	3840895
1	3930269	3448940
2	3930269	3488058
3	3930269	3533633
4	3930269	3610553
5	3930269	3620479
6	3930269	3664672
7	3930269	3667701
8	3930269	3670992


In [22]:
!head /Users/zoerichards/citations.txt

3930270	3840895
3930269	3448940
3930269	3488058
3930269	3533633
3930269	3610553
3930269	3620479
3930269	3664672
3930269	3667701
3930269	3670992
3930269	3695549


In [23]:
citations.head()

Unnamed: 0,FromNodeId,ToNodeId
0,3930270,3840895
1,3930269,3448940
2,3930269,3488058
3,3930269,3533633
4,3930269,3610553


In [27]:
import re
def is_utility(patent_num):
    '''Returns 1 if patent_num string is formatted
    consistent with reissue, plant, design patent or X, H or T doc type'''
    if type(patent_num) != str:
        return 1
    return re.findall('RE|PP|D|X|H|T', patent_num) == []

In [33]:
citations['Utility'] = citations['ToNodeId'].apply(is_utility)
citations_utility = citations[citations.Utility == 1]

In [38]:
citations_utility.head()

Unnamed: 0,FromNodeId,ToNodeId,Utility
0,3930270,3840895,1
1,3930269,3448940,1
2,3930269,3488058,1
3,3930269,3533633,1
4,3930269,3610553,1


In [39]:
citations.shape, citations_utility.shape

((21002157, 3), (21001508, 3))

In [40]:
citations_utility = citations_utility[['FromNodeId', 'ToNodeId']]

In [41]:
citations_utility.to_csv('~/citations.txt', sep='\t', index=False, header=False)

In [1]:
import pandas as pd

In [2]:
citations = pd.read_csv('~/citations.txt', sep='\t')

In [6]:
citations = citations[['3930269','3448940']]

In [9]:
merged = pd.read_csv('data/patent_class_citation.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [13]:
merged.GDate.min(), merged.GDate.max()

('1975-03-18', '2010-12-28')

In [28]:
for x in range(1974,2010):
    print x, merged[merged.GYear > x].shape[0]

 1974 21002158
1975 20729019
1976 20384989
1977 20055892
1978 19718669
1979 19462885
1980 19115000
1981 18728016
1982 18379384
1983 18031939
1984 17610130
1985 17142211
1986 16672198
1987 16098901
1988 15541861
1989 14824878
1990 14145615
1991 13410113
1992 12633452
1993 11819795
1994 10920670
1995 9970651
1996 8893500
1997 7777195
1998 7199583
1999 6004054
2000 5364993
2001 4672466
2002 3247420
2003 1676031
2004 45064
2005 45064
2006 45064
2007 45064
2008 45064
2009 45064


In [35]:
patents_2003 = merged[merged.GYear == 2003]

In [39]:
for c in set(merged.primary_class):
    print c, merged[merged.primary_class == c].shape[0]

nan 0
1.0 38
2.0 69877
4.0 50355
5.0 65023
nan 0
7.0 4534
8.0 39609
12.0 3305
14.0 7036
15.0 76765
16.0 27225
19.0 10532
23.0 2502
24.0 52111
26.0 4121
27.0 2915
28.0 10696
29.0 200859
30.0 44232
33.0 58082
34.0 36474
36.0 42564
37.0 24571
38.0 5249
40.0 43543
42.0 24410
43.0 53253
44.0 25647
47.0 37802
48.0 10435
49.0 32969
51.0 16813
52.0 180691
53.0 93273
54.0 2681
55.0 29486
56.0 41074
57.0 29813
59.0 3268
60.0 122568
62.0 127436
63.0 5678
65.0 42551
66.0 14662
68.0 11850
69.0 592
70.0 52540
71.0 6991
72.0 64961
73.0 238825
74.0 95039
75.0 38833
76.0 5099
79.0 16
81.0 32653
82.0 13433
83.0 54897
84.0 45894
86.0 3240
87.0 2169
89.0 22844
91.0 22544
92.0 20528
95.0 51715
96.0 37640
99.0 56624
100.0 22479
101.0 68149
102.0 39209
104.0 17362
105.0 17146
106.0 77897
108.0 28159
109.0 3823
110.0 25048
111.0 12034
112.0 29745
114.0 44834
116.0 14876
117.0 19682
118.0 60009
119.0 52051
122.0 16338
123.0 224660
124.0 19027
125.0 5846
126.0 56291
127.0 6094
128.0 111242
131.0 25747
132.0 253

In [40]:
cls_715 = merged[merged.primary_class == 715.0]

In [42]:
cls_715 = cls_715[['Patent', 'primary_subclass', 'Citation', 'GDate']]

In [None]:
cls_715.to_

In [44]:
len(cls_715.primary_subclass.unique())

226

In [8]:
citations.columns = [['Patent', 'Citation']]

In [22]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [23]:
re.findall('\-', '53-068124')

['-']

In [31]:
import re
def int_type(patent_num):
    '''Returns 1 if patent_num string is formatted
    as a non-int'''
    if type(patent_num) != str:
        return 1
    else:
        return 0

In [4]:
citations_clean = citations[citations['Int_type'] == 1]

KeyError: 'Int_type'

In [33]:
citations_clean.head()

Unnamed: 0,Patent,Citation,Hyphen,Int_type
0,3930269,3448940,1,1
1,3930269,3488058,1,1
2,3930269,3533633,1,1
3,3930269,3610553,1,1
4,3930269,3620479,1,1


In [34]:
citations.txt 

((21001507, 4), (20709376, 4))

In [None]:
citations_utility.to_csv('~/citations.txt', sep='\t', index=False, header=False)

**TO DO**
- Get cluster coefficient for different points in time, plot
- Do visualization for one class
- Get visualization of entire patent space for patents that have pagerank some minimum outdegree

In [1]:
import pandas as pd

In [2]:
cls_715 = pd.read_csv('data/cls_715.csv')

In [5]:
cls_715 = cls_715[['Patent','Citation']]

In [7]:
cls_715.to_csv('data/cls_715.csv')