### Text Processing for Author Recognition using Spark

#### Import statements

In [44]:
import pyspark as ps    # import the spark suite
import warnings         # display warning if spark context already exists
import os
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType, FloatType

#### Initialized Spark Context

In [3]:
try:
    sc = ps.SparkContext('local[4]') # create spark context to work locally on all available cpus
    print('created SparkContext')
except ValueError:
    warnings.warn('SparkContext already exists')    # issue a warning if context already exists

created SparkContext


### Read data.json into Spark SQL context

In [4]:
spark = ps.SQLContext(sc)
print('created SQLContext')

created SQLContext


In [5]:
data_file = 'data/data.json'
df = spark.read.json(data_file)

CONSIDER:

for fun use RDD and map reduce to remove the double bars I put into the excerpts!

In [6]:
print df.printSchema()
print df.count()
df.show(3)

root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- title: string (nullable = true)

None
9316
+--------------+--------------------+---------------+
|        author|             excerpt|          title|
+--------------+--------------------+---------------+
|CharlesDickens|﻿The Project Gute...|AChristmasCarol|
|CharlesDickens|Dickens gave his ...|AChristmasCarol|
|CharlesDickens|Dickens's greates...|AChristmasCarol|
+--------------+--------------------+---------------+
only showing top 3 rows



### Here we import some sql funtions and give our dataframe a SQL table name

In [7]:
from pyspark.sql.functions import length
from pyspark.sql.functions import count

df.createOrReplaceTempView("excerpts")

In [8]:
sqlDF = spark.sql("SELECT count(*) FROM excerpts WHERE author = 'MarkTwain'")
sqlDF.show()

+--------+
|count(1)|
+--------+
|    2349|
+--------+



### Lambda functions are created to explore the the character count, word count, and average word length

In [71]:
def char_count(text):
    return len(text)

def word_count(text):
    return len(text.split())

def avg_word_length(text):
    return sum([len(t) for t in text.split()]) / float(len(text.split()))

def sentence_count(text):
    return len(text.split('.'))

def sentence_length(text):
    return sum([len(t.split()) for t in text.split('.')]) / float(len(text.split('.'))) 

charcount_udf = udf(lambda x : char_count(x))
wordcount_udf = udf(lambda x: word_count(x))
avgwordlen_udf = udf(lambda x: avg_word_length(x))
sentencecount_udf = udf(lambda x: sentence_count(x))
sentencelength_udf = udf(lambda x: sentence_length(x))

df2 = df.withColumn("character_count", charcount_udf(df.excerpt).cast(FloatType())) \
        .withColumn("word_count", wordcount_udf(df.excerpt).cast(FloatType())) \
        .withColumn("avg_wordlen", avgwordlen_udf(df.excerpt).cast(FloatType())) \
        .withColumn("sent_count", sentencecount_udf(df.excerpt).cast(FloatType())) \
        .withColumn("sent_length", sentencelength_udf(df.excerpt).cast(FloatType()))

df2.printSchema()
df2.show(10)

root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- title: string (nullable = true)
 |-- character_count: float (nullable = true)
 |-- word_count: float (nullable = true)
 |-- avg_wordlen: float (nullable = true)
 |-- sent_count: float (nullable = true)
 |-- sent_length: float (nullable = true)

+--------------+--------------------+---------------+---------------+----------+-----------+----------+-----------+
|        author|             excerpt|          title|character_count|word_count|avg_wordlen|sent_count|sent_length|
+--------------+--------------------+---------------+---------------+----------+-----------+----------+-----------+
|CharlesDickens|﻿The Project Gute...|AChristmasCarol|         1327.0|     221.0|  4.9864254|       9.0|   25.11111|
|CharlesDickens|Dickens gave his ...|AChristmasCarol|         2079.0|     356.0|  4.8426967|      14.0|  25.571428|
|CharlesDickens|Dickens's greates...|AChristmasCarol|         1341.0|     232.0|  4.7801

In [72]:
df2.createOrReplaceTempView("excerpts")
sqlDF = spark.sql("SELECT author, AVG(avg_wordlen), AVG(word_count), AVG(sent_length) FROM excerpts GROUP BY author")
sqlDF.show()

+--------------+-----------------+------------------+------------------+
|        author| avg(avg_wordlen)|   avg(word_count)|  avg(sent_length)|
+--------------+-----------------+------------------+------------------+
|     MarkTwain|4.474236786238636| 282.8884631758195| 24.85891974672656|
|      JohnMuir|4.701099736578512| 318.5234899328859|26.460818235772834|
|    JaneAusten|4.569085557100668| 271.3512075317233|  21.8411577078733|
|CharlesDickens|4.439227707129865|246.16694214876034| 21.05933634269336|
+--------------+-----------------+------------------+------------------+

