In [94]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [95]:
# Create the SparkSession used by every cell below, or reuse one that is
# already running in this process.
# NOTE(review): the application name is 'loan_prediction' although the data
# loaded afterwards comes from nyt2.json - confirm whether it should be renamed.
spark = (
    SparkSession.builder
    .appName('loan_prediction')
    .getOrCreate()
)

In [96]:
# Read the JSON file into a DataFrame; Spark infers the schema from the data.
# NOTE(review): the path is absolute (presumably a Colab runtime location) -
# verify it exists wherever this notebook is re-run.
df = spark.read.format("json").load("/content/nyt2.json")

In [97]:
# Print the first five rows to eyeball the columns and their nesting.
df.show(n=5)

+--------------------+--------------------+-----------------+-----------------+--------------------+-------------+-----------------+-------------+----+--------------+--------------------+-------------+
|                 _id|  amazon_product_url|           author| bestsellers_date|         description|        price|   published_date|    publisher|rank|rank_last_week|               title|weeks_on_list|
+--------------------+--------------------+-----------------+-----------------+--------------------+-------------+-----------------+-------------+----+--------------+--------------------+-------------+
|{5b4aa4ead3089013...|http://www.amazon...|    Dean R Koontz|{{1211587200000}}|Odd Thomas, who c...|   {NULL, 27}|{{1212883200000}}|       Bantam| {1}|           {0}|           ODD HOURS|          {1}|
|{5b4aa4ead3089013...|http://www.amazon...|  Stephenie Meyer|{{1211587200000}}|Aliens have taken...|{25.99, NULL}|{{1212883200000}}|Little, Brown| {2}|           {1}|            THE HOST|     

In [98]:
# Print count / mean / stddev / min / max per column. Only the plain string
# columns appear in the result; the struct-typed columns are left out.
(
    df
    .describe()
    .show()
)

+-------+--------------------+---------------+--------------------+---------+------------------+
|summary|  amazon_product_url|         author|         description|publisher|             title|
+-------+--------------------+---------------+--------------------+---------+------------------+
|  count|               10195|          10195|               10195|    10195|             10195|
|   mean|                NULL|           NULL|                NULL|     NULL|1877.7142857142858|
| stddev|                NULL|           NULL|                NULL|     NULL| 370.9760613506458|
|    min|http://www.amazon...|        AJ Finn|                    |      ACE|  10TH ANNIVERSARY|
|    max|https://www.amazo...|various authors|’Tis for the Rebe...|allantine|               ZOO|
+-------+--------------------+---------------+--------------------+---------+------------------+



In [99]:
# Project a single column and print its first ten values.
df.select(F.col("author")).show(n=10)

+--------------------+
|              author|
+--------------------+
|       Dean R Koontz|
|     Stephenie Meyer|
|        Emily Giffin|
|   Patricia Cornwell|
|     Chuck Palahniuk|
|James Patterson a...|
|       John Sandford|
|       Jimmy Buffett|
|    Elizabeth George|
|      David Baldacci|
+--------------------+
only showing top 10 rows



In [100]:
# Project several columns at once (select also accepts a list of names)
# and print the first ten rows.
df.select(["author", "title", "price", "rank"]).show(n=10)

+--------------------+--------------------+-------------+----+
|              author|               title|        price|rank|
+--------------------+--------------------+-------------+----+
|       Dean R Koontz|           ODD HOURS|   {NULL, 27}| {1}|
|     Stephenie Meyer|            THE HOST|{25.99, NULL}| {2}|
|        Emily Giffin|LOVE THE ONE YOU'...|{24.95, NULL}| {3}|
|   Patricia Cornwell|           THE FRONT|{22.95, NULL}| {4}|
|     Chuck Palahniuk|               SNUFF|{24.95, NULL}| {5}|
|James Patterson a...|SUNDAYS AT TIFFANY’S|{24.99, NULL}| {6}|
|       John Sandford|        PHANTOM PREY|{26.95, NULL}| {7}|
|       Jimmy Buffett|          SWINE NOT?|{21.99, NULL}| {8}|
|    Elizabeth George|     CARELESS IN RED|{27.95, NULL}| {9}|
|      David Baldacci|     THE WHOLE TRUTH|{26.99, NULL}|{10}|
+--------------------+--------------------+-------------+----+
only showing top 10 rows



In [101]:
# Add an integer flag column: 1 where the title is exactly "ODD HOURS",
# 0 for every other row.
df = df.withColumn(
    "is_odd_hours",
    F.when(df["title"] == "ODD HOURS", 1).otherwise(0),
)

# Show the new flag next to the title it was derived from.
df.select(["title", "is_odd_hours"]).show(n=10)

+--------------------+------------+
|               title|is_odd_hours|
+--------------------+------------+
|           ODD HOURS|           1|
|            THE HOST|           0|
|LOVE THE ONE YOU'...|           0|
|           THE FRONT|           0|
|               SNUFF|           0|
|SUNDAYS AT TIFFANY’S|           0|
|        PHANTOM PREY|           0|
|          SWINE NOT?|           0|
|     CARELESS IN RED|           0|
|     THE WHOLE TRUTH|           0|
+--------------------+------------+
only showing top 10 rows



In [102]:
# Build (but do not execute) a filter keeping rows whose author equals one of
# the listed values. Indexing a DataFrame with a boolean Column is the same as
# calling filter(); no action is triggered here, so the notebook only displays
# the resulting DataFrame's schema.
# NOTE(review): isin() tests exact equality, so "John " and "em " (with their
# trailing spaces) only match authors spelled exactly that way. If a substring
# match was intended, verify and use contains()/like() instead.
df.filter(df["author"].isin("John ", "em "))

DataFrame[_id: struct<$oid:string>, amazon_product_url: string, author: string, bestsellers_date: struct<$date:struct<$numberLong:string>>, description: string, price: struct<$numberDouble:string,$numberInt:string>, published_date: struct<$date:struct<$numberLong:string>>, publisher: string, rank: struct<$numberInt:string>, rank_last_week: struct<$numberInt:string>, title: string, weeks_on_list: struct<$numberInt:string>, is_odd_hours: int]

In [103]:
# Keep only rows written by one of the two named authors and print the
# author column of the first 20 matches.
(
    df
    .where(F.col("author").isin(['Emily Giffin', 'John Sandford']))
    .select("author")
    .show()
)

+-------------+
|       author|
+-------------+
| Emily Giffin|
|John Sandford|
| Emily Giffin|
|John Sandford|
| Emily Giffin|
|John Sandford|
| Emily Giffin|
| Emily Giffin|
|John Sandford|
| Emily Giffin|
| Emily Giffin|
| Emily Giffin|
| Emily Giffin|
| Emily Giffin|
| Emily Giffin|
| Emily Giffin|
| Emily Giffin|
| Emily Giffin|
|John Sandford|
|John Sandford|
+-------------+
only showing top 20 rows



In [104]:
# Same query as the where() cell, written with filter(): the two methods are
# aliases, so the printed rows are identical.
(
    df
    .filter(df["author"].isin('Emily Giffin', 'John Sandford'))
    .select("author")
    .show()
)

+-------------+
|       author|
+-------------+
| Emily Giffin|
|John Sandford|
| Emily Giffin|
|John Sandford|
| Emily Giffin|
|John Sandford|
| Emily Giffin|
| Emily Giffin|
|John Sandford|
| Emily Giffin|
| Emily Giffin|
| Emily Giffin|
| Emily Giffin|
| Emily Giffin|
| Emily Giffin|
| Emily Giffin|
| Emily Giffin|
| Emily Giffin|
|John Sandford|
|John Sandford|
+-------------+
only showing top 20 rows



In [105]:
# Rename the product-link column to the shorter "URL"; every other column
# keeps its name and position.
df = df.withColumnRenamed(existing="amazon_product_url", new="URL")

# Print the first 20 rows (the default) to confirm the new header.
df.show(n=20)

+--------------------+--------------------+--------------------+-----------------+--------------------+-------------+-----------------+--------------------+----+--------------+--------------------+-------------+------------+
|                 _id|                 URL|              author| bestsellers_date|         description|        price|   published_date|           publisher|rank|rank_last_week|               title|weeks_on_list|is_odd_hours|
+--------------------+--------------------+--------------------+-----------------+--------------------+-------------+-----------------+--------------------+----+--------------+--------------------+-------------+------------+
|{5b4aa4ead3089013...|http://www.amazon...|       Dean R Koontz|{{1211587200000}}|Odd Thomas, who c...|   {NULL, 27}|{{1212883200000}}|              Bantam| {1}|           {0}|           ODD HOURS|          {1}|           1|
|{5b4aa4ead3089013...|http://www.amazon...|     Stephenie Meyer|{{1211587200000}}|Aliens have taken.

In [106]:
# Remove the bestsellers_date column from the working frame. Re-running this
# cell is harmless: drop() ignores column names that are not present.
columns_to_drop = ["bestsellers_date"]
df = df.drop(*columns_to_drop)

In [107]:
# Count how many rows each author has.
#
# DataFrame.show() only prints and returns None, so its result must never be
# assigned: the original `df = ...show()` replaced `df` with None and made any
# later use of `df` fail. The aggregate is kept under its own name and the
# working frame `df` is left untouched.
author_counts = df.groupBy("author").count()
author_counts.show()

+--------------------+-----+
|              author|count|
+--------------------+-----+
|          James Frey|    2|
|    Elin Hilderbrand|   58|
|   Sharon Kay Penman|    2|
|         Lisa Genova|    7|
|        Will Allison|    1|
|   Patricia Cornwell|   64|
|       Laurie R King|    6|
|          Tea Obreht|    6|
|        Sarah Dunant|    1|
|        Tim Johnston|    1|
|          Sara Gruen|   13|
|Tom Clancy with P...|    9|
|         Andre Dubus|    1|
|        Terry Brooks|   19|
|        Daniel Silva|   69|
|Karen White, Beat...|    1|
|      Jackie Collins|   14|
|          Pat Conroy|   11|
|          Ann B Ross|    5|
|      Michael Savage|    3|
+--------------------+-----+
only showing top 20 rows

