In [0]:
// Location of the raw NYPD complaint CSV export.
val filePath = "NYPD_Complaint_Data_Historic.csv"

// Read the CSV with a header row and inferred column types. multiLine lets a
// quoted field span several physical lines; the escape character is the
// double quote itself.
val rawDF = spark.read
  .options(Map(
    "header" -> "true",
    "multiLine" -> "true",
    "inferSchema" -> "true",
    "escape" -> "\""
  ))
  .csv(filePath)

// Preview the freshly loaded data.
z.show(rawDF)

In [1]:
// The subset of complaint columns carried through the rest of the notebook.
val keptColumns = Seq(
  "cmplnt_num",
  "cmplnt_fr_dt", "cmplnt_fr_tm",
  "cmplnt_to_dt", "cmplnt_to_tm",
  "rpt_dt",
  "ofns_desc", "law_cat_cd", "boro_nm",
  "susp_age_group", "susp_race", "susp_sex",
  "latitude", "longitude",
  "vic_age_group", "vic_race", "vic_sex"
)

val baseDF = rawDF.select(keptColumns.head, keptColumns.tail: _*)

// Mark the projection as cached and run a count so it is materialised now.
baseDF.cache().count

In [2]:
// Preview the projected (and cached) complaint columns.
z.show(baseDF)

In [3]:
// Keep only rows that have no null value in any of the selected columns.
val baseDF1 = baseDF.na.drop()

In [4]:
// Preview the rows that survived the null drop.
z.show(baseDF1)

In [5]:
// Print the inferred column types before any explicit conversions are applied.
baseDF1.printSchema

In [6]:
// Number of rows left after dropping rows with nulls.
baseDF1.count()

In [7]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

// Parse the three MM/dd/yyyy text columns into proper dates.
// Each column is parsed from ITSELF: previously cmplnt_to_dt and rpt_dt were
// both derived from cmplnt_fr_dt, which silently overwrote the complaint end
// date and the report date with the complaint start date.
val baseDF2 = baseDF1
  .withColumn("cmplnt_fr_dt", to_date(col("cmplnt_fr_dt"), "MM/dd/yyyy"))
  .withColumn("cmplnt_to_dt", to_date(col("cmplnt_to_dt"), "MM/dd/yyyy"))
  .withColumn("rpt_dt", to_date(col("rpt_dt"), "MM/dd/yyyy"))



In [8]:
// Preview the data after the date columns were converted with to_date.
z.show(baseDF2)

In [9]:
// Print the schema of baseDF2 to check the types of the converted date columns.
baseDF2.printSchema

In [10]:
// Inclusive bounds of the reporting window, as yyyy-MM-dd text.
val startDate = "2020-01-01"
val endDate = "2022-12-31"

// Keep complaints whose rpt_dt lies inside [startDate, endDate].
val baseDF3 = baseDF2.where(col("rpt_dt").between(startDate, endDate))

In [11]:
// Preview the complaints that fall inside the rpt_dt window.
z.show(baseDF3)

In [12]:
// Number of complaints inside the rpt_dt window.
baseDF3.count()

In [13]:
// The only age-group labels treated as valid; any other value is replaced by null.
val allowedAges = Seq("<18", "18-24", "25-44", "45-64", "65+")

// Apply the same whitelist to the suspect and the victim age-group columns.
val baseDF3_1 = Seq("susp_age_group", "vic_age_group").foldLeft(baseDF3) { (df, ageColumn) =>
  df.withColumn(
    ageColumn,
    when(col(ageColumn).isin(allowedAges: _*), col(ageColumn)).otherwise(null)
  )
}

// Earlier alternative, kept for reference: drop every row containing a placeholder value.
//val baseDF4 = baseDF3.filter(
//  !baseDF3.columns.map(colName => col(colName).isin("(null)", "UNKNOWN")).reduce(_ || _)
//)

In [14]:
// Replace the placeholder strings "(null)" and "UNKNOWN" with real nulls in
// every column, using one projection that keeps the original column order and names.
val columns = baseDF3_1.columns
val replacementValues = Seq("(null)", "UNKNOWN")
val baseDF4 = baseDF3_1.select(
  columns.map { name =>
    when(col(name).isin(replacementValues: _*), null).otherwise(col(name)).as(name)
  }: _*
)

In [15]:
// Preview the data after placeholder strings were replaced by nulls.
z.show(baseDF4)

In [16]:
// Row count of baseDF4 (the placeholder replacement changes values, not rows).
baseDF4.count()

In [17]:
// Show the summary statistics produced by describe() for baseDF4.
z.show(baseDF4.describe())

In [18]:
import org.apache.spark.sql.types._

// Cast the coordinates to doubles, then normalise both time-of-day columns by
// parsing them as HH:mm:ss and formatting them back to HH:mm:ss text.
val baseDF5 = {
  val withCoordinates = Seq("latitude", "longitude").foldLeft(baseDF4) { (df, name) =>
    df.withColumn(name, col(name).cast(DoubleType))
  }
  Seq("cmplnt_fr_tm", "cmplnt_to_tm").foldLeft(withCoordinates) { (df, name) =>
    df.withColumn(name, date_format(to_timestamp(col(name), "HH:mm:ss"), "HH:mm:ss"))
  }
}


In [19]:
// Preview the data after the coordinate casts and time normalisation.
z.show(baseDF5)

In [20]:
// Print the schema of baseDF5 to check the column types after the casts.
baseDF5.printSchema

In [21]:
// List the distinct values of boro_nm.
z.show(baseDF5.select("boro_nm").distinct())

In [22]:
// List the distinct values of vic_age_group (after the whitelist was applied).
z.show(baseDF5.select("vic_age_group").distinct())

In [23]:
// List the distinct values of susp_race.
z.show(baseDF5.select("susp_race").distinct())

In [24]:
// List the distinct values of ofns_desc.
z.show(baseDF5.select("ofns_desc").distinct())

In [25]:
// Show any rows whose longitude is null after the cast to double.
z.show(baseDF5.filter(col("longitude").isNull))

In [26]:
// Zip-code reference table; it is read without schema inference, so every
// column comes back as a string.
val filePath = "zipcodes.csv"

val zipcodes = spark.read
  .options(Map(
    "header" -> "true",
    "multiLine" -> "true",
    "inferSchema" -> "false",
    "escape" -> "\""
  ))
  .csv(filePath)

// Preview the reference table.
z.show(zipcodes)

In [27]:
// Cast the LAT and LNG columns of the zip-code table from text to doubles.
val zipcodesDF = Seq("LAT", "LNG").foldLeft(zipcodes) { (df, name) =>
  df.withColumn(name, col(name).cast(DoubleType))
}

In [28]:
// Collect the ZipCode column of nyc-zip-codes.csv into a local List[String].
val filePath = "nyc-zip-codes.csv"

val nycZipcodes = spark.read
  .options(Map(
    "header" -> "true",
    "multiLine" -> "true",
    "inferSchema" -> "false",
    "escape" -> "\""
  ))
  .csv(filePath)
  .select("ZipCode")
  .as[String]
  .collect()
  .toList

In [29]:
// Restrict the zip-code table to the zip codes listed in nycZipcodes and pull
// the matching rows to the driver.
val newZipcodes = zipcodesDF.filter(col("ZIP").isin(nycZipcodes: _*)).collect()

// Print the rows themselves plus their count. The previous print(array, length)
// auto-tupled its arguments and printed the array's JVM identity instead of its contents.
println(s"${newZipcodes.mkString(", ")} (${newZipcodes.length} rows)")

In [30]:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.broadcast
import org.apache.spark.sql.functions.udf

/**
 * Great-circle distance between two points given in decimal degrees, using the
 * haversine formula with an Earth radius of 6371 km.
 *
 * The result keeps its fractional part. It was previously truncated with
 * `.toInt`, which made every pair of points closer than one kilometre report a
 * distance of 0 and made a nearest-neighbour search unable to tell nearby
 * candidates apart.
 *
 * @param lat1 latitude of the first point, in degrees
 * @param lon1 longitude of the first point, in degrees
 * @param lat2 latitude of the second point, in degrees
 * @param lon2 longitude of the second point, in degrees
 * @return distance in kilometres
 */
def haversine(lat1: Double, lon1: Double, lat2: Double, lon2: Double): Double = {
    val earthRadiusKm = 6371.0
    val sinHalfLat = Math.sin(Math.toRadians(lat1 - lat2) / 2)
    val sinHalfLng = Math.sin(Math.toRadians(lon1 - lon2) / 2)
    val a = sinHalfLat * sinHalfLat +
        Math.cos(Math.toRadians(lat1)) *
        Math.cos(Math.toRadians(lat2)) *
        sinHalfLng * sinHalfLng
    // Floating-point rounding can push `a` marginally outside [0, 1], which
    // would turn sqrt(1 - a) into NaN for near-antipodal points.
    val clamped = Math.min(1.0, Math.max(0.0, a))
    val centralAngle = 2 * Math.atan2(Math.sqrt(clamped), Math.sqrt(1 - clamped))
    earthRadiusKm * centralAngle
}

// Column-level wrapper around haversine.
val haversineUDF = udf(haversine _)

// Ship the collected zip-code rows to the executors once.
val newZipcodeBrod = sc.broadcast(newZipcodes)

// For a (lat, lon) pair, scan every broadcast zip-code row and return the ZIP
// with the smallest haversine distance. On a tie the earliest row wins, and an
// empty table yields the empty string.
val getNearestZipCode = udf((lat1: Double, lon1: Double) => {
    val (nearestZipCode, _) =
        newZipcodeBrod.value.foldLeft(("", Double.MaxValue)) { case ((bestZip, bestDistance), row) =>
            val lat2 = row.getAs[Double]("LAT")
            val lon2 = row.getAs[Double]("LNG")
            val distance = haversine(lat1, lon1, lat2, lon2)
            if (distance < bestDistance) (row.getAs[String]("ZIP"), distance)
            else (bestZip, bestDistance)
        }
    nearestZipCode
})

// Tag each complaint with its nearest zip code.
val result = baseDF5.withColumn("zipcode", getNearestZipCode(col("latitude"), col("longitude")))

In [31]:
// Preview the complaints together with their assigned zipcode column.
z.show(result)

In [32]:
// Persist the zip-tagged complaints as CSV with a header row.
// No save mode is set, so Spark's default applies and the write fails if
// outputPath already exists.
val outputPath = "hdfs:///user/kn2359_nyu_edu/complaints"
result.write.option("header", "true").csv(outputPath)

In [33]:
// Print the number of rows in result.
print(result.count())

In [34]:
// Read the CSV data stored under the relative path "complaints".
// NOTE(review): presumably the output written to .../kn2359_nyu_edu/complaints — confirm.
val filePath = "complaints"

val mydf = spark.read
  .options(Map(
    "header" -> "true",
    "multiLine" -> "true",
    "inferSchema" -> "true",
    "escape" -> "\""
  ))
  .csv(filePath)

// Print how many rows were read back.
print(mydf.count())

In [35]:
// List the distinct susp_age_group values in the reloaded data.
z.show(mydf.select("susp_age_group").distinct())

In [36]:

// Shared NYC business listing; later cells use its zip and category columns.
val stanleyPath = "/user/lsj3272_nyu_edu/shared/reviews/nyc_businesses.csv"

val stanleyDf = spark.read
  .options(Map(
    "header" -> "true",
    "multiLine" -> "true",
    "inferSchema" -> "true",
    "escape" -> "\""
  ))
  .csv(stanleyPath)

In [37]:
// Join every complaint row to every business row that shares its zip code
// (join with no explicit type). A later cell rebinds joinedDf to an aggregated version.
val joinedDf = mydf.join(stanleyDf, stanleyDf("zip") === mydf("zipcode"))

In [38]:
// Explode the dot-separated category string into one row per category.
// The separator is a regular expression, so the dot must be escaped: an
// unescaped "." matches every character and yields only empty strings.
val category = joinedDf.select("category").withColumn("cats",  explode(split(col("category"), "\\.")))
                        .drop("category")

In [39]:
// One row per business category (category string split on literal dots).
val catDF1 = stanleyDf.select("category").withColumn("cats",  explode(split(col("category"), "\\."))).drop("category")
// Number of occurrences of each category, most frequent first.
val catDF2 = catDF1.groupBy("cats").agg(count("cats").alias("count")).orderBy(desc("count"))
// Lower-case and trim the category names for display. This was bound to
// `dogDF3` by mistake, so the z.show(catDF3) below referred to a name this cell never defined.
val catDF3 = catDF2.withColumn("cats", lower(trim(col("cats"))))
z.show(catDF3)

In [40]:
// Complaints per zip code (count of non-null zipcode values), largest first.
val mydf1 = mydf.groupBy("zipcode").agg(count("zipcode").alias("cc_count")).orderBy(desc("cc_count"))
z.show(mydf1)

In [41]:
// Attach the per-zip complaint count (cc_count) to every business in that zip code.
val joinedDf = mydf1.join(stanleyDf, stanleyDf("zip") === mydf1("zipcode"))

In [42]:
// One row per (business, category), splitting the category string on literal dots.
val catDF1 = joinedDf.withColumn("cats",  explode(split(col("category"), "\\."))).drop("category")
// Sum the zip-level complaint counts over all businesses of each category.
val catDF2 = catDF1.groupBy("cats").agg(sum("cc_count").alias("count")).orderBy(desc("count"))
// Names are lower-cased and trimmed only AFTER grouping, so variants that differ
// in case or surrounding whitespace are summed separately and can show up as
// repeated names in the output.
val catDF3 = catDF2.withColumn("cats", lower(trim(col("cats"))))
z.show(catDF3)

In [43]:
// Print the number of rows in the zip-count/business join.
print(joinedDf.count())

In [44]:
// Print the number of complaint rows.
print(mydf.count())

In [45]:
// Print the number of business rows.
print(stanleyDf.count())

In [46]:
// Business listing, read from the shared reviews directory.
val stanleyPath = "/user/lsj3272_nyu_edu/shared/reviews/nyc_businesses.csv"

val bussinessDF = spark.read
  .options(Map(
    "header" -> "true",
    "multiLine" -> "true",
    "inferSchema" -> "true",
    "escape" -> "\""
  ))
  .csv(stanleyPath)

// Businesses per normalised category, keeping categories with at least 100 entries.
// Steps: split the category string on literal dots, lower-case and trim each
// part, collapse anything containing "restaurant" or "salon" into that single
// word, rewrite "cafe" to "coffee shop", then count and sort.
val numBusinessDF = bussinessDF
  .select(explode(split(col("category"), "\\.")).as("cats"))
  .withColumn("cats", lower(trim(col("cats"))))
  .withColumn("cats", regexp_replace(col("cats"), "(?i)(.*)(restaurant)(.*)", "restaurant"))
  .withColumn("cats", regexp_replace(col("cats"), "(?i)(.*)(salon)(.*)", "salon"))
  .withColumn("cats", regexp_replace(col("cats"), "cafe|coffee shop", "coffee shop"))
  .groupBy("cats")
  .agg(count("cats").alias("count"))
  .orderBy(desc("count"))
  .filter("count >= 100")

z.show(numBusinessDF)

In [47]:
// Reload the zip-tagged complaints.
val filePath = "complaints"

val myDF = spark.read
  .option("header", "true")
  .option("multiLine", "true")
  .option("inferSchema", "true")
  .option("escape", "\"")
  .csv(filePath)

// Complaints (violations and misdemeanors only) attributed to each business
// category: count complaints per zip code, attach that count to every business
// in the zip code, split each business into its categories, normalise the
// category names and sum the counts per category.
//
// The normalisation now includes the "cafe|coffee shop" rule that numBusinessDF
// applies. Without it "cafe" rows found no partner in the later join on category
// name, and the "coffee shop" numerator missed them although the denominator
// counted them.
// The intermediate sort before the join was removed; the final orderBy alone
// determines the output order.
val catsDF = myDF
           //.filter("law_cat_cd = 'FELONY'")
           .filter("law_cat_cd = 'VIOLATION' OR law_cat_cd = 'MISDEMEANOR'")
           //.filter("law_cat_cd = 'MISDEMEANOR'")
           .groupBy("zipcode").agg(count("zipcode").alias("cc_count"))
           .join(bussinessDF, bussinessDF("zip") === myDF("zipcode"))
           .withColumn("cats1",  explode(split(col("category"), "\\."))).drop("category")
           .withColumn("cats1", lower(trim(col("cats1"))))
           .withColumn("cats1", regexp_replace(col("cats1"), "(?i)(.*)(restaurant)(.*)", "restaurant"))
           .withColumn("cats1", regexp_replace(col("cats1"), "(?i)(.*)(salon)(.*)", "salon"))
           .withColumn("cats1", regexp_replace(col("cats1"), "cafe|coffee shop", "coffee shop"))
           .groupBy("cats1").agg(sum("cc_count").alias("my_count")).orderBy(desc("my_count"))

z.show(catsDF)

In [48]:
// Pair each category's complaint total (my_count) with its business count
// (count) by joining on the normalised category name, then drop the duplicate key.
val finalDF1 = catsDF.join(numBusinessDF, numBusinessDF("cats") === catsDF("cats1")).drop("cats1")
z.show(finalDF1)

In [49]:
// Complaints per business for each category: my_count divided by count, highest ratio first.
val finalDF2 = finalDF1.withColumn("crimes_per_cat", col("my_count")/col("count")).drop("my_count").orderBy(desc("crimes_per_cat"))
z.show(finalDF2)

In [50]:
// Reload the zip-tagged complaints.
val filePath = "complaints"

val myDF = spark.read
  .option("header", "true")
  .option("multiLine", "true")
  .option("inferSchema", "true")
  .option("escape", "\"")
  .csv(filePath)

// Complaints per (business category, law category): count complaints per
// (zip code, law_cat_cd), attach the counts to every business in the zip code,
// split each business into its categories, normalise the names and sum.
//
// The normalisation now includes the "cafe|coffee shop" rule that numBusinessDF
// applies, so that "cafe" businesses contribute to the "coffee shop" row instead
// of being dropped by the later join on category name.
// The intermediate sort before the join was removed; the final orderBy alone
// determines the output order.
val catsDF = myDF
           .groupBy("zipcode", "law_cat_cd").agg(count("*").alias("cc_count"))
           .join(bussinessDF, bussinessDF("zip") === myDF("zipcode"))
           .withColumn("cats1",  explode(split(col("category"), "\\."))).drop("category")
           .withColumn("cats1", lower(trim(col("cats1"))))
           .withColumn("cats1", regexp_replace(col("cats1"), "(?i)(.*)(restaurant)(.*)", "restaurant"))
           .withColumn("cats1", regexp_replace(col("cats1"), "(?i)(.*)(salon)(.*)", "salon"))
           .withColumn("cats1", regexp_replace(col("cats1"), "cafe|coffee shop", "coffee shop"))
           .groupBy("cats1", "law_cat_cd").agg(sum("cc_count").alias("num_complaints")).orderBy(desc("num_complaints"))

z.show(catsDF)

In [51]:
// Join the per-category complaint totals with the per-category business counts
// and derive crimes_per_cat = num_complaints / count, highest ratio first.
// NOTE(review): "count" comes from numBusinessDF, where it is the number of
// businesses in the category, yet it is renamed to "crimes" here — confirm the
// intended output column name before relying on it.
val finalDF = catsDF.join(numBusinessDF, numBusinessDF("cats") === catsDF("cats1")).drop("cats1")
                     .withColumn("crimes_per_cat", col("num_complaints")/col("count"))
                     .withColumnRenamed("cats", "bussiness")
                     .withColumnRenamed("count", "crimes")
                     .orderBy(desc("crimes_per_cat"))
z.show(finalDF)

In [52]:
// Write the final table as CSV with a header row.
// No save mode is set, so the write fails if outputPath already exists.
// NOTE(review): the path ends in .csv, but Spark's CSV writer creates a
// directory of part files at that path — confirm downstream readers expect that.
val outputPath = "/user/lsj3272_nyu_edu/shared/crime_categories.csv"
finalDF.write.option("header", "true").csv(outputPath)

In [53]:
// Preview the identifying, location and zip-code columns of the reloaded complaints.
z.show(myDF.select("cmplnt_num", "rpt_dt", "law_cat_cd", "boro_nm", "latitude", "longitude", "zipcode"))