In [1]:
!pip install pyspark -qqq

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DoubleType

In [3]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/chipotle-locations/us-states.json
/kaggle/input/chipotle-locations/chipotle_stores.csv


SQL Schema

In [4]:
spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("salary", FloatType(), True)
])

data = [("satya", 26, 0.01), ("satya_in", 26, 1000.0)]
df = spark.createDataFrame(data, schema)
df.show()
spark.stop()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/07 16:02:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+--------+---+------+
|    name|age|salary|
+--------+---+------+
|   satya| 26|  0.01|
|satya_in| 26|1000.0|
+--------+---+------+



View SQL Requests

In [5]:
spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("WARN")

data = [("satya", 26, 0.01), ("satya_in", 26, 1000.0), ("satya_norm", 26, 100.0)]

df = spark.createDataFrame(data, ["name", "age", "salary"])

# Convert into df into a table
df.createOrReplaceTempView("pers_table")

# SQL Query 
spark.sql("select * from pers_table").show()

                                                                                

+----------+---+------+
|      name|age|salary|
+----------+---+------+
|     satya| 26|  0.01|
|  satya_in| 26|1000.0|
|satya_norm| 26| 100.0|
+----------+---+------+



In [6]:
sql_query_1 = """
SELECT name, age, salary,
AVG(salary) OVER (PARTITION BY age) AS avg_salary
FROM pers_table
ORDER BY name
"""

result = spark.sql(sql_query_1)
result.show()
spark.stop()

                                                                                

+----------+---+------+----------+
|      name|age|salary|avg_salary|
+----------+---+------+----------+
|     satya| 26|  0.01|    366.67|
|  satya_in| 26|1000.0|    366.67|
|satya_norm| 26| 100.0|    366.67|
+----------+---+------+----------+



CSV Files

In [7]:
spark = SparkSession.builder.getOrCreate()
spark.read.csv('/kaggle/input/chipotle-locations/chipotle_stores.csv')

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string]

Some useful things to note when reading csv file:

* If your data contains a header; set header=True
* If you want to automatically determine column types and set them; set inferSchema=True
* To add an option to .csv, add it before .csv by using .option, we can set different settings for reading csv files here
* Set the delimiter (eg. via .option('delimiter',';') if you data is separated by ';'

In [8]:
spark.read.option('delimiter', ',')\
          .option('header', True)\
          .option('inferSchema', True)\
          .csv('/kaggle/input/chipotle-locations/chipotle_stores.csv').limit(5).show()

+-------+----------+--------------------+------------------+------------------+
|  state|  location|             address|          latitude|         longitude|
+-------+----------+--------------------+------------------+------------------+
|Alabama|    Auburn|346 W Magnolia Av...|32.606812966051244|-85.48732833164195|
|Alabama|Birmingham|300 20th St S Bir...|33.509721495414745|-86.80275567068401|
|Alabama|Birmingham|3220 Morrow Rd Bi...| 33.59558141391436|-86.64743684970283|
|Alabama|Birmingham|4719 Highway 280 ...| 33.42258214624579| -86.6982794650297|
|Alabama|   Cullman|1821 Cherokee Ave...| 34.15413376734492|-86.84122007667406|
+-------+----------+--------------------+------------------+------------------+



To set StructFields and define a type, we should know which types are available to us in pyspark

1. StringType: Represents string values.
2. IntegerType: Represents integer values.
3. LongType: Represents long integer values.
4. FloatType: Represents float values.
5. DoubleType: Represents double values.
6. BooleanType: Represents boolean values.
7. DateType: Represents date values.
8. TimestampType: Represents timestamp values.
9. ArrayType: Represents arrays of elements with a specific data type.
10. MapType: Represents key-value pairs with specific data types for keys and values.
11. StructType: Represents a structure or record with multiple fields.

In [9]:
from pyspark.sql.types import DateType, StringType, FloatType, IntegerType, TimestampType

schema = StructType([
    StructField("state", StringType(), True),
    StructField("location", StringType(), True),
    StructField("address", StringType(), True),
    StructField("latitude", FloatType(), True),
    StructField("longitude", FloatType(), True)
])

df = spark.read.csv('/kaggle/input/chipotle-locations/chipotle_stores.csv', header=True, inferSchema=True, schema=schema)
df.show()

+-------+--------------+--------------------+---------+-----------+
|  state|      location|             address| latitude|  longitude|
+-------+--------------+--------------------+---------+-----------+
|Alabama|        Auburn|346 W Magnolia Av...| 32.60681|  -85.48733|
|Alabama|    Birmingham|300 20th St S Bir...| 33.50972|  -86.80276|
|Alabama|    Birmingham|3220 Morrow Rd Bi...| 33.59558|  -86.64744|
|Alabama|    Birmingham|4719 Highway 280 ...| 33.42258|  -86.69828|
|Alabama|       Cullman|1821 Cherokee Ave...|34.154133|  -86.84122|
|Alabama|        Hoover|1759 Montgomery H...| 33.37896|   -86.8038|
|Alabama|    Huntsville|5900 University D...| 34.74232|  -86.66572|
|Alabama|        Mobile|3871 Airport Blvd...|30.675339|  -88.14375|
|Alabama|        Mobile|7765 Airport Blvd...| 30.68273|    -88.225|
|Alabama|    Montgomery|2560 Berryhill Rd...|32.359177| -86.162254|
|Alabama|       Opelika|2125 Interstate D...| 32.61681|  -85.40448|
|Alabama|    Prattville|2566 Cobbs Ford R...|32.

In [10]:
from pyspark.sql.types import MapType

spark = SparkSession.builder.getOrCreate()
data = [(1, {"name": "satya", "salary": 0.01}), (2, {"name": "erik", "salary": 1000.0})]

df = spark.createDataFrame(data, ["id", "info"])
df.show()
spark.stop()

                                                                                

+---+--------------------+
| id|                info|
+---+--------------------+
|  1|{name -> satya, s...|
|  2|{name -> erik, sa...|
+---+--------------------+



PySpark contains a special function array_contains which allows you to check if a specified value exists in an array column. It returns a boolean value indicating whether the array contains the specified value

In [11]:
from pyspark.sql.functions import array_contains
spark = SparkSession.builder.getOrCreate()
data = [("a1", ["a", "ba", "ca"]), ("a2", ["aa", "bb"]), ("a3", ["bc", "cb"])]

df = spark.createDataFrame(data, ["key", "values"])
filter_df = df.where(array_contains(df.values, "bb"))
filter_df.show()

                                                                                

+---+--------+
|key|  values|
+---+--------+
| a2|[aa, bb]|
+---+--------+



In [12]:
filter_df2 = df.withColumn('contains', array_contains(df.values, "bb"))
filter_df2.show()

+---+-----------+--------+
|key|     values|contains|
+---+-----------+--------+
| a1|[a, ba, ca]|   false|
| a2|   [aa, bb]|    true|
| a3|   [bc, cb]|   false|
+---+-----------+--------+



Similar to SQL Select

In [13]:
select_df = df.select("values")
select_df.show()

+-----------+
|     values|
+-----------+
|[a, ba, ca]|
|   [aa, bb]|
|   [bc, cb]|
+-----------+



Similar to SQL WHERE

In [14]:
spark = SparkSession.builder.getOrCreate()
data = spark.read.csv('/kaggle/input/chipotle-locations/chipotle_stores.csv', header=True, inferSchema=True, schema=schema)
data.limit(5).show()

+-------+----------+--------------------+---------+---------+
|  state|  location|             address| latitude|longitude|
+-------+----------+--------------------+---------+---------+
|Alabama|    Auburn|346 W Magnolia Av...| 32.60681|-85.48733|
|Alabama|Birmingham|300 20th St S Bir...| 33.50972|-86.80276|
|Alabama|Birmingham|3220 Morrow Rd Bi...| 33.59558|-86.64744|
|Alabama|Birmingham|4719 Highway 280 ...| 33.42258|-86.69828|
|Alabama|   Cullman|1821 Cherokee Ave...|34.154133|-86.84122|
+-------+----------+--------------------+---------+---------+



In [15]:
from pyspark.sql import functions as f
filter_df = data.filter(data.state == "Maine")
filter_df = data.filter(f.col("state") == "Maine")
filter_df = data.filter((f.col("state") == "Maine") & (data.location == "Westbrook"))
filter_df.show()

+-----+---------+--------------------+---------+---------+
|state| location|             address| latitude|longitude|
+-----+---------+--------------------+---------+---------+
|Maine|Westbrook|11 Main St Suite ...|43.677555|-70.32975|
+-----+---------+--------------------+---------+---------+



GROUP BY

In [16]:
from pyspark.sql.functions import avg, count, expr
filter_df = data.groupBy("state")
# result = filter_df.agg(expr("count(name)"))
result = filter_df.agg(count(data.location).alias("location_count"))
result.show(51)

+--------------+--------------+
|         state|location_count|
+--------------+--------------+
|          Utah|            11|
|     Minnesota|            71|
|          Ohio|           193|
|      Arkansas|             6|
|        Oregon|            32|
|         Texas|           226|
|  North Dakota|             2|
|  Pennsylvania|            96|
|   Connecticut|            24|
|      Nebraska|            10|
|       Vermont|             2|
|        Nevada|            29|
|    Washington|            43|
|      Illinois|           144|
|      Oklahoma|            12|
|      Delaware|             9|
|    New Mexico|             9|
| West Virginia|             6|
|      Missouri|            39|
|  Rhode Island|             9|
|       Georgia|            61|
|       Montana|             3|
|      Michigan|            39|
|      Virginia|           107|
|North Carolina|            65|
|       Wyoming|             1|
|        Kansas|            30|
|    New Jersey|            69|
|      M

ORDER BY

In [17]:
data.orderBy(f.col("location").desc(), f.col("state").asc()).show(10)

+------------+----------+--------------------+---------+-----------+
|       state|  location|             address| latitude|  longitude|
+------------+----------+--------------------+---------+-----------+
|        Ohio|Zanesville|3581 Maple Ave Za...| 39.98919|  -82.02363|
|     Arizona|      Yuma|1525 S Yuma Palms...|32.699726| -114.60107|
|     Arizona|      Yuma|3080 S 4th Avenue...| 32.69988|-114.601006|
|  California| Yuba City|1005 Gray Ave Yub...|39.142582| -121.62976|
|        Ohio|Youngstown|320 Wick Ave Youn...|41.105312| -80.645325|
|Pennsylvania|      York|1923 Springwood R...|39.939648|  -76.69322|
|Pennsylvania|      York|2801 Concord Rd Y...|39.983067|  -76.66886|
|Pennsylvania|      York|890 Loucks Rd Yor...|39.979824|  -76.75195|
|    New York|   Yonkers|5510 Xavier Dr Sp...| 40.92685|  -73.85366|
|        Ohio|     Xenia|1620 W Park Squar...|39.689667|  -83.96204|
+------------+----------+--------------------+---------+-----------+
only showing top 10 rows



JOINS

In [18]:
df1 = spark.createDataFrame([(1, "John"), (2, "Alice"), (3, "Bob")], ["id", "name"])
df2 = spark.createDataFrame([(1, 25), (2, 30), (4, 35)], ["id", "age"])
df1.show()
df2.show()

+---+-----+
| id| name|
+---+-----+
|  1| John|
|  2|Alice|
|  3|  Bob|
+---+-----+

+---+---+
| id|age|
+---+---+
|  1| 25|
|  2| 30|
|  4| 35|
+---+---+



In [19]:
left_df = df1.join(df2, "id", "left")
outer_df = df1.join(df2, "id", "outer")
inner_df = df1.join(df2, "id", "inner")
left_df.show()
outer_df.show()
inner_df.show()

                                                                                

+---+-----+----+
| id| name| age|
+---+-----+----+
|  1| John|  25|
|  2|Alice|  30|
|  3|  Bob|NULL|
+---+-----+----+



                                                                                

+---+-----+----+
| id| name| age|
+---+-----+----+
|  1| John|  25|
|  2|Alice|  30|
|  3|  Bob|NULL|
|  4| NULL|  35|
+---+-----+----+





+---+-----+---+
| id| name|age|
+---+-----+---+
|  1| John| 25|
|  2|Alice| 30|
+---+-----+---+



                                                                                

User Defined Functions

In [20]:
from pyspark.sql.functions import udf
new_data = [("Alice", 25), 
        ("Bob", 30), 
        ("Charlie", 35)]
df = spark.createDataFrame(new_data, ["name", "age"])
def square(num):
    return num*num

square_udf = udf(square, IntegerType())

new_df = df.withColumn("square_value", square_udf(df["age"]))
new_df.show()

                                                                                

+-------+---+------------+
|   name|age|square_value|
+-------+---+------------+
|  Alice| 25|         625|
|    Bob| 30|         900|
|Charlie| 35|        1225|
+-------+---+------------+



                                                                                

In [21]:
# Using lambda function
sum_udf = udf(lambda x,y: x+y, IntegerType())
new_df2 = new_df.withColumn("add_value", sum_udf(new_df["age"], new_df["square_value"]))
new_df2.show()

[Stage 35:>                                                         (0 + 3) / 3]

+-------+---+------------+---------+
|   name|age|square_value|add_value|
+-------+---+------------+---------+
|  Alice| 25|         625|      650|
|    Bob| 30|         900|      930|
|Charlie| 35|        1225|     1260|
+-------+---+------------+---------+



                                                                                

In [22]:
# Array types
new_data = [(1, [10, 20, 30]), 
        (2, [15, 25, 35]), 
        (3, [12, 22, 32])]

df = spark.createDataFrame(new_data, ["id", "val"])

def avgs(arr):
    return sum(arr)/len(arr)

avg_udf = udf(avgs, FloatType())

new_df = df.withColumn("average", avg_udf(df["val"]))
new_df.show()



+---+------------+-------+
| id|         val|average|
+---+------------+-------+
|  1|[10, 20, 30]|   20.0|
|  2|[15, 25, 35]|   25.0|
|  3|[12, 22, 32]|   22.0|
+---+------------+-------+



                                                                                

PandasUDFType.SCALAR is a constant in PySpark that represents the type of pandas UDF (SCALAR)

A scalar pandas UDF takes one or more columns as input and returns a single column as output
It operates on a single row at a time and can be used to apply arbitrary Python functions to the data in a DataFrame

In [23]:
from pyspark.sql.functions import pandas_udf, PandasUDFType
new_data = [("alice", 25), 
        ("bob", 30), 
        ("charlie", 35)]

df = spark.createDataFrame(new_data, ["name", "age"])

@pandas_udf(returnType="string", functionType=PandasUDFType.SCALAR)
def caps(name):
    return name.str.upper()

df = df.withColumn("caps", caps(df["name"]))
df.show()

[Stage 39:>                                                         (0 + 3) / 3]

+-------+---+-------+
|   name|age|   caps|
+-------+---+-------+
|  alice| 25|  ALICE|
|    bob| 30|    BOB|
|charlie| 35|CHARLIE|
+-------+---+-------+



                                                                                

PandasUDFType.GROUPED_AGG is also a constant in PySpark that represents the type of a Pandas user-defined function (UDF) for grouped aggregation, so it should be used with groupBy and agg

In [24]:
new_data = [("Alice", "A", 34), 
        ("Bob", "A", 28), 
        ("Charlie", "B", 45), 
        ("David", "B", 50)]

df = spark.createDataFrame(new_data, ["name", "section", "age"])

@pandas_udf(FloatType(), functionType = PandasUDFType.GROUPED_AGG)
def mean_age(age):
    return age.mean()

result = df.groupBy("section").agg(mean_age(df["age"]).alias("mean_age"))
result.show()

[Stage 42:>                                                         (0 + 1) / 1]

+-------+--------+
|section|mean_age|
+-------+--------+
|      A|    31.0|
|      B|    47.5|
+-------+--------+



                                                                                

While both GROUPED_MAP and GROUPED_AGG are used for grouped operations (via GroupBy), they serve different purposes:

GROUPED_MAP is used for applying custom transformations to each group
GROUPED_AGG is used for performing aggregate operations on each group

Pandas requires an index when creating a DataFrame with scalar values. In your UDF, you need to ensure that the return DataFrame has the correct structure and indices.

In [25]:
import pandas as pd
new_data = [
    ("Electronics", "2021-01-01", 1000),
    ("Electronics", "2021-02-01", 1500),
    ("Clothing", "2021-01-01", 800),
    ("Clothing", "2021-02-01", 1200),
    ("Clothing", "2021-03-01", 1500)
]

df = spark.createDataFrame(new_data, ["item", "date", "price"])

@pandas_udf("category string, total_sales double", PandasUDFType.GROUPED_MAP)
def sales_(pdf):
    cat = pdf["item"].iloc[0]
    tot = pdf["price"].sum()
    return pd.DataFrame({"category": [cat], "total_sales": [tot]})

result = df.groupBy("item").apply(sales_)
result.show()

[Stage 45:>                                                         (0 + 1) / 1]

+-----------+-----------+
|   category|total_sales|
+-----------+-----------+
|   Clothing|     3500.0|
|Electronics|     2500.0|
+-----------+-----------+



                                                                                

Calculate Monthly Active Users

In [26]:
new_data = [
    (1, '2022-01-01'),
    (2, '2022-01-02'),
    (3, '2022-01-03'),
    (1, '2022-02-01'),
    (2, '2022-02-02'),
    (3, '2022-02-03'),
    (4, '2022-02-04')
]

df = spark.createDataFrame(new_data, ["id", "date"])
df = df.withColumn("date", f.col("date").cast("date"))
df.show()

+---+----------+
| id|      date|
+---+----------+
|  1|2022-01-01|
|  2|2022-01-02|
|  3|2022-01-03|
|  1|2022-02-01|
|  2|2022-02-02|
|  3|2022-02-03|
|  4|2022-02-04|
+---+----------+



In [27]:
# Create a temp table
df.createOrReplaceTempView("activity")

In [28]:
sql = """
SELECT COUNT(DISTINCT(id))
FROM activity
WHERE date >= '2022-01-01' AND date <= '2022-12-01'
"""

spark.sql(sql).show()

+------------------+
|count(DISTINCT id)|
+------------------+
|                 4|
+------------------+



In [29]:
from pyspark.sql.functions import col, count, countDistinct
df.groupBy("date").agg(countDistinct(f.col("id")).alias("AU")).show()

+----------+---+
|      date| AU|
+----------+---+
|2022-02-01|  1|
|2022-02-04|  1|
|2022-02-03|  1|
|2022-01-03|  1|
|2022-02-02|  1|
|2022-01-01|  1|
|2022-01-02|  1|
+----------+---+



In [30]:
new_data = [("customer1", "2021-01-01", 100),
        ("customer1", "2021-01-02", 150), 
        ("customer2", "2021-01-01", 200), 
        ("customer2", "2021-01-02", 300)]

df = spark.createDataFrame(new_data, ["cust", "date", "amount"])
df.groupBy("cust").agg(f.sum(f.col("amount")).alias("total_amt")).show()

+---------+---------+
|     cust|total_amt|
+---------+---------+
|customer1|      250|
|customer2|      500|
+---------+---------+



ROI

In [31]:
new_data = [
    ("Project A", 100000, 150000),  # (Project Name, Cost of Investment, Net Profit)
    ("Project B", 80000, 120000),
    ("Project C", 120000, 90000)
]
df = spark.createDataFrame(new_data, ["name", "cost", "net_profit"])
df.show()

+---------+------+----------+
|     name|  cost|net_profit|
+---------+------+----------+
|Project A|100000|    150000|
|Project B| 80000|    120000|
|Project C|120000|     90000|
+---------+------+----------+



In [32]:
df = df.withColumn("ROI", (f.col("net_profit") / f.col("cost") * 100))
df.show()

+---------+------+----------+-----+
|     name|  cost|net_profit|  ROI|
+---------+------+----------+-----+
|Project A|100000|    150000|150.0|
|Project B| 80000|    120000|150.0|
|Project C|120000|     90000| 75.0|
+---------+------+----------+-----+



In [33]:
spark.stop()

Binarizer - is a Transformer which applies a threshold to a numeric field, turning it into 0s (below threshold) and 1s (above threshold)

The method itself is accessable from pyspark.ml.feature and requires a double input dtype
To convert data types after the scheme has been set or created, use withColumn w/ col().cast()

In [34]:
from pyspark.ml.feature import Binarizer
from pyspark.sql.functions import col, countDistinct, lag

spark = SparkSession.builder.appName("binarizer_").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

schema = StructType([
    StructField("cust_id", IntegerType(), True),
    StructField("age", IntegerType(), True),
    StructField("gender", StringType(), False),
    StructField("monthly_charges", IntegerType(), True),
    StructField("total_charges", IntegerType(), True),
    StructField("contract_len", IntegerType(), True),
    StructField("churn", StringType(), True)
])

data = [
    (1, 45, 'M', 75, 900, 12, 'No'),
    (2, 30, 'F', 60, 720, 6, 'Yes'),
    (3, 50, '', 85, 1020, 24, 'No'),
    (4, 35, 'F', 70, 840, 12, 'Yes'),
    (5, 55, 'M', 95, 1140, 24, 'No'),
    (6, 40, 'F', 80, 960, 6, 'No'),
    (7, 25, 'M', 55, 660, 6, 'Yes'),
    (8, 60, 'F', 100, 1200, 12, 'No'),
    (9, 50, 'M', 90, 1080, 24, 'No'),
    (10, 35, 'F', 65, 780, 6, 'Yes')
]

df = spark.createDataFrame(data, schema)
df.show()

                                                                                

+-------+---+------+---------------+-------------+------------+-----+
|cust_id|age|gender|monthly_charges|total_charges|contract_len|churn|
+-------+---+------+---------------+-------------+------------+-----+
|      1| 45|     M|             75|          900|          12|   No|
|      2| 30|     F|             60|          720|           6|  Yes|
|      3| 50|      |             85|         1020|          24|   No|
|      4| 35|     F|             70|          840|          12|  Yes|
|      5| 55|     M|             95|         1140|          24|   No|
|      6| 40|     F|             80|          960|           6|   No|
|      7| 25|     M|             55|          660|           6|  Yes|
|      8| 60|     F|            100|         1200|          12|   No|
|      9| 50|     M|             90|         1080|          24|   No|
|     10| 35|     F|             65|          780|           6|  Yes|
+-------+---+------+---------------+-------------+------------+-----+



In [35]:
df = df.withColumn("age", f.col("age").cast("double"))
df.show()

+-------+----+------+---------------+-------------+------------+-----+
|cust_id| age|gender|monthly_charges|total_charges|contract_len|churn|
+-------+----+------+---------------+-------------+------------+-----+
|      1|45.0|     M|             75|          900|          12|   No|
|      2|30.0|     F|             60|          720|           6|  Yes|
|      3|50.0|      |             85|         1020|          24|   No|
|      4|35.0|     F|             70|          840|          12|  Yes|
|      5|55.0|     M|             95|         1140|          24|   No|
|      6|40.0|     F|             80|          960|           6|   No|
|      7|25.0|     M|             55|          660|           6|  Yes|
|      8|60.0|     F|            100|         1200|          12|   No|
|      9|50.0|     M|             90|         1080|          24|   No|
|     10|35.0|     F|             65|          780|           6|  Yes|
+-------+----+------+---------------+-------------+------------+-----+



In [36]:
# Age > 30
binarise_ = Binarizer(threshold = 30, inputCol = "age", outputCol = "age_above_30")
df = binarise_.transform(df)
df.show()

+-------+----+------+---------------+-------------+------------+-----+------------+
|cust_id| age|gender|monthly_charges|total_charges|contract_len|churn|age_above_30|
+-------+----+------+---------------+-------------+------------+-----+------------+
|      1|45.0|     M|             75|          900|          12|   No|         1.0|
|      2|30.0|     F|             60|          720|           6|  Yes|         0.0|
|      3|50.0|      |             85|         1020|          24|   No|         1.0|
|      4|35.0|     F|             70|          840|          12|  Yes|         1.0|
|      5|55.0|     M|             95|         1140|          24|   No|         1.0|
|      6|40.0|     F|             80|          960|           6|   No|         1.0|
|      7|25.0|     M|             55|          660|           6|  Yes|         0.0|
|      8|60.0|     F|            100|         1200|          12|   No|         1.0|
|      9|50.0|     M|             90|         1080|          24|   No|      