In [0]:
%python

df = spark.read.csv("dbfs:/FileStore/shared_uploads/nivimachinelearning@gmail.com/titanic_training_data-3.csv",header=True)

In [0]:
%python
df.show(1)

+-----------+--------+------+--------------------+----+---+-----+-----+---------+----+-----+--------+
|PassengerId|Survived|Pclass|                Name| Sex|Age|SibSp|Parch|   Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+----+---+-----+-----+---------+----+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|male| 22|    1|    0|A/5 21171|7.25| null|       S|
+-----------+--------+------+--------------------+----+---+-----+-----+---------+----+-----+--------+
only showing top 1 row



In [0]:
%python
# Display DataFrame schema
df.printSchema()

# Count the number of rows in the DataFrame
rowCount = df.count()
print(f"Number of rows: {rowCount}")

# Display summary statistics for numeric columns
df.describe().show()

# Filter and display passengers with age greater than 30
filtered_df = df.filter(df["Age"] > 30)
filtered_df.show()


root
 |-- PassengerId: string (nullable = true)
 |-- Survived: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SibSp: string (nullable = true)
 |-- Parch: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)

Number of rows: 891
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------

In [0]:
%python
# Extract a specific column as an RDD
age_rdd = df.select("Age").rdd.map(lambda row: row[0])

# Display the first few elements of the RDD
for age in age_rdd.take(5):
    print(age)


22
38
26
35
35


In [0]:
%python
# Convert age values to numbers in the RDD, handling None values and non-numeric values
age_number_rdd = age_rdd.map(lambda age: float(age) if age is not None and age.replace('.', '', 1).isdigit() else 0)

# Calculate the average age using RDD
total_age = age_number_rdd.reduce(lambda x, y: x + y)
average_age = total_age / rowCount
print(f"Average Age: {average_age}")




Average Age: 23.79929292929293


In [0]:
%python
# Register DataFrame as a temporary SQL table
df.createOrReplaceTempView("titanic_table")

# Perform SQL query on DataFrame
sql_result = spark.sql("SELECT Pclass, COUNT(*) AS count FROM titanic_table GROUP BY Pclass ORDER BY Pclass")
sql_result.show()


+------+-----+
|Pclass|count|
+------+-----+
|     1|  216|
|     2|  184|
|     3|  491|
+------+-----+



In [0]:
%scala
// Define the file path
val filePath = "/FileStore/shared_uploads/nivimachinelearning@gmail.com/titanic_training_data-3.csv"

// Read CSV into a DataFrame
val df = spark.read.option("header", "true").csv(filePath)

// Create a temporary SQL table
df.createOrReplaceTempView("titanic_table")



In [0]:
%scala
// Perform SQL query on DataFrame
val sqlResult = spark.sql("SELECT Pclass, COUNT(*) AS count FROM titanic_table GROUP BY Pclass ORDER BY Pclass")
sqlResult.show()



In [0]:
%scala
// Create an RDD from a collection
val data = List(1, 2, 3, 4, 5)
val rdd = sc.parallelize(data)

// Perform a transformation: Square each element
val squaredRDD = rdd.map(x => x * x)

// Perform an action: Print the squared values
squaredRDD.collect().foreach(println)


In [0]:
%scala
// Create a DataFrame from a collection
val data = List((1, "Alice"), (2, "Bob"), (3, "Charlie"))
val df = spark.createDataFrame(data).toDF("ID", "Name")

// Perform transformations using DataFrame API
val resultDF = df.select("ID", "Name").filter("ID > 1")

// Show the result
resultDF.show()
