# Let's use the famous Titanic dataset and perform the following operations using SAS:

1. Load the dataset from a CSV file
2. Explore and manipulate the dataset
3. Apply machine learning algorithm to predict survival using logistic regression
4. Aggregate the data to get summary statistics
5. Select and drop columns as needed
6. Perform statistical and mathematical calculations

Here's the SAS code:

In [0]:
"""
/* Load the Titanic dataset from a CSV file */
proc import datafile='titanic.csv'
            out=titanic
            dbms=csv replace;
            getnames=yes;
run;

/* Explore the dataset */
proc contents data=titanic;
run;

proc print data=titanic(obs=5);
run;

/* Manipulate the dataset */
/* Create a new column for family size */
data titanic;
    set titanic;
    family_size = sibsp + parch + 1;
run;

/* Apply machine learning algorithm */
/* Predict survival using logistic regression */
/* Split data into training and testing sets */
data titanic_train titanic_test;
    set titanic;
    if mod(_n_, 5) = 0 then output titanic_test;
    else output titanic_train;
run;

/* Fit a logistic regression model */
proc logistic data=titanic_train;
    model survived = sex age fare class family_size / selection=stepwise;
    score data=titanic_test out=titanic_predicted;
run;

/* Aggregate the data */
/* Get summary statistics */
proc means data=titanic mean median min max n;
    var age fare family_size;
    class survived sex;
run;

/* Select and drop columns */
/* Select columns of interest */
proc sql;
    create table titanic_selected as
    select sex, age, fare, survived
    from titanic;
quit;

/* Drop columns */
data titanic_dropped;
    set titanic;
    drop name cabin;
run;

/* Perform statistical and mathematical calculations */
/* Calculate the correlation matrix */
proc corr data=titanic;
    var age fare family_size;
run;

/* Calculate the mean and standard deviation */
data titanic_stats;
    set titanic;
    mean_age = mean(age);
    std_dev_fare = std(fare);
run;
"""


print("This is my SAS code")

This is my SAS code


# SAS to PySpark Conversion

# Python to PySpark Conversion

In [0]:
#Load the Titanic dataset from a CSV file:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Titanic").getOrCreate()
titanic = spark.read.csv("dbfs:/FileStore/shared_uploads/purvajainpj123@gmail.com/titanic.csv", header=True, inferSchema=True)


In [0]:
#Explore the dataset:
titanic.printSchema()
titanic.show(5)

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Siblings/Spouses Aboard: integer (nullable = true)
 |-- Parents/Children Aboard: integer (nullable = true)
 |-- Fare: double (nullable = true)

+--------+------+--------------------+------+----+-----------------------+-----------------------+-------+
|Survived|Pclass|                Name|   Sex| Age|Siblings/Spouses Aboard|Parents/Children Aboard|   Fare|
+--------+------+--------------------+------+----+-----------------------+-----------------------+-------+
|       0|     3|Mr. Owen Harris B...|  male|22.0|                      1|                      0|   7.25|
|       1|     1|Mrs. John Bradley...|female|38.0|                      1|                      0|71.2833|
|       1|     3|Miss. Laina Heikk...|female|26.0|                      0|                      0|  7.925|
|       1|     1|M

In [0]:
#Manipulate the dataset:
from pyspark.sql.functions import col

titanic = titanic.withColumn("family_size", col("Siblings/Spouses Aboard") + col("Parents/Children Aboard") + 1)

titanic.show(5)


+--------+------+--------------------+------+----+-----------------------+-----------------------+-------+-----------+
|Survived|Pclass|                Name|   Sex| Age|Siblings/Spouses Aboard|Parents/Children Aboard|   Fare|family_size|
+--------+------+--------------------+------+----+-----------------------+-----------------------+-------+-----------+
|       0|     3|Mr. Owen Harris B...|  male|22.0|                      1|                      0|   7.25|          2|
|       1|     1|Mrs. John Bradley...|female|38.0|                      1|                      0|71.2833|          2|
|       1|     3|Miss. Laina Heikk...|female|26.0|                      0|                      0|  7.925|          1|
|       1|     1|Mrs. Jacques Heat...|female|35.0|                      1|                      0|   53.1|          2|
|       0|     3|Mr. William Henry...|  male|35.0|                      0|                      0|   8.05|          1|
+--------+------+--------------------+------+---

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType

# Split data into training and testing sets
titanic_train, titanic_test = titanic.randomSplit([0.8, 0.2], seed=42)

# Convert categorical variable to numerical variable
sex_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
sex_encoder = OneHotEncoder(inputCols=['SexIndex'], outputCols=['SexVec'])
assembler = VectorAssembler(inputCols=['SexVec', 'Age', 'Fare', 'Pclass', 'family_size'], outputCol='features')
titanic_train = sex_indexer.fit(titanic_train).transform(titanic_train)
titanic_train = sex_encoder.fit(titanic_train).transform(titanic_train)
titanic_train = assembler.transform(titanic_train)
titanic_test = sex_indexer.fit(titanic_test).transform(titanic_test)
titanic_test = sex_encoder.fit(titanic_test).transform(titanic_test)
titanic_test = assembler.transform(titanic_test)

# Fit a logistic regression model
lr = LogisticRegression(featuresCol='features', labelCol='Survived', maxIter=10)
model = lr.fit(titanic_train)

# Make predictions on the test set
predictions = model.transform(titanic_test)

# Evaluate the model's accuracy
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='Survived')
accuracy = evaluator.evaluate(predictions)
error = 1 - accuracy
print('Accuracy:', accuracy)
print('Error:', error)


Accuracy: 0.8620481927710847
Error: 0.1379518072289153


In [0]:
# Aggregate the data
# Get summary statistics
from pyspark.sql.functions import mean, median, min, max, count

summary = titanic.groupby(['Survived', 'Sex']).agg(
    mean('Age').alias('mean_age'),
    median('Age').alias('median_age'),
    min('Age').alias('min_age'),
    max('Age').alias('max_age'),
    mean('Fare').alias('mean_fare'),
    median('Fare').alias('median_fare'),
    min('Fare').alias('min_fare'),
    max('Fare').alias('max_fare'),
    mean('family_size').alias('mean_family_size'),
    median('family_size').alias('median_family_size'),
    min('family_size').alias('min_family_size'),
    max('family_size').alias('max_family_size'),
    count('*').alias('count')
)
summary.limit(5).toPandas()


Unnamed: 0,Survived,Sex,mean_age,median_age,min_age,max_age,mean_fare,median_fare,min_fare,max_fare,mean_family_size,median_family_size,min_family_size,max_family_size,count
0,0,female,24.419753,22.0,2.0,62.0,23.024385,15.2458,6.75,151.55,3.246914,2.0,1,11,81
1,1,male,27.428165,28.0,0.42,80.0,40.821484,26.2875,0.0,512.3292,1.743119,1.0,1,7,109
2,1,female,28.866953,28.0,0.75,63.0,51.938573,26.0,7.225,512.3292,2.030043,2.0,1,7,233
3,0,male,31.136853,28.0,1.0,74.0,22.06617,9.49165,0.0,263.0,1.653017,1.0,1,11,464


In [0]:
# Select and drop columns
# Select columns of interest
from pyspark.sql.functions import col

titanic_selected = titanic.select(col('Sex'), col('Age'), col('Fare'), col('Survived'))
titanic_selected.show(5)


+------+----+-------+--------+
|   Sex| Age|   Fare|Survived|
+------+----+-------+--------+
|  male|22.0|   7.25|       0|
|female|38.0|71.2833|       1|
|female|26.0|  7.925|       1|
|female|35.0|   53.1|       1|
|  male|35.0|   8.05|       0|
+------+----+-------+--------+
only showing top 5 rows



In [0]:
# Drop columns
titanic_dropped = titanic.drop('Name')
titanic_dropped.show(5)


+--------+------+------+----+-----------------------+-----------------------+-------+-----------+
|Survived|Pclass|   Sex| Age|Siblings/Spouses Aboard|Parents/Children Aboard|   Fare|family_size|
+--------+------+------+----+-----------------------+-----------------------+-------+-----------+
|       0|     3|  male|22.0|                      1|                      0|   7.25|          2|
|       1|     1|female|38.0|                      1|                      0|71.2833|          2|
|       1|     3|female|26.0|                      0|                      0|  7.925|          1|
|       1|     1|female|35.0|                      1|                      0|   53.1|          2|
|       0|     3|  male|35.0|                      0|                      0|   8.05|          1|
+--------+------+------+----+-----------------------+-----------------------+-------+-----------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import mean, stddev, lit

# Calculate the correlation matrix
corr_matrix = titanic.select('Age', 'Fare', 'family_size').toPandas().corr()
print(corr_matrix)

# Calculate the mean and standard deviation
titanic_stats = titanic.select('Age', 'Fare')

mean_age = titanic_stats.select(mean('Age')).first()[0]
std_dev_fare = titanic_stats.select(stddev('Fare')).first()[0]

titanic_stats = titanic_stats.withColumn('mean_age', lit(mean_age))
titanic_stats = titanic_stats.withColumn('std_dev_fare', lit(std_dev_fare))

titanic_stats.show(5)


                  Age      Fare  family_size
Age          1.000000  0.112329    -0.300297
Fare         0.112329  1.000000     0.216250
family_size -0.300297  0.216250     1.000000
+----+-------+------------------+-----------------+
| Age|   Fare|          mean_age|     std_dev_fare|
+----+-------+------------------+-----------------+
|22.0|   7.25|29.471443066516347|49.78204040017391|
|38.0|71.2833|29.471443066516347|49.78204040017391|
|26.0|  7.925|29.471443066516347|49.78204040017391|
|35.0|   53.1|29.471443066516347|49.78204040017391|
|35.0|   8.05|29.471443066516347|49.78204040017391|
+----+-------+------------------+-----------------+
only showing top 5 rows

