# Let's use the famous Titanic dataset and perform the following operations using SAS:

1. Load the dataset from a CSV file
2. Explore and manipulate the dataset
3. Apply machine learning algorithm to predict survival using logistic regression
4. Aggregate the data to get summary statistics
5. Select and drop columns as needed
6. Perform statistical and mathematical calculations

Here's the SAS code:

In [7]:
"""
/* Load the Titanic dataset from a CSV file */
proc import datafile='titanic.csv'
            out=titanic
            dbms=csv replace;
            getnames=yes;
run;

/* Explore the dataset */
proc contents data=titanic;
run;

proc print data=titanic(obs=5);
run;

/* Manipulate the dataset */
/* Create a new column for family size */
data titanic;
    set titanic;
    family_size = sibsp + parch + 1;
run;

/* Apply machine learning algorithm */
/* Predict survival using logistic regression */
/* Split data into training and testing sets */
data titanic_train titanic_test;
    set titanic;
    if mod(_n_, 5) = 0 then output titanic_test;
    else output titanic_train;
run;

/* Fit a logistic regression model */
proc logistic data=titanic_train;
    model survived = sex age fare class family_size / selection=stepwise;
    score data=titanic_test out=titanic_predicted;
run;

/* Aggregate the data */
/* Get summary statistics */
proc means data=titanic mean median min max n;
    var age fare family_size;
    class survived sex;
run;

/* Select and drop columns */
/* Select columns of interest */
proc sql;
    create table titanic_selected as
    select sex, age, fare, survived
    from titanic;
quit;

/* Drop columns */
data titanic_dropped;
    set titanic;
    drop name cabin;
run;

/* Perform statistical and mathematical calculations */
/* Calculate the correlation matrix */
proc corr data=titanic;
    var age fare family_size;
run;

/* Calculate the mean and standard deviation */
data titanic_stats;
    set titanic;
    mean_age = mean(age);
    std_dev_fare = std(fare);
run;
"""


print("This is my SAS code")





This is my SAS code


# SAS to Python Conversion

In [8]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load the Titanic dataset from a CSV file
titanic = pd.read_csv('titanic.csv')



In [9]:
# Explore the dataset
print(titanic.info())
print(titanic.head())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Survived                 887 non-null    int64  
 1   Pclass                   887 non-null    int64  
 2   Name                     887 non-null    object 
 3   Sex                      887 non-null    object 
 4   Age                      887 non-null    float64
 5   Siblings/Spouses Aboard  887 non-null    int64  
 6   Parents/Children Aboard  887 non-null    int64  
 7   Fare                     887 non-null    float64
dtypes: float64(2), int64(4), object(2)
memory usage: 55.6+ KB
None
   Survived  Pclass                                               Name  \
0         0       3                             Mr. Owen Harris Braund   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikk

In [12]:
# Manipulate the dataset
# Create a new column for family size
titanic['family_size'] = titanic['Siblings/Spouses Aboard'] + titanic['Parents/Children Aboard'] + 1
titanic.head(5)



Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,family_size
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25,2
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833,2
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925,1
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1,2
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05,1


In [18]:
# Apply machine learning algorithm
# Predict survival using logistic regression
# Split data into training and testing sets
titanic_train, titanic_test = train_test_split(titanic, test_size=0.2, random_state=42)

# Convert categorical variable to numerical variable
titanic_train = pd.get_dummies(titanic_train, columns=['Sex'], drop_first=True)
titanic_test = pd.get_dummies(titanic_test, columns=['Sex'], drop_first=True)

# Fit a logistic regression model
logreg = LogisticRegression(solver='liblinear')
features = ['Sex_male', 'Age', 'Fare', 'Pclass', 'family_size']
logreg.fit(titanic_train[features], titanic_train['Survived'])
titanic_test['predicted_survived'] = logreg.predict(titanic_test[features])
#To get the predicted survival values for the test set
predictions = logreg.predict(titanic_test[features])
#To get the accuracy score of the model
accuracy = logreg.score(titanic_test[features], titanic_test['Survived'])
print("Accuracy:",accuracy)
#To get the error rate
error = 1 - accuracy
print("Error:",error)



Accuracy: 0.7415730337078652
Error: 0.2584269662921348


In [19]:
# Aggregate the data
# Get summary statistics
print(titanic.groupby(['Survived', 'Sex'])['Age', 'Fare', 'family_size'].agg(['mean', 'median', 'min', 'max', 'count']))



                       Age                                Fare            \
                      mean median   min   max count       mean    median   
Survived Sex                                                               
0        female  24.419753   22.0  2.00  62.0    81  23.024385  15.24580   
         male    31.136853   28.0  1.00  74.0   464  22.066170   9.49165   
1        female  28.866953   28.0  0.75  63.0   233  51.938573  26.00000   
         male    27.428165   28.0  0.42  80.0   109  40.821484  26.28750   

                                       family_size                       
                   min       max count        mean median min max count  
Survived Sex                                                             
0        female  6.750  151.5500    81    3.246914    2.0   1  11    81  
         male    0.000  263.0000   464    1.653017    1.0   1  11   464  
1        female  7.225  512.3292   233    2.030043    2.0   1   7   233  
         male    0.000 

  print(titanic.groupby(['Survived', 'Sex'])['Age', 'Fare', 'family_size'].agg(['mean', 'median', 'min', 'max', 'count']))


In [30]:
# Select and drop columns
# Select columns of interest
titanic_selected = titanic[['Sex', 'Age', 'Fare', 'Survived']]
titanic_selected.head(5)



Unnamed: 0,Sex,Age,Fare,Survived
0,male,22.0,7.25,0
1,female,38.0,71.2833,1
2,female,26.0,7.925,1
3,female,35.0,53.1,1
4,male,35.0,8.05,0


In [31]:
# Drop columns
titanic_dropped = titanic.drop(columns=['Name'])
titanic_dropped.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,family_size
0,0,3,male,22.0,1,0,7.25,2
1,1,1,female,38.0,1,0,71.2833,2
2,1,3,female,26.0,0,0,7.925,1
3,1,1,female,35.0,1,0,53.1,2
4,0,3,male,35.0,0,0,8.05,1


In [35]:
# Perform statistical and mathematical calculations
# Calculate the correlation matrix
corr_matrix = titanic[['Age', 'Fare', 'family_size']].corr()
print(corr_matrix)

# Calculate the mean and standard deviation
titanic_stats = titanic[['Age', 'Fare']]

titanic_stats['mean_age'] = titanic['Age'].mean()
titanic_stats.head(5)
titanic_stats['std_dev_fare'] = titanic['Fare'].std()
titanic_stats.head(5)



                  Age      Fare  family_size
Age          1.000000  0.112329    -0.300297
Fare         0.112329  1.000000     0.216250
family_size -0.300297  0.216250     1.000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_stats['mean_age'] = titanic['Age'].mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_stats['std_dev_fare'] = titanic['Fare'].std()


Unnamed: 0,Age,Fare,mean_age,std_dev_fare
0,22.0,7.25,29.471443,49.78204
1,38.0,71.2833,29.471443,49.78204
2,26.0,7.925,29.471443,49.78204
3,35.0,53.1,29.471443,49.78204
4,35.0,8.05,29.471443,49.78204
