In [None]:
install.packages("Sleuth3")
install.packages("tidyverse")
install.packages("ggplot2")
install.packages("car")
install.packages("ISLR")
install.packages("leaps")
install.packages("lattice")
install.packages("grid")
install.packages("hexbin")

install.packages("dplyr")
install.packages("Amelia")
install.packages("scales")
install.packages("caTools")
install.packages("ROCR")
install.packages("rpart")
install.packages("rpart.plot")
install.packages("randomForest")
install.packages("caret")                 
install.packages("plyr")
install.packages("mosaic")                
install.packages("corrgram")
install.packages("Hmisc") # Install Correlation Package


In [None]:
library(Sleuth3)
library(tidyverse) # For general data modeling, wrangling, and visualization tasks. The 
library(ggplot2) # Plots
library(ISLR) # Statistical Learning with Applications 
library(leaps)
library(lattice)
library(grid)
library(hexbin)
library(dplyr) # Data Manipulation
library(Amelia) # Missing Data: Missing Map
library(scales) # Visualization
library(caTools) # Prediction: Splitting Data
library(car) # Prediction: Checking Multicollinearity
library(ROCR) # Prediction: ROC Curve
library(rpart) # Prediction: Decision Tree
library(rpart.plot) # Prediction: Decision Tree
library(randomForest) # Prediction: Random Forest
library(caret) # Prediction: k-Fold Cross Validation
library(plyr) # For splitting data apart
library(mosaic) # Visualizing 


In [None]:
# Import the Titanic training and test sets from their CSV files.
# Empty fields are read as NA and text columns stay character vectors.

Titanic.train <- read.csv(
  file = "train.csv",
  header = TRUE,
  na.strings = "",
  stringsAsFactors = FALSE
)
Titanic.test <- read.csv(
  file = "test.csv",
  header = TRUE,
  na.strings = "",
  stringsAsFactors = FALSE
)

# Last rows of each data set.
tail(Titanic.train)
tail(Titanic.test)

# Object class of each data set.
class(Titanic.train)
class(Titanic.test)

# Column types together with a preview of the values.
str(Titanic.train)
str(Titanic.test)



In [None]:
# TITANIC DATASET VARIABLE DESCRIPTIONS:
# Survived        Survival (0 = No; 1 = Yes)
# Pclass          Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
# Name            Name
# Sex             Sex
# Age             Age
# SibSp           Number of Siblings/Spouses Aboard
# Parch           Number of Parents/Children Aboard
# Ticket          Ticket Number
# Fare            Passenger Fare
# Cabin           Cabin
# Embarked        Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)


# Median passenger age in the train and test data sets.

# Without na.rm the missing ages are not dropped before the calculation.
median(Titanic.train$Age)

# Medians computed after dropping the missing ages.
median(Titanic.train$Age, na.rm = TRUE)
median(Titanic.test$Age, na.rm = TRUE)

# TRAIN DATASET (Titanic.train):
# inspect the raw columns, then flag every row as belonging to the train set.

str(Titanic.train)
head(Titanic.train)
names(Titanic.train)
ncol(Titanic.train)
Titanic.train$IsTrainset <- TRUE
summary(Titanic.train)

# TEST DATASET (Titanic.test):
# same inspection, with every row flagged as not belonging to the train set.

str(Titanic.test)
head(Titanic.test)
names(Titanic.test)
ncol(Titanic.test)
Titanic.test$IsTrainset <- FALSE
summary(Titanic.test)

# Column names and column counts of both data sets once the flag is present.

names(Titanic.train)
names(Titanic.test)

ncol(Titanic.train)
ncol(Titanic.test)


In [None]:
# COMBINING THE TRAIN AND TEST DATASETS
# The test data gets a "Survived" column (all NA) first, so that both data
# frames carry the same set of column names when they are stacked.

ncol(Titanic.train)
ncol(Titanic.test)

Titanic.test$Survived <- NA

Titanic <- rbind(Titanic.train, Titanic.test)

# Size and structure of the combined data.
dim(Titanic)
str(Titanic)

# Train/test flag as stored in the training data ...
Titanic.train$IsTrainset
tail(Titanic.train$IsTrainset)
table(Titanic.train$IsTrainset)

# ... and as stored in the combined data.
Titanic$IsTrainset
tail(Titanic$IsTrainset)
table(Titanic$IsTrainset)

# Convert Survived and Pclass to factors in the separate train and test frames
# (the combined `Titanic` frame is not modified here).

Titanic.train$Survived <- factor(Titanic.train$Survived)
Titanic.train$Survived

Titanic.test$Survived <- factor(Titanic.test$Survived)
Titanic.test$Survived


Titanic.train$Pclass <- factor(Titanic.train$Pclass)
Titanic.train$Pclass

Titanic.test$Pclass <- factor(Titanic.test$Pclass)
Titanic.test$Pclass



In [None]:
# Checking the structure of the full Titanic dataset

str(Titanic)
tail(Titanic)

table(Titanic$IsTrainset) # Number of test (FALSE) and train (TRUE) rows.

# ---------------------------------------------------------------------------
# PRE-PROCESSING OF THE FULL TITANIC DATASET AFTER COMBINING TRAIN AND TEST.
# HANDLING AND CHECKING MISSING DATA
# ---------------------------------------------------------------------------

# Missing values per column: NA only, then NA or empty string.

colSums(is.na(Titanic))
colSums(is.na(Titanic) | Titanic == "")

# Number of distinct values per column.

vapply(Titanic, function(x) length(unique(x)), integer(1))

# Observations recorded by the original author from the counts above:
# Cabin has the largest number of missing values (1014).
# Age has 263 missing values, while Embarked and Fare have two and one
# missing values, respectively.


# EXPLORING THE VALUES OF EVERY COLUMN

table(Titanic$PassengerId)
table(Titanic$Survived) # Passengers who survived (1) or died (0).
table(Titanic$Age)
table(Titanic$Embarked)
table(Titanic$Fare)
table(Titanic$Ticket)
table(Titanic$SibSp)
table(Titanic$Pclass) # Passenger class
table(Titanic$Sex) # Sex of passengers
table(Titanic$Name)
table(Titanic$Parch)
table(Titanic$Cabin)


table(Titanic$Survived)
# prop.table() expects a table of counts, so tabulate Survived first; the
# proportions are then taken over the labelled (non-NA) rows only.
prop.table(table(Titanic$Survived))
table(Titanic$Sex, Titanic$Survived) # Survival counts by sex
prop.table(table(Titanic$Sex, Titanic$Survived), margin = 1) # Row-wise shares
table(Titanic$Sex, Titanic$Pclass) # Sex by passenger class

# The RMS Titanic consisted of ten decks.
# The interior had 1st, 2nd, and 3rd class (the latter commonly referred to
# as "steerage").
# Titanic's passengers ranged from the rich and famous to the poor and obscure.
# Each passenger class was housed in a different area of the ship and each
# experienced a different level of luxury and comfort.
# Although 3rd class accommodations were not as elegant as 1st or 2nd class,
# they were still very nice, especially compared to other ships.


# FIND ALL MISSING VALUES IN THE DATASET
# The `missmap` function treats NA values as missing, but it does not treat
# empty strings as missing.
# `missmap` lets us explore how much missing data we have.


?missmap

# Missingness map showing where missingness occurs in the Titanic dataset.

missmap(Titanic, main = "Titanic Data - Missings Map",
        col = c("yellow", "blue"), legend = FALSE)

# Working on missing Embarked column values.
# Cabin values are not replaced, since Cabin will not be used in the
# prediction model.

# Cabin (very many missing values), PassengerId, Ticket and Name are not
# needed as predictors. No column is dropped in this cell: Name is still read
# further down to extract the passenger titles.


In [None]:
library(dplyr)

str(Titanic)

# Replace missing values in the Embarked column.

table(Titanic$IsTrainset)

table(Titanic$Embarked)

table(Titanic.train$Embarked)

table(Titanic.train$Embarked, useNA = "always")

# Fill the missing ports with "S" (Southampton). The combined `Titanic` frame
# is filled as well: Titanic.train and Titanic.test are rebuilt from it later,
# so a fix applied to Titanic.train alone would be discarded by that split.
Titanic$Embarked[is.na(Titanic$Embarked)] <- "S"
Titanic.train$Embarked[is.na(Titanic.train$Embarked)] <- "S"

table(Titanic$Embarked, useNA = "always")

table(Titanic.train$Embarked)

# Family size of each passenger: siblings/spouses + parents/children + the
# passenger themself.

Titanic$FamilySize <- Titanic$SibSp + Titanic$Parch + 1

head(Titanic$FamilySize)
str(Titanic$FamilySize)
table(Titanic$FamilySize)
which(is.na(Titanic$FamilySize))
summary(Titanic$FamilySize)



In [None]:
# CLEANING AND IMPUTATION OF MISSING DATA FOR AGE, FARE & PCLASS.
# Missing entries are filled in rather than dropping the affected rows.

# Share of missing ages in the training data.
sum(is.na(Titanic.train$Age)) / length(Titanic.train$Age)

# Share of missing values in every column of the train and test data.

vapply(
  Titanic.train,
  function(column) sum(is.na(column)) / length(column),
  numeric(1)
)

vapply(
  Titanic.test,
  function(column) sum(is.na(column)) / length(column),
  numeric(1)
)

# Distinct values per column, to spot columns that could become factors.

vapply(Titanic, function(column) length(unique(column)), integer(1))

vapply(Titanic.train, function(column) length(unique(column)), integer(1))

vapply(Titanic.test, function(column) length(unique(column)), integer(1))


# Impute missing ages with the median age of the combined data;
# the missing-value count is shown before and after.

table(is.na(Titanic$Age))

Median.Age <- median(Titanic$Age, na.rm = TRUE)
Titanic$Age[is.na(Titanic$Age)] <- Median.Age

table(is.na(Titanic$Age))



In [None]:
# Impute missing Fare values with the median fare of the combined data.

table(is.na(Titanic$Fare))

Median.Fare <- median(Titanic$Fare, na.rm = TRUE)
Titanic$Fare[is.na(Titanic$Fare)] <- Median.Fare

# NA-or-empty counts per column after the fill, then the Fare check again.
colSums(is.na(Titanic) | Titanic == "")

table(is.na(Titanic$Fare))


# Pclass: count the missing values and compute the median class.
# Median.Pclass is only computed here; nothing in this cell writes it back.

table(is.na(Titanic$Pclass))

Median.Pclass <- median(Titanic$Pclass, na.rm = TRUE)

table(is.na(Titanic$Pclass))




In [None]:
# FEATURE ENGINEERING
# Passenger title

head(Titanic$Name)


# Extract the title from the name: the text between ", " and the next ".".

Titanic$Title <- gsub("^.*, (.*?)\\..*$", "\\1", Titanic$Name)


# Frequency of each title by sex

table(Titanic$Sex, Titanic$Title)

# CREATING CATEGORICAL FEATURES AND CASTING
# Family size: siblings/spouses + parents/children + the passenger themself.

Titanic$FamilySize <- Titanic$SibSp + Titanic$Parch + 1

head(Titanic$FamilySize)
str(Titanic$FamilySize)
table(Titanic$FamilySize)
which(is.na(Titanic$FamilySize))
summary(Titanic$FamilySize)

# Bucket the counts: 1 -> "Single", 2 to 4 -> "Small", above 4 -> "Large".
# ifelse() is vectorised, so the whole column is converted in one call
# instead of looping over row indices.
Titanic$FamilySize <- ifelse(
  Titanic$FamilySize == 1, "Single",
  ifelse(Titanic$FamilySize > 4, "Large", "Small")
)

table(Titanic$Survived)

str(Titanic)


# Cast the categorical columns to factors (one conversion per column).

Titanic$Survived <- factor(Titanic$Survived)

Titanic$Pclass <- factor(Titanic$Pclass)

Titanic$Sex <- factor(Titanic$Sex)

Titanic$Embarked <- factor(Titanic$Embarked)

# Explicit level order so the categories sort Single < Small < Large.
Titanic$FamilySize <- factor(Titanic$FamilySize,
                             levels = c("Single", "Small", "Large"))

str(Titanic)


In [None]:
head(Titanic)

Titanic.train$Survived <- as.factor(Titanic.train$Survived)

# FamilySize is deliberately NOT recomputed as SibSp + Parch + 1 here. Doing
# so would overwrite the Single/Small/Large factor built during feature
# engineering with the raw count, which is an exact linear combination of
# SibSp and Parch and therefore redundant in models that include all three.

str(Titanic)

head(Titanic)

summary(Titanic)

# Column names of the combined data

names(Titanic)

summary(Titanic$Age)

# CATEGORICAL CASTING OF THREE ATTRIBUTES
# as.factor() leaves columns that are already factors unchanged.

Titanic$Pclass <- as.factor(Titanic$Pclass)


Titanic$Sex <- as.factor(Titanic$Sex)


Titanic$Embarked <- as.factor(Titanic$Embarked)


str(Titanic)


In [None]:
# SPLITTING THE FULL TITANIC DATA BACK INTO THE TRAIN AND TEST DATASETS

Titanic.train <- Titanic[Titanic$IsTrainset == TRUE, ]

Titanic.test <- Titanic[Titanic$IsTrainset == FALSE, ]

str(Titanic.train)
str(Titanic.test)

summary(Titanic.train)
summary(Titanic.test)


# GRAPHICAL VISUALIZATION AND EXPLORATION OF TRAIN AND TEST DATASETS

names(Titanic.train)

names(Titanic.test)

# Install ggplot2 only when it is missing instead of re-installing on every run.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

library(ggplot2)

# barplot() arguments are spelled out in full (names.arg, legend.text) rather
# than relying on partial argument matching.

barplot(table(Titanic.train$Sex), col = c("orange", "lightgreen"),
        names.arg = c("Female", "Male"), main = "Breakdown by Gender")

barplot(table(Titanic.train$Embarked), col = c("green", "blue", "orange"),
        names.arg = c("Cherbourg", "Queenstown", "Southampton"),
        main = "Port of Embarkation")

# Sex (stacked bars) within each passenger class.
countsTable <- table(Titanic$Sex, Titanic$Pclass)
barplot(countsTable, col = c("Red", "Green"),
        legend.text = c("Female", "Male"),
        names.arg = c("First", "Second", "Third"),
        main = "Breakdown of Passengers by Sex for each three Classes")


hist(Titanic$Age, main = "Passenger Age Distribution", xlab = "Age")


# Survived is NA for the test rows, so the tables below count labelled rows only.
countsTable <- table(Titanic$Survived)
barplot(countsTable, col = c("red", "green"),
        names.arg = c("Perished", "Survived"),
        legend.text = c("Perished", "Survived"),
        main = "Breakdown of Titanic Passengers who Survived or Perished")


countsTable <- table(Titanic$Survived, Titanic$Age)
barplot(countsTable, col = c("red", "green"),
        legend.text = c("Perished", "Survived"),
        main = "Passenger Fate by Age")


countsTable <- table(Titanic$Survived, Titanic$Sex)
barplot(countsTable, col = c("red", "green"),
        legend.text = c("Perished", "Survived"),
        names.arg = c("Female", "Male"), main = "Passenger fate by Sex")


mosaicplot(Titanic$Sex ~ Titanic$Survived,
           main = "Passenger fate by Sex", shade = FALSE,
           color = c("red", "green"),
           xlab = "Sex", ylab = "Survived")


mosaicplot(Titanic$Pclass ~ Titanic$Survived,
           main = "Passenger fate by Pclass", shade = FALSE,
           color = c("red", "green"),
           xlab = "Pclass", ylab = "Survived")


countsTable <- table(Titanic$Survived, Titanic$Pclass)
barplot(countsTable, col = c("red", "green"),
        legend.text = c("Perished", "Survived"),
        names.arg = c("First", "Second", "Third"),
        main = "Passenger fate by Class")


# Additional Graphic Visualization and Confirmation of dataset



In [None]:
# Age histogram of the training passengers, split by sex: 5-year bins drawn
# side by side, x axis limited to 0-100 and y axis limited to 0-200.
ggplot(data = Titanic.train, mapping = aes(x = Age, fill = Sex)) +
  geom_histogram(
    binwidth = 5,
    colour = "black",
    position = "dodge",
    alpha = 0.6
  ) +
  scale_x_continuous(
    limits = c(0, 100),
    breaks = seq(from = 0, to = 100, by = 5)
  ) +
  scale_y_continuous(
    limits = c(0, 200),
    breaks = seq(from = 0, to = 200, by = 50)
  ) +
  ggtitle("Histogram to Diplay Distibution of the Age of the passengers")



In [None]:
# Age histogram by sex (5-year bins, black-and-white theme) with a dashed
# vertical line at the mean age.
ggplot(Titanic.train, aes(Age, fill = Sex)) +
  geom_histogram(binwidth = 5, colour = "black", position = "dodge",
                 alpha = 0.3) +
  scale_x_continuous(limits = c(0, 100), breaks = seq(0, 100, 5)) +
  scale_y_continuous(limits = c(0, 200), breaks = seq(0, 200, 50)) +
  theme_bw() +
  # na.rm keeps the mean line defined even if some ages are missing.
  geom_vline(aes(xintercept = mean(Age, na.rm = TRUE)),
             color = "blue", linetype = "dashed", size = 1) +
  # The title must be chained with `+`; as a separate statement it is never
  # attached to the plot.
  ggtitle("Histogram to Diplay Distibution of the Age of the passengers")


In [None]:
# Visualization and confirmation of the dataset:
# numeric copy of the training data for correlation plots.

corrgram.traindata <- Titanic.train

# Convert via character so the numeric values are the original labels
# (0/1 and 1/2/3). as.numeric() applied directly to a factor returns the
# internal level codes instead (e.g. 1/2 for a 0/1 factor).
corrgram.traindata$Survived <-
  as.numeric(as.character(corrgram.traindata$Survived))

corrgram.traindata$Pclass <-
  as.numeric(as.character(corrgram.traindata$Pclass))


# Install corrgram only when it is missing.
if (!requireNamespace("corrgram", quietly = TRUE)) {
  install.packages("corrgram")
}

library(corrgram)

corrgram.traindata$Pclass



In [None]:
# Correlation packages: install only when missing, then load.

if (!requireNamespace("corrgram", quietly = TRUE)) {
  install.packages("corrgram")
}

library(corrgram)

if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc")
}

library(Hmisc)

# Pairwise correlations (with p-values) from Hmisc::rcorr().
# NOTE(review): Pclass, Survived and Sex may already be factors at this point;
# confirm that correlating their category codes is what is intended.

rcorr(Titanic$Age, Titanic$Pclass)

rcorr(Titanic$Survived, Titanic$Fare)

rcorr(Titanic$Survived, Titanic$Sex)


In [None]:
# Generate correlogram

# Install corrgram only when it is missing.
if (!requireNamespace("corrgram", quietly = TRUE)) {
  install.packages("corrgram")
}

library(corrgram)

head(Titanic)

# Columns of interest for the correlogram.
# NOTE(review): no corrgram() call is visible in this cell; confirm whether
# the plotting step was lost.
vars <- c("Survived", "Pclass", "Sex", "Age", "Fare", "Embarked")

names(Titanic.train)
# The original line was cut off after "Titanic."; completed to list the test
# columns next to the train columns.
names(Titanic.test)


In [None]:
# Scatterplot matrix (pairs) of Survived against selected variables of the
# Titanic dataset.

pairs(Survived ~ Age + Pclass + SibSp + Parch + Fare, data = Titanic)

# ---------------------------------------------------------------------------
# (The separator above replaces a bare run of X characters, which R would
# evaluate as an undefined object and stop with an error.)

# SPLITTING THE TITANIC DATASET BACK INTO THE ORIGINAL TEST AND TRAIN DATASETS

Titanic.train <- Titanic[Titanic$IsTrainset == TRUE, ]

Titanic.test <- Titanic[Titanic$IsTrainset == FALSE, ]

str(Titanic.test)

summary(Titanic.test)

str(Titanic.train)

summary(Titanic.train)


In [None]:
# DATA PREPARATION BEFORE MODELLING
# Impute missing Age values with the median age.

table(is.na(Titanic$Age))

Median.Age <- median(Titanic$Age, na.rm = TRUE)

Titanic[is.na(Titanic$Age), "Age"] <- Median.Age

table(is.na(Titanic$Age))

# Impute missing Fare values with the median fare.

table(is.na(Titanic$Fare))

Median.Fare <- median(Titanic$Fare, na.rm = TRUE)

Titanic[is.na(Titanic$Fare), "Fare"] <- Median.Fare

colSums(is.na(Titanic) | Titanic == "")

table(is.na(Titanic$Fare))


# Check for missing Pclass values (nothing is imputed here).

table(is.na(Titanic$Pclass))

# Fare outliers

boxplot(Titanic$Fare)

boxplot.stats(Titanic$Fare)

# stats[5] is the end of the upper whisker, i.e. the largest fare that is NOT
# classed as an outlier.
Upper.Whisker <- boxplot.stats(Titanic$Fare)$stats[5]

# Keep every fare up to and including the whisker end. A strict `<` would
# also drop the non-outlier rows whose fare equals the whisker value.
Outlier.filter <- Titanic$Fare <= Upper.Whisker

# Preview of the rows kept by the filter; `Titanic` itself is not modified.
Titanic[Outlier.filter, ]



In [None]:
# Regression-based imputation of missing Fare values.
# A linear model for Fare is fitted on the rows selected by Outlier.filter and
# used to predict Fare for any row where it is still missing.

Fare.Equation <- "Fare ~ Pclass+Sex+Age+SibSp+Parch+Embarked"

Fare.Model <- lm(formula = as.formula(Fare.Equation),
                 data = Titanic[Outlier.filter, ])

# Rows whose Fare is still missing, with the predictor columns only.
Missing.Fare <- is.na(Titanic$Fare)

Fare.Row <- Titanic[Missing.Fare,
                    c("Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked")]

# Only predict when there is something to fill. If Fare has already been
# imputed (for example with the median) Fare.Row is empty and this is a no-op.
if (nrow(Fare.Row) > 0) {
  Fare.Predictions <- predict(Fare.Model, newdata = Fare.Row)
  Titanic$Fare[Missing.Fare] <- Fare.Predictions
} else {
  Fare.Predictions <- numeric(0)
}

head(Titanic)
summary(Titanic)

# CATEGORICAL CASTING

Titanic$Pclass <- as.factor(Titanic$Pclass)

Titanic$Sex <- as.factor(Titanic$Sex)

Titanic$Embarked <- as.factor(Titanic$Embarked)

str(Titanic)

# Install randomForest only when it is missing, then load it.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)


In [None]:
# BUILDING PREDICTION MODELS
# LOGISTIC REGRESSION MODEL

# Explicit predictors instead of `Survived ~ .`: the dot would pull in every
# remaining column, including identifier and free-text columns such as
# PassengerId, Name, Ticket and Cabin, whose values in the test data were
# never seen during fitting.
Titanic.LogisticModel <- glm(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
  family = binomial,
  data = Titanic.train
)

summary(Titanic.LogisticModel)

# Predicted survival probabilities for the TEST passengers. The argument must
# be spelled `newdata`; any other name is silently ignored and the fitted
# values of the training data are returned instead.
Prob.Prediction <- predict(Titanic.LogisticModel, newdata = Titanic.test,
                           type = "response")
Prediction <- ifelse(Prob.Prediction > 0.5, 1, 0)
Prediction
table(Prediction)

# CHECKING PREDICTION ACCURACY
# The test rows carry no Survived labels, so accuracy is measured on the rows
# the model was fitted on. `$y` holds the 0/1 response of exactly those rows.

Train.Actual <- Titanic.LogisticModel$y
Train.Prediction <- ifelse(fitted(Titanic.LogisticModel) > 0.5, 1, 0)

Confusion.Matrix <- table(Actual = Train.Actual, Predicted = Train.Prediction)
Confusion.Matrix

# Share of correctly classified training rows.
Accuracy <- round(mean(Train.Prediction == Train.Actual), 4)
Accuracy

# Misclassification error is the complement of the accuracy.
Missclassification.Error <- round(1 - Accuracy, 4)
Missclassification.Error

# Using anova() to analyze the table of deviance

anova(Titanic.LogisticModel, test = "Chisq")

# Confusion Matrix and Statistics

library(caret)


In [None]:
# LOGISTIC REGRESSION MODEL ADDITIONS
# Correlation matrix of the numeric features

Numeric.Columns <- vapply(Titanic.train, is.numeric, logical(1))
cor(Titanic.train[, Numeric.Columns, drop = FALSE])


# Fit on complete rows only: step() stops with an error if dropping a term
# changes the number of rows in use, which happens when a predictor has NAs.
Model.Columns <- c("Survived", "Pclass", "SibSp", "Sex", "Parch", "Age",
                   "Embarked", "Fare", "FamilySize")
Model.Data <- na.omit(Titanic.train[, Model.Columns])

# Each predictor is listed once (the original formula repeated Parch).
classifier <- glm(
  Survived ~ Pclass + SibSp + Sex + Parch + Age + Embarked + Fare + FamilySize,
  family = binomial,
  data = Model.Data
)

# Stepwise model selection by AIC.
classifier <- step(classifier)

summary(classifier)


# Predicting the test set results.
# predict() takes new observations through `newdata`; a `data` argument is
# ignored and the training-set fitted values would be returned instead.

Prob.Predictions <- predict(classifier, newdata = Titanic.test,
                            type = "response")

Predictions <- ifelse(Prob.Predictions > 0.5, 1, 0)
Predictions
summary(Predictions)
table(Predictions)


In [None]:
# TITANIC SURVIVAL USING THE DECISION TREE MODEL

str(Titanic.train)
str(Titanic.test)
table(Titanic.train$Survived)
prop.table(table(Titanic.train$Survived))
prop.table(table(Titanic.train$Sex, Titanic.train$Survived), 1)

# Decision tree for the Titanic
# Fit a classification tree on the training set. (A logistic model that was
# fitted here and immediately overwritten by the tree has been removed.)

classifier <- rpart(
  Survived ~ Pclass + SibSp + Sex + Parch + Age + Embarked + Fare + FamilySize,
  data = Titanic.train,
  method = "class"
)

# Tree visualization

rpart.plot(classifier, extra = 4)

# Complexity-parameter plot of the tree fitted above (the original call
# referenced `Tree.Model`, which is never defined in this file).
plotcp(classifier)

# New column Child: 1 when Age is below 18, 0 otherwise, NA when Age is NA.

Titanic.train$Child <- as.numeric(Titanic.train$Age < 18)

# Packages for exploring decision trees: install only when missing.

if (!requireNamespace("rpart", quietly = TRUE)) {
  install.packages("rpart")
}
library(rpart)

if (!requireNamespace("rattle", quietly = TRUE)) {
  install.packages("rattle")
}
library(rattle)

if (!requireNamespace("RColorBrewer", quietly = TRUE)) {
  install.packages("RColorBrewer")
}
library(RColorBrewer)


In [None]:
# BUILDING THE DECISION TREE MODEL FOR TITANIC
# The same classification tree specification is fitted twice and kept under
# two names: `classifier` and `Titanic.DecisionTree`.

classifier <- rpart(
  Survived ~ Pclass + SibSp + Sex + Parch + Age + Embarked + Parch + Fare,
  data = Titanic.train,
  method = "class"
)
Titanic.DecisionTree <- rpart(
  Survived ~ Pclass + SibSp + Sex + Parch + Age + Embarked + Parch + Fare,
  data = Titanic.train,
  method = "class"
)

# Draw both trees: the fancy plot first, then the plain rpart.plot version.

fancyRpartPlot(Titanic.DecisionTree, extra = 4)
rpart.plot(classifier, extra = 4)

# Predict the survival class of every test passenger.

Decisiontree.Predictions <- predict(
  Titanic.DecisionTree,
  newdata = Titanic.test,
  type = "class"
)

# Pair each test PassengerId with its predicted class.
Decision.Solution <- data.frame(
  PassengerId = Titanic.test$PassengerId,
  Survived = Decisiontree.Predictions
)

nrow(Decision.Solution)


# A third tree grown with custom controls: minsplit = 50 and cp = 0.
Mydecisontree.Three <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
  data = Titanic.train,
  method = "class",
  control = rpart.control(minsplit = 50, cp = 0)
)


# Fancy Decision Tree Plot



In [None]:
# Install randomForest only when it is missing, then load it.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)


# Fitting a random forest classifier to the training set


str(Titanic.test)
str(Titanic.train)
set.seed(432) # Fixed seed so the forest is reproducible.

# Changes from the original call:
# - `method = 'class'` removed: it is an rpart argument, not a randomForest one.
# - `na.action = na.omit` added: the default (na.fail) aborts the fit as soon
#   as any predictor contains an NA.
# - each predictor is listed once (Parch was repeated).
classifier <- randomForest(
  Survived ~ Pclass + SibSp + Sex + Parch + Age + Embarked + Fare,
  data = Titanic.train,
  na.action = na.omit
)

plot(classifier)


# Predicting the test set results.
# Select the predictor columns by name. Negative indexing with which() would
# select zero columns if "Survived" were ever absent.

Predictor.Columns <- setdiff(names(Titanic.test), "Survived")

Predictions <- predict(
  classifier,
  newdata = Titanic.test[, Predictor.Columns, drop = FALSE]
)
Predictions
tail(Predictions)
table(Predictions)


#


In [None]:
# RANDOM FOREST MODEL ADDITIONS

str(Titanic)
summary(Titanic)

# Since many passengers embarked at Southampton, rows with a missing port get
# the value "S". The rows are located with is.na() rather than hard-coded row
# numbers, so the fill still works if the row order changes.

Titanic$Embarked[is.na(Titanic$Embarked)] <- "S"

# Factorize embarkment codes.

Titanic$Embarked <- factor(Titanic$Embarked)

# Fill any missing Fare with the median fare (again located with is.na()).

Titanic$Fare[is.na(Titanic$Fare)] <- median(Titanic$Fare, na.rm = TRUE)

# Fill missing Age values with a regression tree fitted on the rows whose
# Age is known.

Age.Missing <- is.na(Titanic$Age)

Age.Predictions <- rpart(
  Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + FamilySize,
  data = Titanic[!Age.Missing, ],
  method = "anova"
)

summary(Age.Predictions)

# Only predict when at least one Age is missing; when Age has already been
# imputed there is nothing to fill.
if (any(Age.Missing)) {
  Titanic$Age[Age.Missing] <- predict(Age.Predictions, Titanic[Age.Missing, ])
}

# Install randomForest only when it is missing, then load it.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)

# Titanic train dataset and test dataset
# NOTE(review): Titanic.train / Titanic.test are not rebuilt from `Titanic`
# in this cell, so they do not contain the fills made above; confirm whether
# a re-split is intended before fitting the final model.

str(Titanic.train)
str(Titanic.test)

# Set seed for processing the random forest model

set.seed(111)

# Apply the Random Forest Algorithm

# My Randomforest.Titanic <- randomForest(as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked, data=Titanic.train, importance=TRUE,ntree=1000)

