<a href="https://colab.research.google.com/github/Slimani-CE/titanic_machine_learning/blob/main/Predict_survival_on_the_Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the project repository; the loading cell below reads train.csv / test.csv from disk.
# NOTE(review): git clone creates ./titanic_machine_learning/, while the loading cell
# reads 'train.csv' from the working directory — confirm the paths line up.
!git clone https://github.com/Slimani-CE/titanic_machine_learning

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

## Importing the Dataset

In [None]:
# Read the labelled training file and the competition test file.
dataset, testset = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# Keep the passenger ids aside: the column is dropped from the features later,
# but it is needed again when the submission files are built.
test_ids = testset.loc[:, 'PassengerId']

In [None]:
# Column dtypes and non-null counts of the freshly loaded test set.
testset.info()

In [None]:
# Preview the first rows only instead of rendering the whole training frame.
dataset.head()

In [None]:
# (rows, columns) of the training frame.
dataset.shape

## Data analysis

In [None]:
# Partition the columns by dtype: object-typed columns count as categorical,
# every other column as numerical.
all_cols = list(dataset.columns)
cat_cols = list(dataset.select_dtypes(include = ['object']).columns)
categorical = set(cat_cols)
num_cols = [name for name in all_cols if name not in categorical]

for label, names in (
    ('All columns        ', all_cols),
    ('Categorical columns', cat_cols),
    ('Numerical Columns  ', num_cols),
):
    print(f'{label}: {names}')

In [None]:
# Optional (disabled): pairwise feature plots coloured by the 'Survived' label.
# sb.pairplot(data = dataset, hue = 'Survived')

In [None]:
# Optional (disabled): annotated correlation heatmap.
# NOTE(review): dataset still holds text columns at this point; recent pandas needs
# dataset.corr(numeric_only = True) — confirm before re-enabling.
# sb.heatmap(dataset.corr() ,annot = True)

### Number of available values and classes in each column

In [None]:
# For every column report its length, how many entries are present (non-null)
# and how many distinct non-null values ("classes") it contains.
for col in all_cols:
    column = dataset[col]
    total_rows = len(column)
    present = column.count()
    # nunique() ignores NaN, so missing entries are not counted as a class.
    distinct = column.nunique()
    print(f'Column name : {col:15} | Column length: {total_rows} | Available values : {present} | Number of classes : {distinct}')

In [None]:
# Number of passengers for each value of the 'Parch' column
dataset['Parch'].value_counts()

In [None]:
# Columns removed from both the training and the test frame in the preprocessing section.
cols_to_drop = ['Name', 'PassengerId', 'Ticket', 'Cabin', 'Embarked']

## Data preprocessing

### Drop columns

In [None]:
# Show which columns are about to be removed.
print(f'Columns to drop: {cols_to_drop}')

In [None]:
# Remove the columns listed in `cols_to_drop` from both frames
# (`columns =` already selects the column axis, so no `axis` argument is needed).
dataset = dataset.drop(columns = cols_to_drop)
testset = testset.drop(columns = cols_to_drop)

### Taking care of missing values

In [None]:
# Non-null counts of the training frame after the column drop, to locate missing values.
dataset.info()

In [None]:
# Non-null counts of the test frame after the column drop, to locate missing values.
testset.info()

In [None]:
# Missing-value handling: gaps in the numeric columns are filled by linear
# interpolation between neighbouring rows (rows are kept, not dropped).

# NOTE(review): `mean` (the average of the two per-frame Age means) is not used by
# any later cell in this notebook.
mean = (dataset['Age'].mean() + testset['Age'].mean()) / 2
# The two most frequent Age values of the training data (displayed in the next cell).
most_repeated_values = dataset['Age'].value_counts().index.tolist()[:2]

# Interpolate the numeric columns only: the frames still contain object columns
# (e.g. 'Sex'), which cannot be interpolated and make DataFrame.interpolate warn or
# raise on recent pandas versions. limit_direction = 'both' additionally fills gaps
# at the very start of a column, which the default forward-only fill leaves as NaN.
for frame in (dataset, testset):
    numeric_cols = frame.select_dtypes(include = 'number').columns
    frame[numeric_cols] = frame[numeric_cols].interpolate(method = 'linear', limit_direction = 'both')

In [None]:
# The two most frequent Age values computed in the previous cell.
most_repeated_values

In [None]:
# Re-check the training frame's non-null counts after interpolation.
dataset.info()

In [None]:
# Re-check the test frame's non-null counts after interpolation.
testset.info()

### Encoding categorical values

In [None]:

# One-hot encode the remaining categorical columns of both frames.
dataset = pd.get_dummies(data = dataset)
testset = pd.get_dummies(data = testset)

# Encoding the two frames separately can produce different dummy columns when a
# category occurs in only one of them. Align the test features to the training
# features (same columns, same order) so scalers and models fitted on the training
# data can be applied to `testset`; dummies absent from the test set are filled with 0.
feature_cols = [col for col in dataset.columns if col != 'Survived']
testset = testset.reindex(columns = feature_cols, fill_value = 0)

In [None]:
# First rows of the encoded training frame.
dataset.head()

### Split feature matrix and target column

In [None]:
# Target vector: the 'Survived' label; feature matrix: every other column.
y = dataset['Survived']
X = dataset.drop(columns = ['Survived'])

## Logistic regression

### Splitting data into training and test set

In [None]:
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    stratify = y,
    random_state = 42,
)

### Feature scaling

In [None]:
# Learn the scaling statistics from the training split only, then apply them to
# both splits (X_train / X_test are replaced by their scaled arrays).
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

### Training the Logistic Regression model on the Training set

In [None]:
# Fit a logistic-regression classifier on the scaled training split.
classifier = LogisticRegression(random_state = 42)
classifier.fit(X_train, y_train)

### Evaluate the model on the test set

Making predictions

In [None]:
# Predicted labels for the held-out split.
y_pred = classifier.predict(X_test)

Making the Confusion Matrix

In [None]:
# Confusion matrix with true labels as rows and predicted labels as columns.
cm = confusion_matrix(y_test, y_pred)
print(cm)

Calculating the score

In [None]:
# Accuracy on the held-out split. accuracy_score expects (y_true, y_pred); the
# arguments are passed in that order, matching the other model sections.
score = accuracy_score(y_test, y_pred)
print(f'score : {score}')

### Export predictions

Train the model on the whole dataset

In [None]:
# Refit the scaler and the classifier on every labelled row for the submission.
# Note: `X_train` is reused here to hold the scaled full feature matrix.
scaler = StandardScaler().fit(X)
X_train = scaler.transform(X)

classifier = LogisticRegression(random_state = 42)
classifier.fit(X_train, y)

In [None]:
# Scale the competition test features with the scaler fitted on the full training
# features in the previous cell, then predict.
submission_preds = classifier.predict(scaler.transform(testset))

In [None]:
# Pair each saved passenger id with its predicted label.
df = pd.DataFrame({"PassengerId": test_ids.values, "Survived": submission_preds})

In [None]:
# Write the submission file without the DataFrame index column.
df.to_csv("sub_preds_logistic.csv", index = False)

## RandomForest

### Splitting data into training and test set

In [None]:
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    stratify = y,
    random_state = 42,
)

### Feature scaling

In [None]:
# Fit the scaler on the training split only and apply it to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Calculate the best parameters for Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid searched exhaustively (every combination is evaluated).
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [3],
    'min_samples_leaf': [1, 2, 3]
}

# Seed the forest: without a fixed random_state every run grows different trees,
# so the reported best parameters could change from one run to the next.
rf = RandomForestClassifier(random_state = 42)

# 5-fold cross-validated grid search over param_grid.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)

# Search on the (scaled) training split only; the hold-out split stays unseen.
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(grid_search.best_params_)

### Training the model

In [None]:
# Fit the forest on the scaled training split.
# NOTE(review): the hyper-parameters are hard-coded rather than read from
# grid_search.best_params_, and criterion = 'entropy' was not part of the searched
# grid — confirm they still match the search result.
classifier = RandomForestClassifier(
    n_estimators = 30,
    criterion = 'entropy',
    max_depth = None,
    min_samples_split = 3,
    min_samples_leaf = 2,
    random_state = 42,
)
classifier.fit(X_train, y_train)

### Evaluate the model on the test set

Making predictions

In [None]:
# Predicted labels for the held-out split.
y_pred = classifier.predict(X_test)

Making the Confusion Matrix

In [None]:
# Confusion matrix with true labels as rows and predicted labels as columns.
cm = confusion_matrix(y_test, y_pred)
print(cm)

Calculating the score

In [None]:
# Accuracy on the held-out split.
score = accuracy_score(y_test, y_pred)
print(f'score : {score}')

### Export predictions

Train the model on the whole dataset

In [None]:
# Refit on every labelled row with the same hyper-parameters as the forest that was
# evaluated on the hold-out split, so the reported score describes the model that
# produces the submission. The forest is fitted on the unscaled feature frame X.
classifier = RandomForestClassifier(max_depth = None, min_samples_leaf = 2, min_samples_split = 3, n_estimators = 30, criterion = 'entropy', random_state = 42)
classifier.fit(X, y)

Predict submission test

In [None]:
# The final forest was fitted on the unscaled feature frame X, so the test features
# must be passed unscaled as well; standardising them here would put them on a
# different scale from the thresholds the trees learned.
submission_preds = classifier.predict(testset)

Create submission test dataframe

In [None]:
# Pair each saved passenger id with its predicted label.
df = pd.DataFrame({"PassengerId": test_ids.values, "Survived": submission_preds})

Export submission test file

In [None]:
# Write the submission file without the DataFrame index column.
df.to_csv("sub_preds_random_forest.csv", index = False)

## SVM

### Splitting data into training and test set

In [None]:
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    stratify = y,
    random_state = 42,
)

### Feature scaling

In [None]:
# Fit the scaler on the training split only and apply it to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Training the model

In [None]:
from sklearn.svm import SVC
# Support-vector classifier with an RBF kernel, fitted on the scaled training split.
classifier = SVC(kernel = 'rbf', random_state = 42)
classifier.fit(X_train, y_train)

### Evaluate the model on the test set

Making predictions

In [None]:
# Predicted labels for the held-out split.
y_pred = classifier.predict(X_test)

Making the Confusion Matrix

In [None]:
# Confusion matrix with true labels as rows and predicted labels as columns.
cm = confusion_matrix(y_test, y_pred)
print(cm)

Calculating the score

In [None]:
# Accuracy on the held-out split.
score = accuracy_score(y_test, y_pred)
print(f'score : {score}')

### Export predictions

Train the model on the whole dataset

In [None]:
# The submission cell predicts on `scaler.transform(testset)`, so the classifier has
# to be fitted on features standardised by that same `scaler`; fitting on the raw X
# would train and predict on different scales. Refit the scaler on the full feature
# matrix here so this section does not depend on state left by an earlier section.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

classifier = SVC(kernel = 'rbf', random_state = 42)
classifier.fit(X_scaled, y)

Predict submission test

In [None]:
# Standardise the test features with `scaler` before predicting.
# NOTE(review): this is only valid if `classifier` was fitted on features scaled by
# the same `scaler` — verify against the fitting cell.
submission_preds = classifier.predict(scaler.transform(testset))

Create submission test dataframe

In [None]:
# Pair each saved passenger id with its predicted label.
df = pd.DataFrame({"PassengerId": test_ids.values, "Survived": submission_preds})

Export submission test file

In [None]:
# Write the submission file without the DataFrame index column.
df.to_csv("sub_preds_svc.csv", index = False)

## XGBoost

### Splitting data into training and test set

In [None]:
# Stratified 80/20 hold-out split with a fixed seed (no feature scaling in this section).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    stratify = y,
    random_state = 42,
)

### Training the model

In [None]:
from xgboost import XGBClassifier
# Gradient-boosted classifier with library-default settings, fitted on the
# unscaled training split.
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

### Evaluate the model on the test set

Making predictions

In [None]:
# Predicted labels for the held-out split.
y_pred = classifier.predict(X_test)

Making the Confusion Matrix

In [None]:
# Confusion matrix with true labels as rows and predicted labels as columns.
cm = confusion_matrix(y_test, y_pred)
print(cm)

Calculating the score

In [None]:
# Accuracy on the held-out split.
score = accuracy_score(y_test, y_pred)
print(f'score : {score}')

### Export predictions

Train the model on the whole dataset

In [None]:
# Refit a fresh default XGBClassifier on every labelled row (unscaled features).
classifier = XGBClassifier()
classifier.fit(X, y)

Predict submission test

In [None]:
# The model was fitted on unscaled X, so the test features are passed unscaled too.
submission_preds = classifier.predict(testset)

Create submission test dataframe

In [None]:
# Pair each saved passenger id with its predicted label.
df = pd.DataFrame({"PassengerId": test_ids.values, "Survived": submission_preds})

Export submission test file

In [None]:
# Write the submission file without the DataFrame index column.
df.to_csv("sub_preds_xgboost.csv", index = False)

## Neural Networks

### Splitting data into training and test set

In [None]:
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    stratify = y,
    random_state = 42,
)

### Feature scaling

In [None]:
# Learn the scaling statistics from the training split only, then apply them to
# both splits (X_train / X_test are replaced by their scaled arrays).
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

### Training the model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# Seed TensorFlow's global random generator before the layers are created, so the
# weight initialisation does not differ from run to run.
tf.random.set_seed(42)

# Small fully connected network: two sigmoid hidden layers (6 and 3 units) and a
# single linear output unit. The output is a logit, not a probability.
model = Sequential([
    Dense(units = 6, activation = 'sigmoid'),
    Dense(units = 3, activation = 'sigmoid'),
    Dense(units = 1 , activation = 'linear')
])

# from_logits = True makes the loss apply the sigmoid itself, matching the linear
# output layer above.
model.compile(optimizer = Adam(learning_rate = 1e-1), loss = BinaryCrossentropy(from_logits = True))

# Training length and mini-batch size; reused when the model is trained again on
# the full dataset further down.
epochs = 1500
batch_size = 50

# Train on the scaled training split.
model.fit(X_train, y_train, epochs = epochs, batch_size = batch_size)

### Evaluate the model on the test set

Making predictions

In [None]:
# The model outputs logits: convert them to probabilities with a sigmoid and
# threshold at 0.5 to obtain boolean predictions for the held-out split.
y_pred = tf.nn.sigmoid(model(X_test)) > 0.5

Making the Confusion Matrix

In [None]:
# Confusion matrix with true labels as rows and predicted labels as columns.
cm = confusion_matrix(y_test, y_pred)
print(cm)

Calculating the score

In [None]:
# Accuracy on the held-out split.
score = accuracy_score(y_test, y_pred)
print(f'score : {score}')

### Export predictions

Train the model on the whole dataset

In [None]:
# Train the network on every labelled row.
# NOTE(review): X is passed unscaled here, whereas the first fit used the
# StandardScaler-transformed X_train; `model` is also not re-created, so these
# epochs continue from the already trained weights — confirm both are intended.
model.fit(X, y, epochs = epochs, batch_size = batch_size)

Predict submission test

In [None]:
# The network ends in a linear unit and is trained with from_logits = True, so
# model.predict returns logits, not probabilities. sigmoid(z) > 0.5 is equivalent
# to z > 0, so the decision threshold on the raw outputs is 0 (thresholding the
# logits at 0.5 would wrongly reject passengers with probability just above 0.5).
logits = model.predict(testset).reshape(-1,)
submission_preds = [1 if logit > 0 else 0 for logit in logits]

Create submission test dataframe

In [None]:
# Pair each saved passenger id with its predicted label.
df = pd.DataFrame({"PassengerId": test_ids.values, "Survived": submission_preds})

Export submission test file

In [None]:
# Write the submission file without the DataFrame index column.
df.to_csv("sub_preds_neural_networks.csv", index = False)