<a href="https://colab.research.google.com/github/Slimani-CE/titanic_machine_learning/blob/main/Predict_survival_on_the_Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

## Importing the Dataset

In [None]:
# Read the labelled training data and the unlabelled test data from the
# working directory.
dataset, testset = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

# Keep the test ids aside: 'PassengerId' is dropped from the features later
# but is needed again to build the submission files.
test_ids = testset['PassengerId']

In [None]:
# Column names, non-null counts and dtypes of the raw test set.
testset.info()

In [None]:
# Preview the training frame as the cell's rich output.
dataset

In [None]:
# (number of rows, number of columns) of the training frame.
dataset.shape

## Data analysis

In [None]:
# Partition the column names by dtype: object-dtype columns are treated as
# categorical, every other column as numerical.
all_cols = list(dataset.columns)
cat_cols = list(dataset.select_dtypes(include = 'object').columns)
categorical_names = set(cat_cols)
num_cols = [name for name in all_cols if name not in categorical_names]

print(f'All columns        : {all_cols}')
print(f'Categorical columns: {cat_cols}')
print(f'Numerical Columns  : {num_cols}')

In [None]:
# Disabled exploratory plot: pairwise plots of the columns, coloured by 'Survived'.
# sb.pairplot(data = dataset, hue = 'Survived')

In [None]:
# Disabled exploratory plot: annotated heatmap of the column correlations.
# NOTE(review): if re-enabled on pandas >= 2.0 while the frame still holds
# object columns, dataset.corr() needs numeric_only=True - confirm before use.
# sb.heatmap(dataset.corr() ,annot = True)

### Number of available values and classes in each column

In [None]:
# For every column report its length, how many values are present (non-null)
# and how many distinct classes it holds. `unique()` lists the missing marker
# as one extra entry, so it is subtracted whenever the column has gaps.
total_rows = len(dataset)
for col in all_cols:
  column = dataset[col]
  present = column.count()
  has_missing = present != total_rows
  distinct = len(column.unique()) - (1 if has_missing else 0)
  print(f'Column name : {col:15} | Column length: {total_rows} | Available values : {present} | Number of classes : {distinct}')

In [None]:
# Number of rows for each distinct value of the 'Parch' column
dataset['Parch'].value_counts()

In [None]:
# Columns removed from both the training and the test frame in the
# "Drop columns" step below, so they never reach the models.
cols_to_drop = ['Name', 'PassengerId', 'Ticket', 'Cabin', 'Embarked']

## Data preprocessing

### Drop columns

In [None]:
# Remove the unused columns from both frames. `columns=` already selects the
# column axis, so no separate `axis` argument is needed.
dataset = dataset.drop(columns = cols_to_drop)
testset = testset.drop(columns = cols_to_drop)

In [None]:
# Re-inspect the test set after the column drop (remaining columns, non-null counts).
testset.info()

### Taking care of missing values

In [None]:
# Non-null counts per remaining training column, to see where values are missing.
dataset.info()

In [None]:
# Drop every training row that still contains a missing value. After the
# column drop above this is expected to remove only the rows without an 'Age'
# (TODO confirm against the dataset.info() output).
dataset = dataset.dropna()

# Imputation value for missing test ages: the unweighted average of the
# training-set mean age and the test-set mean age.
mean = (dataset['Age'].mean() + testset['Age'].mean()) / 2

# Test rows are filled rather than dropped, because the submission frame pairs
# every saved test id with a prediction: 'Age' gets the blended mean above,
# 'Fare' gets the test-set mean fare.
testset['Age'] = testset['Age'].fillna(mean)
testset['Fare'] = testset['Fare'].fillna(testset['Fare'].mean())

In [None]:
# Check the training frame's non-null counts after the missing-value handling.
dataset.info()

In [None]:
# Check the test frame's non-null counts after the missing-value handling.
testset.info()

### Encoding categorical values

In [None]:
# One-hot encode the remaining non-numeric columns of both frames
# (pd.get_dummies leaves the numeric columns untouched).
dataset, testset = (pd.get_dummies(frame) for frame in (dataset, testset))

### Change Pclass values 1 -> 3, 3 -> 1

In [None]:
# Swap the passenger-class codes 1 <-> 3 in both frames (2 is left unchanged).
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `frame['Pclass']`: that chained in-place form
# emits a FutureWarning on pandas 2.x and does not update the frame at all
# once Copy-on-Write is enabled.
# NOTE: the swap is its own inverse, so re-running this cell undoes it.
pclass_swap = {1: 3, 3: 1}
dataset['Pclass'] = dataset['Pclass'].replace(pclass_swap)
testset['Pclass'] = testset['Pclass'].replace(pclass_swap)

### Split feature matrix and target column

In [None]:
# Target vector: the 'Survived' label. Feature matrix: every other column.
y = dataset['Survived']
X = dataset.drop(columns = 'Survived')

## Logistic regression

### Splitting data into training and test set

In [None]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the label
# proportions the same in both parts, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    stratify = y,
    random_state = 42,
)

### Feature scaling

In [None]:
# Standardise the features. The scaler is fitted on the training part only and
# then applied unchanged to the held-out part.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Training the Logistic Regression model on the Training set

In [None]:
# Fit a logistic regression (fixed seed) on the scaled training part.
classifier = LogisticRegression(random_state = 42)
classifier.fit(X_train, y_train)

### Evaluate the model on the test set

Making predictions

In [None]:
# Predicted labels for the scaled held-out part.
y_pred = classifier.predict(X_test)

Making the Confusion Matrix

In [None]:
# Confusion matrix of true labels (first argument) against predictions.
cm = confusion_matrix(y_test, y_pred)
print(cm)

Calculating the score

In [None]:
# Accuracy on the held-out part. accuracy_score is documented as
# (y_true, y_pred), so the true labels are passed first.
score = accuracy_score(y_test, y_pred)
print(f'score : {score}')

### Export predictions

Train the model on the whole dataset

In [None]:
# Refit the scaler and the logistic regression on every labelled row before
# predicting the unlabelled test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X)

classifier = LogisticRegression(random_state = 42)
classifier.fit(X_train, y)

In [None]:
# Scale the test features with the scaler fitted on the full training data,
# then predict with the refitted classifier.
testset_scaled = scaler.transform(testset)
submission_preds = classifier.predict(testset_scaled)

In [None]:
# Pair every saved test id with its predicted label.
df = pd.DataFrame({
    "PassengerId": test_ids.to_numpy(),
    "Survived": submission_preds,
})

In [None]:
# Write the submission frame to CSV without the index column.
df.to_csv("sub_preds_logistic.csv", index = False)

## RandomForest

### Splitting data into training and test set

In [None]:
# Fresh 80/20 split for the random forest with a fixed seed. Unlike the
# logistic-regression split, this one is not stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

### Feature scaling

In [None]:
# Fit the scaler on the training part only, then apply it to both parts.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Training the model

In [None]:
# Fit a 20-tree random forest (entropy split criterion, fixed seed) on the
# scaled training part.
classifier = RandomForestClassifier(n_estimators = 20, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

### Evaluate the model on the test set

Making predictions

In [None]:
# Predicted labels for the scaled held-out part.
y_pred = classifier.predict(X_test)

Making the Confusion Matrix

In [None]:
# Confusion matrix of true labels against predictions; the accuracy on the
# last line is shown as the cell's output.
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

Calculating the score

In [None]:
# Accuracy on the held-out part. accuracy_score is documented as
# (y_true, y_pred), so the true labels are passed first, as in the cell above.
score = accuracy_score(y_test, y_pred)
print(f'score : {score}')

### Export predictions

Train the model on the whole dataset

In [None]:
# Refit on every labelled row before predicting the unlabelled test set.
# The scaler is refitted too, because the next cell transforms the test set
# with `scaler`.
scaler = StandardScaler()
scaler.fit(X)
X_train = scaler.transform(X)

# Use the same random-forest configuration that was evaluated above. The
# original cell refitted a LogisticRegression here, so the "random forest"
# submission file actually held logistic-regression predictions.
classifier = RandomForestClassifier(n_estimators = 20, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y)

In [None]:
# Scale the test features with the scaler fitted on the full training data,
# then predict with the classifier refitted in the previous cell.
testset_scaled = scaler.transform(testset)
submission_preds = classifier.predict(testset_scaled)

In [None]:
# Pair every saved test id with its predicted label.
df = pd.DataFrame({
    "PassengerId": test_ids.to_numpy(),
    "Survived": submission_preds,
})

In [None]:
# Write the submission frame to CSV without the index column.
df.to_csv("sub_preds_random_forest.csv", index = False)