####  Data Acquisition and Exploration:

Download the Titanic dataset from Kaggle.

Explore the data using pandas to understand its structure, identify missing values, and analyze feature distributions.

In [2]:
import pandas as pd

titanic_data = pd.read_csv('tested.csv')
titanic_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,0,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,1,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,0,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,1,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,0,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [3]:
age_mean = titanic_data['Age'].mean()
fare_mean = titanic_data['Fare'].mean()
titanic_data['Age'].fillna(age_mean, inplace = True)
titanic_data['Fare'].fillna(fare_mean, inplace = True)


In [4]:
titanic_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,1100.5,0.363636,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.481622,0.841838,12.634534,0.89676,0.981429,55.8405
min,892.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,0.0,1.0,23.0,0.0,0.0,7.8958
50%,1100.5,0.0,3.0,30.27259,0.0,0.0,14.4542
75%,1204.75,1.0,3.0,35.75,1.0,0.0,31.5
max,1309.0,1.0,3.0,76.0,8.0,9.0,512.3292


In [5]:
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          418 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         418 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


#### Feature Engineering:

Go beyond basic features. Explore feature creation techniques:

Derive new features from existing ones (e.g., family size based on siblings/spouses).

Encode categorical features (e.g., sex, embarked port) into numerical representations suitable for machine learning models.

Consider feature scaling or normalization to improve model performance.

In [6]:
# Derive new features from existing ones (e.g., family size based on siblings/spouses).
titanic_data = titanic_data.assign(Family_Size = titanic_data['SibSp'] + titanic_data['Parch'])
titanic_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Size
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,2
5,897,0,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S,0
6,898,1,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q,0
7,899,0,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S,2
8,900,1,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C,0
9,901,0,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S,2


In [7]:
# Encode categorical features (e.g., sex, embarked port) into numerical representations suitable for machine learning models.
titanic_data['Sex'] = titanic_data['Sex'].map({'male': 0, 'female': 1, '': 0})
titanic_data['Sex']

0      0
1      1
2      0
3      0
4      1
      ..
413    0
414    1
415    0
416    0
417    0
Name: Sex, Length: 418, dtype: int64

In [8]:
titanic_data['Embarked'] = titanic_data['Embarked'].map({'S': 1, 'Q': 2, 'C': 3})
titanic_data['Embarked']

0      2
1      1
2      2
3      1
4      1
      ..
413    1
414    3
415    1
416    1
417    3
Name: Embarked, Length: 418, dtype: int64

In [9]:
 titanic_data.drop(['Cabin','Embarked'], axis=1, inplace = True)

In [10]:
titanic_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Family_Size
0,892,0,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,0
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,1
2,894,0,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,0
3,895,0,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,0
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,2
5,897,0,3,"Svensson, Mr. Johan Cervin",0,14.0,0,0,7538,9.225,0
6,898,1,3,"Connolly, Miss. Kate",1,30.0,0,0,330972,7.6292,0
7,899,0,2,"Caldwell, Mr. Albert Francis",0,26.0,1,1,248738,29.0,2
8,900,1,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",1,18.0,0,0,2657,7.2292,0
9,901,0,3,"Davies, Mr. John Samuel",0,21.0,2,0,A/4 48871,24.15,2


#### Model Selection and Training:

Choose a classical machine learning model of your choice (e.g., Logistic Regression, Decision Tree, Random Forest).

Split the data into training and testing sets for unbiased evaluation.

Train the model on the training set, experimenting with different hyperparameters using techniques like GridSearchCV or RandomizedSearchCV.

In [38]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression


# Prepare data for model
X = titanic_data.drop(['PassengerId', 'Survived', 'Name', 'Ticket'], axis=1)
y = titanic_data['Survived']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [39]:
reg = LogisticRegression()
reg.fit(X_train, y_train)


### Hyperparameter Tuning:

Explore various hyperparameters specific to your chosen model.

[OPTIONAL] Utilize GridSearchCV or RandomizedSearchCV to efficiently evaluate different hyperparameter combinations and identify the optimal configuration for your model.

In [59]:
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 500, 1000]
}



In [60]:
grid_search = GridSearchCV(reg, param_grid, n_jobs = -1, cv = 10, scoring = 'accuracy')
grid_search.fit(X_train, y_train)


90 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' o

In [61]:
# print the best hyperparameters and the corresponding accuracy
print('Best Hyperparameters: ', grid_search.best_params_)

Best Hyperparameters:  {'C': 0.1, 'max_iter': 100, 'penalty': 'l2'}


In [62]:
print('Best Accuracy: ', grid_search.best_score_)

Best Accuracy:  1.0


In [63]:
y_pred = reg.predict(X_test)

### Evaluation and Comparison:

Evaluate your model's performance on the testing set using metrics like:

Accuracy: Proportion of correct predictions.

F1-score: Harmonic mean of precision and recall.

Precision: Ratio of true positives to all predicted positives.

Recall: Ratio of true positives to all actual positives.

In [64]:
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score

In [65]:
Accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", Accuracy)

Accuracy:  1.0


In [66]:
F1_Score = f1_score(y_test, y_pred)
print("F1 Score: ", F1_Score)

F1 Score:  1.0


In [67]:
Precision = precision_score(y_test, y_pred)
print("Precision: ", Precision)

Precision:  1.0


In [68]:
Recall = recall_score(y_test, y_pred)
print("Recall: ", Recall)

Recall:  1.0


In [71]:
survival_prediction = grid_search.predict(X_test)
survival_prediction

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1], dtype=int64)