In [3]:
import pandas as pd

# Load the train and test datasets
df_train = '../data/train.csv'
df_test = '../data/test.csv'

# Load the datasets
train_data = pd.read_csv(df_train)
test_data = pd.read_csv(df_test)

# Display the first few rows of the datasets to understand their structure
train_data.head(), test_data.head()

(   PassengerId  Survived  Pclass  \
 0            1         0       3   
 1            2         1       1   
 2            3         1       3   
 3            4         1       1   
 4            5         0       3   
 
                                                 Name     Sex   Age  SibSp  \
 0                            Braund, Mr. Owen Harris    male  22.0      1   
 1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
 2                             Heikkinen, Miss. Laina  female  26.0      0   
 3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
 4                           Allen, Mr. William Henry    male  35.0      0   
 
    Parch            Ticket     Fare Cabin Embarked  
 0      0         A/5 21171   7.2500   NaN        S  
 1      0          PC 17599  71.2833   C85        C  
 2      0  STON/O2. 3101282   7.9250   NaN        S  
 3      0            113803  53.1000  C123        S  
 4      0            373450   8.0500

In [4]:
# Data Preprocessing

# 1. Handle missing values
# Fill missing 'Age' with median, 'Fare' with median, and 'Embarked' with the mode
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
test_data['Age'].fillna(test_data['Age'].median(), inplace=True)

train_data['Fare'].fillna(train_data['Fare'].median(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)

train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
test_data['Embarked'].fillna(test_data['Embarked'].mode()[0], inplace=True)

# Drop 'Cabin' since it has many missing values
train_data.drop('Cabin', axis=1, inplace=True)
test_data.drop('Cabin', axis=1, inplace=True)

# 2. Convert categorical features into numerical ones
# Convert 'Sex' and 'Embarked' using one-hot encoding (pd.get_dummies)
train_data = pd.get_dummies(train_data, columns=['Sex', 'Embarked'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Sex', 'Embarked'], drop_first=True)

# 3. Drop unnecessary columns
# Drop 'Name' and 'Ticket' as they are non-numeric and less relevant for the prediction
train_data.drop(['Name', 'Ticket'], axis=1, inplace=True)
test_data.drop(['Name', 'Ticket'], axis=1, inplace=True)

# Prepare train and test data for ML
X_train = train_data.drop('Survived', axis=1)
y_train = train_data['Survived']
X_test = test_data

# Show the processed datasets
X_train.head(), X_test.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Age'].fillna(test_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate

(   PassengerId  Pclass   Age  SibSp  Parch     Fare  Sex_male  Embarked_Q  \
 0            1       3  22.0      1      0   7.2500      True       False   
 1            2       1  38.0      1      0  71.2833     False       False   
 2            3       3  26.0      0      0   7.9250     False       False   
 3            4       1  35.0      1      0  53.1000     False       False   
 4            5       3  35.0      0      0   8.0500      True       False   
 
    Embarked_S  
 0        True  
 1       False  
 2        True  
 3        True  
 4        True  ,
    PassengerId  Pclass   Age  SibSp  Parch     Fare  Sex_male  Embarked_Q  \
 0          892       3  34.5      0      0   7.8292      True        True   
 1          893       3  47.0      1      0   7.0000     False       False   
 2          894       2  62.0      0      0   9.6875      True        True   
 3          895       3  27.0      0      0   8.6625      True       False   
 4          896       3  22.0      1 

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Split the training data into a train and validation set
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
logreg = LogisticRegression(max_iter=1000)

# Train the model
logreg.fit(X_train_split, y_train_split)

# Make predictions on the validation set
y_pred_val = logreg.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred_val)
conf_matrix = confusion_matrix(y_val, y_pred_val)

accuracy, conf_matrix

(0.8044692737430168,
 array([[89, 16],
        [19, 55]]))

In [6]:
# Make predictions on the test dataset
y_test_pred = logreg.predict(X_test)

# Prepare the submission file
submission = pd.DataFrame({
    'PassengerId': X_test['PassengerId'],
    'Survived': y_test_pred
})

# Display the first few rows of the submission file
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [7]:
submission.to_csv ('submission.csv',index=False)