Step 1: Upload CSV File to Colab

In [1]:
# Colab-only step: `google.colab` is available inside a Google Colab runtime,
# so this cell is not expected to run in a plain local Jupyter kernel.
from google.colab import files
# Opens the upload prompt; the captured output below shows the chosen file
# being saved into the working directory under its own name.
# The next cell reads 'tested.csv' by name, so the uploaded file must be
# called exactly that.
uploaded = files.upload()


Saving tested.csv to tested.csv


Step 2: Load and Preview the Data

In [2]:
import pandas as pd

# Read the CSV uploaded in Step 1 (it was saved into the working directory)
# into the notebook-wide DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='tested.csv')

# Show the first five rows as a quick sanity check of columns and values.
df.head(n=5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


Step 3: Data Preprocessing

In [23]:
from sklearn.preprocessing import StandardScaler

# Preprocess the frame loaded in Step 2 into an all-numeric feature table.
#
# This cell must work on a fresh kernel (Restart & Run All). The raw CSV has
# string columns 'Sex' and 'Embarked'; the dummy columns 'Sex_male',
# 'Embarked_Q' and 'Embarked_S' only exist after one-hot encoding, so the
# encoding is done here instead of relying on state from an earlier session.
# Every step is guarded by a column check so the cell can also be re-run on
# an already-processed frame without raising.

# Check available columns first to ensure no mismatches
print("Available columns:", df.columns)

# Drop identifier / free-text columns that are not used as features.
# Rebinding `df` (rather than inplace=True) avoids mutating a frame that an
# earlier cell already displayed.
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

# Handle missing values in 'Age' and 'Fare' (fill with median)
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# Fill missing ports with the most frequent value *before* encoding, so a
# missing port is not silently encoded as the dropped baseline category.
if 'Embarked' in df.columns:
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# One-hot encode the categorical columns. drop_first=True removes the first
# category of each column, leaving 'Sex_male', 'Embarked_Q' and 'Embarked_S'
# for this dataset.
categorical_columns = [col for col in ('Sex', 'Embarked') if col in df.columns]
if categorical_columns:
    df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Backward-compatible path: if the frame arrived already encoded, fill any
# gaps in the dummy columns with their mode, as the original cell did.
for dummy_col in ('Embarked_Q', 'Embarked_S'):
    if dummy_col in df.columns:
        df[dummy_col] = df[dummy_col].fillna(df[dummy_col].mode()[0])

# Normalize 'Age' and 'Fare' using StandardScaler.
# NOTE(review): the scaler is fit on all rows here, i.e. before the
# train/test split in Step 5, so test-row statistics influence the scaling.
# Consider fitting on the training split only.
scaler = StandardScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])

# Final fill for any remaining NaNs (just in case)
df = df.fillna(0)

# Display the cleaned dataframe
print("Cleaned DataFrame:")
df.head()


Available columns: Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male',
       'Embarked_Q', 'Embarked_S'],
      dtype='object')
Cleaned DataFrame:


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,0.386231,0,0,-0.497413,True,True,False
1,1,3,1.37137,1,0,-0.512278,False,False,True
2,0,2,2.553537,0,0,-0.4641,True,True,False
3,0,3,-0.204852,0,0,-0.482475,True,False,True
4,1,3,-0.598908,1,1,-0.417492,False,False,True


Step 4: Define Features & Target

In [24]:
# Separate the label from the predictors.
# y: the 'Survived' column; X: every other column of the preprocessed frame.
y = df['Survived']
X = df.drop(columns='Survived')


Step 5: Train-Test Split

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity check: total count of missing cells across all training features.
missing_in_train = X_train.isna().sum().sum()
print("Missing values in X_train:", missing_in_train)


Missing values in X_train: 0


Step 6: Train Model (Logistic Regression example)

In [26]:
from sklearn.linear_model import LogisticRegression

# Build the classifier with max_iter=1000 and fit it on the training split
# in one step; fit() returns the estimator itself, so `model` is the fitted
# classifier.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Leave the fitted estimator as the cell's last expression so it is displayed.
model


Step 7: Evaluate Model

In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Report each scalar metric in a fixed order, then the confusion matrix.
# NOTE(review): the recorded run scores 1.0 on every metric. In the rows
# previewed earlier, 'Survived' is 1 for every female and 0 for every male,
# so the label may be a deterministic function of sex in this file — confirm
# before treating these numbers as real predictive performance.
print("✅ Model Performance:")
scalar_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
for label, metric_fn in scalar_metrics:
    print(label, metric_fn(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


✅ Model Performance:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0

Confusion Matrix:
 [[50  0]
 [ 0 34]]
