In [37]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
# import joblib
import os
from sklearn.metrics import accuracy_score # Added import

In [38]:
df = pd.read_csv("train.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [41]:
print("First 5 rows of the dataframe:")
print(df.head())
print("\nShape of the dataframe:")
print(df.shape)
print("\nInformation about the dataframe:")
print(df.info())
print("\nDescriptive statistics of the dataframe:")
print(df.describe().T)


print("\nMissing values in each column:")
print(df.isnull().sum())

First 5 rows of the dataframe:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    1  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    0  38.0      1      0   
2                             Heikkinen, Miss. Laina    0  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    0  35.0      1      0   
4                           Allen, Mr. William Henry    1  35.0      0      0   

             Ticket     Fare Cabin Embarked  
0         A/5 21171   7.2500   NaN        S  
1          PC 17599  71.2833   C85        C  
2  STON/O2. 3101282   7.9250   NaN        S  
3            113803  53.1000  C123        S  
4            373450   8.0500   NaN 

In [42]:
print("\nEncoding 'Sex' column")
sex_encoder = LabelEncoder()
df["Sex"] = sex_encoder.fit_transform(df["Sex"])
print("Unique values in 'Sex' after encoding:", np.unique(df["Sex"]))


Encoding 'Sex' column
Unique values in 'Sex' after encoding: [0 1]


In [43]:
print("\n--- Encoding 'Embarked' column ---")
print("Filling missing values in 'Embarked' with 'S'")
df["Embarked"] = df["Embarked"].fillna("S")  # Fill NaN before encoding
embarked_encoder = LabelEncoder()
df["Embarked"] = embarked_encoder.fit_transform(df["Embarked"])
print("Unique values in 'Embarked' after encoding:", np.unique(df["Embarked"]))



--- Encoding 'Embarked' column ---
Filling missing values in 'Embarked' with 'S'
Unique values in 'Embarked' after encoding: [0 1 2]


In [44]:
print("\n--- Handling 'Age' and 'Fare' columns ---")
median_age = df["Age"].median()
median_fare = df["Fare"].median()
print(f"Median Age: {median_age}, Median Fare: {median_fare}")
df["Age"] = df["Age"].fillna(median_age)
df["Fare"] = df["Fare"].fillna(median_fare)
print("Missing values in 'Age' and 'Fare' after filling:", df[["Age", "Fare"]].isnull().sum())


--- Handling 'Age' and 'Fare' columns ---
Median Age: 28.0, Median Fare: 14.4542
Missing values in 'Age' and 'Fare' after filling: Age     0
Fare    0
dtype: int64


In [45]:
features = ["Pclass", "Age", "SibSp", "Parch", "Fare", "Sex", "Embarked"]
X = df[features]
y = df["Survived"]

In [46]:
print("\n Training the RandomForestClassifier model")
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)


 Training the RandomForestClassifier model


In [47]:
print("\nTrained model:")
print(model)


Trained model:
RandomForestClassifier(random_state=42)


In [48]:
print("\nFeature importances:")
print(model.feature_importances_)

print("\nEvaluating the model on the training data") 
predictions = model.predict(X)
accuracy = accuracy_score(y, predictions)
print(f"Training Accuracy: {accuracy:.4f}")


Feature importances:
[0.08809061 0.26026293 0.04841134 0.03986776 0.26960221 0.26109121
 0.03267392]

Evaluating the model on the training data
Training Accuracy: 0.9798


In [56]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


X = df[features]
y = df["Survived"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


test_preds = model.predict(X_test)


test_acc = accuracy_score(y_test, test_preds)
print(f"Test Accuracy: {test_acc:.4f}")


Test Accuracy: 0.8156


In [57]:
from sklearn.metrics import precision_score, recall_score, f1_score


precision = precision_score(y_test, test_preds)
recall = recall_score(y_test, test_preds)
f1 = f1_score(y_test, test_preds)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Precision: 0.7887
Recall: 0.7568
F1-Score: 0.7724


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Load test data and labels
test_features = pd.read_csv("test.csv")
test_labels = pd.read_csv("gender_submission.csv")  # contains 'PassengerId' and 'Survived'

# Drop unused columns not part of features
test_features = test_features.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# Handle missing values
test_features['Age'] = test_features['Age'].fillna(test_features['Age'].median())
test_features['Fare'] = test_features['Fare'].fillna(test_features['Fare'].median())
test_features['Embarked'] = test_features['Embarked'].fillna(test_features['Embarked'].mode()[0])



# Encode categorical variables using the fitted encoders
test_features['Sex'] = sex_encoder.transform(test_features['Sex'])
test_features['Embarked'] = embarked_encoder.transform(test_features['Embarked'])

# Use the exact feature columns in the correct order as during training
features = ["Pclass", "Age", "SibSp", "Parch", "Fare", "Sex", "Embarked"]
test_features = test_features[features]

# Predict on test data
predictions = model.predict(test_features)

# Calculate accuracy
accuracy = accuracy_score(test_labels['Survived'], predictions)
print("Test Accuracy:", accuracy)


Test Accuracy: 0.8421052631578947
