In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
# import joblib
import os
from sklearn.metrics import accuracy_score # Added import

In [3]:
# Read the training set from a CSV located in the working directory.
TRAIN_CSV = "train.csv"
df = pd.read_csv(TRAIN_CSV)
# Bare trailing expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Quick structural overview of the loaded frame: sample rows, shape,
# dtypes / non-null counts, summary statistics and per-column null counts.
print("First 5 rows of the dataframe:")
print(df.head())
print("\nShape of the dataframe:")
print(df.shape)
print("\nInformation about the dataframe:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nDescriptive statistics of the dataframe:")
# Transposed so each column of df becomes one row of the summary.
print(df.describe().T)

print("\nMissing values in each column:")
print(df.isnull().sum())

First 5 rows of the dataframe:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            3734

In [None]:
# Swap the 'Sex' labels for the integer codes produced by a LabelEncoder.
# The fitted encoder is kept in `sex_encoder` after this cell runs.
print("\nEncoding 'Sex' column")
sex_encoder = LabelEncoder()
sex_codes = sex_encoder.fit_transform(df["Sex"])
df["Sex"] = sex_codes
# Sanity check: list the distinct codes now present in the column.
sex_levels = np.unique(df["Sex"])
print("Unique values in 'Sex' after encoding:", sex_levels)

In [None]:
# Impute missing 'Embarked' entries with "S", then integer-encode the column.
print("\n--- Encoding 'Embarked' column ---")
print("Filling missing values in 'Embarked' with 'S'")
embarked_filled = df["Embarked"].fillna("S")  # fill NaN before encoding
embarked_encoder = LabelEncoder()
df["Embarked"] = embarked_encoder.fit_transform(embarked_filled)
# Sanity check: list the distinct codes now present in the column.
embarked_levels = np.unique(df["Embarked"])
print("Unique values in 'Embarked' after encoding:", embarked_levels)


In [None]:
# Median-impute the two numeric columns 'Age' and 'Fare'.
print("\n--- Handling 'Age' and 'Fare' columns ---")
# Medians are computed before any filling and kept under their own names.
median_age, median_fare = df["Age"].median(), df["Fare"].median()
print(f"Median Age: {median_age}, Median Fare: {median_fare}")
for column, fill_value in (("Age", median_age), ("Fare", median_fare)):
    df[column] = df[column].fillna(fill_value)
# Confirm how many nulls remain in each of the two columns.
remaining_nulls = df[["Age", "Fare"]].isna().sum()
print("Missing values in 'Age' and 'Fare' after filling:", remaining_nulls)

In [None]:
# Target vector: the 'Survived' column.
y = df["Survived"]

# Predictor columns; X keeps them in exactly this order.
features = [
    "Pclass",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Sex",
    "Embarked",
]
X = df[features]

In [None]:
# Fit a random forest on the selected features.
print("\n Training the RandomForestClassifier model")
# random_state is fixed so repeated runs build the same forest.
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_params)
# Kept as the cell's final expression so its return value is displayed.
model.fit(X, y)

In [None]:
# Print a heading followed by the estimator's text representation.
print("\nTrained model:", model, sep="\n")

In [None]:
print("\nFeature importances:")
# Pair every importance with the column it belongs to and list the largest
# first; a bare array forces the reader to cross-reference the column order
# of X by hand to know which value is which.
importances = pd.Series(model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).to_string())

# Accuracy here is measured on X / y themselves (reported below as training
# accuracy), so it is an in-sample score rather than a held-out estimate.
print("\nEvaluating the model on the training data")
predictions = model.predict(X)
accuracy = accuracy_score(y, predictions)
print(f"Training Accuracy: {accuracy:.4f}")