In [None]:
"""
End-to-End Titanic Project
––––––––––––––––––––––––––
Loads train/test data
Exploratory prints (head, info, null counts)
Cleans / preprocesses
Trains a Random-Forest model
Reports validation accuracy
Generates a Kaggle-ready submission file
"""

In [3]:
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
# Folder holding the Titanic CSVs. Kept in one constant so the location only
# has to be edited in a single place when the notebook moves machines.
DATA_DIR = Path(r"C:\Users\USER\Downloads\titanic")

train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

print("--TRAIN HEAD-- ")
print(train.head(), "\n")
print("--TEST HEAD--")
print(test.head(), "\n")

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
print("--TRAIN INFO--")
train.info()
print()
print("--TEST INFO--")
test.info()
print()

print("--TRAIN NULL COUNTS--")
print(train.isna().sum(), "\n")
print("--TEST NULL COUNTS--")
print(test.isna().sum(), "\n")

--TRAIN HEAD-- 
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   N

In [8]:
# Combine train and test so both go through identical preprocessing.
# ``assign`` returns new frames, so the raw ``train`` / ``test`` data are not
# modified and this cell can be re-run without side effects. ``TrainSplit``
# marks the origin of each row; the test rows get an empty ``Survived`` column
# so both frames share the same schema.
full = pd.concat(
    [
        train.assign(TrainSplit=True),
        test.assign(TrainSplit=False, Survived=None),
    ],
    sort=False,
    ignore_index=True,  # avoid duplicate index labels from the two source frames
)

# Derive the fill values from the training rows only, then apply them to every
# row. Computing them over train+test would leak test-set information into the
# features the model is trained on.
is_train_row = full["TrainSplit"]
age_fill = full.loc[is_train_row, "Age"].median()
fare_fill = full.loc[is_train_row, "Fare"].median()
embarked_fill = full.loc[is_train_row, "Embarked"].mode()[0]

full["Age"] = full["Age"].fillna(age_fill)
full["Fare"] = full["Fare"].fillna(fare_fill)
full["Embarked"] = full["Embarked"].fillna(embarked_fill)

In [9]:
# Drop the columns that are not used as features: Cabin (the high-null column)
# plus the free-text Ticket and Name columns.
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell re-runnable: a second execution no longer raises KeyError
# because the columns are already gone.
full = full.drop(columns=["Cabin", "Ticket", "Name"], errors="ignore")

In [10]:
# Replace each text category with an integer code. Every column gets its own
# freshly fitted LabelEncoder, so the codes of one column never depend on
# another column.
for categorical_column in ("Sex", "Embarked"):
    column_encoder = LabelEncoder()
    full[categorical_column] = column_encoder.fit_transform(full[categorical_column])

print("=== CLEANED HEAD ===")
print(full.head(), "\n")

=== CLEANED HEAD ===
   PassengerId Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  \
0            1        0       3    1  22.0      1      0   7.2500         2   
1            2        1       1    0  38.0      1      0  71.2833         0   
2            3        1       3    0  26.0      0      0   7.9250         2   
3            4        1       1    0  35.0      1      0  53.1000         2   
4            5        0       3    1  35.0      0      0   8.0500         2   

   TrainSplit  
0        True  
1        True  
2        True  
3        True  
4        True   



In [11]:
# Undo the earlier concatenation using the boolean TrainSplit marker.
# The marker is removed from both halves; the test half also loses its
# placeholder Survived column.
from_train = full["TrainSplit"]
clean_train = full.loc[from_train].drop(columns="TrainSplit")
clean_test = full.loc[~from_train].drop(columns=["TrainSplit", "Survived"])

In [12]:
# Target and predictors: Survived is the label, and PassengerId is left out of
# the feature matrix.
y = clean_train["Survived"].astype(int)
X = clean_train.drop(columns=["Survived", "PassengerId"])

# Hold out 20% of the labelled rows for validation (fixed seed).
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 200-tree random forest on the training portion.
model = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_tr, y_tr)

# Score the held-out rows.
val_pred = model.predict(X_val)
acc = accuracy_score(y_val, val_pred)
print(f"Validation Accuracy: {acc:.4%}")

# Predict the unlabelled test rows, again without PassengerId.
test_features = clean_test.drop(columns="PassengerId")
test_pred = model.predict(test_features)

Validation Accuracy: 81.0056%


In [17]:
# Save cleaned data & submission
OUTPUT_DIR = Path(r"C:\Users\USER\Downloads\titanic\cleaned")
# to_csv does not create missing folders, so make sure the target exists
# before writing; exist_ok keeps the cell re-runnable.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

clean_train.to_csv(OUTPUT_DIR / "cleaned_train.csv", index=False)
clean_test.to_csv(OUTPUT_DIR / "cleaned_test.csv", index=False)

# Submission layout: one row per test passenger with the predicted label.
submission = pd.DataFrame({
    "PassengerId": clean_test["PassengerId"],
    "Survived":    test_pred.astype(int),
})
submission.to_csv(OUTPUT_DIR / "titanic_submission.csv", index=False)

print("\nFiles generated:")
print("• cleaned_train.csv")
print("• cleaned_test.csv")
print("• titanic_submission.csv ")



Files generated:
• cleaned_train.csv
• cleaned_test.csv
• titanic_submission.csv 
