In [None]:
import numpy as np
import pandas as pd
import os   

In [None]:
# Index every file beneath the current working directory by its bare file name.
# A name that occurs in several directories keeps only the path visited last.
paths = {}

for root, _subdirs, names in os.walk(os.getcwd()):
    paths.update({name: os.path.join(root, name) for name in names})

# Echo the index as "<file name> <full path>", one entry per line.
for name, location in paths.items():
    print(f"{name} {location}")

In [None]:
# Print each indexed file name next to the path stored for it.
for name, location in paths.items():
    print(f"{name} {location}")

In [None]:
# Load the three CSV files from the working directory.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
submission = pd.read_csv("gender_submission.csv")

# Jupyter renders only the final expression of a cell, so bare expressions in
# the middle of a cell are silently discarded. Route every preview through
# display() so that all of them actually appear in the output.
display(train_data.head())
display(test_data.head())

# Check whether any passenger name contains "jack" or "rose" (case-insensitive);
# na=False treats a missing name as "no match".
jack_rows = train_data[train_data["Name"].str.contains("jack", case=False, na=False)]
rose_rows = train_data[train_data["Name"].str.contains("rose", case=False, na=False)]

display(jack_rows[["PassengerId", "Name"]])
display(rose_rows[["PassengerId", "Name"]])

In [None]:
# List the training column index, then preview the first rows of the test frame.
training_columns = train_data.columns
print(training_columns)
test_data.head()

In [None]:
# Split the training rows by the parity of PassengerId and report the size of each part.
id_remainder = train_data['PassengerId'] % 2
even_passengers = train_data.loc[id_remainder == 0]
odd_passengers = train_data.loc[id_remainder == 1]

total_even = even_passengers.shape[0]
total_odd = odd_passengers.shape[0]

print(f"Total passengers with EVEN PassengerId: {total_even}")
print(f"Total passengers with ODD PassengerId: {total_odd}")
print(f"Total passengers: {total_even + total_odd}")

In [None]:
# Data Exploration: shape, column summary, missing-value counts and class balance.
print("Training Data Shape:", train_data.shape)
print("\nTraining Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
train_data.info()
print("\nMissing Values:")
print(train_data.isnull().sum())
print("\nSurvival Rate:")
# normalize=True reports each Survived value as a fraction of all rows.
print(train_data['Survived'].value_counts(normalize=True))

In [None]:
# Analyze survival by different features on a 2x2 grid of panels
import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Count plots split by Survived: (column, grid position, panel title).
count_panels = (
    ('Sex', (0, 0), 'Survival by Sex'),
    ('Pclass', (0, 1), 'Survival by Passenger Class'),
    ('Embarked', (1, 1), 'Survival by Embarked Port'),
)
for column, position, title in count_panels:
    panel = axes[position]
    sns.countplot(data=train_data, x=column, hue='Survived', ax=panel)
    panel.set_title(title)

# Overlaid, semi-transparent age histograms for each outcome on the remaining panel.
age_panel = axes[1, 0]
for outcome, label in ((1, 'Survived'), (0, 'Not Survived')):
    ages = train_data[train_data['Survived'] == outcome]['Age']
    ages.hist(bins=30, alpha=0.5, label=label, ax=age_panel)
age_panel.set_title('Age Distribution by Survival')
age_panel.legend()

plt.tight_layout()
plt.show()

In [None]:
# Feature Engineering Function
def preprocess_data(df, is_train=True):
    """Impute missing values and derive model features from a raw passenger frame.

    The input is copied first, so the caller's DataFrame is never modified.
    Every statistic used here (Age/Fare medians, Embarked mode, Fare quartile
    edges) is computed from ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Age', 'Embarked', 'Fare', 'SibSp', 'Parch',
        'Name', 'Ticket' and 'Cabin'.
    is_train : bool, default True
        Not used by the function body; kept so existing call sites keep working.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with Age/Embarked/Fare imputed, the added columns
        'FamilySize', 'IsAlone', 'Title', 'AgeBand' and 'FareBand', and the
        'Name', 'Ticket' and 'Cabin' columns removed.
    """
    data = df.copy()

    # Assign the filled column back rather than calling
    # data[col].fillna(..., inplace=True): that chained in-place form is
    # deprecated in pandas 2.x and leaves `data` unchanged under Copy-on-Write.
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
    if 'Fare' in data.columns:
        data['Fare'] = data['Fare'].fillna(data['Fare'].median())

    # Family size counts the passenger plus siblings/spouses and parents/children.
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

    # The title is the word that follows a space and ends with a period.
    # Raw string: '\.' inside a normal string literal is an invalid escape sequence.
    data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Collapse uncommon titles into 'Rare' and fold spelling variants together.
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don',
                   'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    title_map = {title: 'Rare' for title in rare_titles}
    title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
    data['Title'] = data['Title'].replace(title_map)

    # Age bands use pd.cut's default right-closed intervals: (0, 12], (12, 18], ...
    data['AgeBand'] = pd.cut(
        data['Age'],
        bins=[0, 12, 18, 35, 60, 100],
        labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'],
    )

    # Fare bands are the quartiles of this frame's own Fare column.
    # NOTE(review): if duplicates='drop' ever removes an edge, the four labels
    # no longer match the bin count and pd.qcut raises ValueError.
    data['FareBand'] = pd.qcut(data['Fare'], 4, labels=['Low', 'Medium', 'High', 'VeryHigh'], duplicates='drop')

    # These free-text columns are not passed on to the model.
    data = data.drop(['Name', 'Ticket', 'Cabin'], axis=1)

    return data

# Run the same preprocessing over the training frame and the test frame.
train_processed, test_processed = (
    preprocess_data(frame, is_train=flag)
    for frame, flag in ((train_data, True), (test_data, False))
)

print("Processed Training Data:")
print(train_processed.head())
print("\nProcessed columns:", list(train_processed.columns))

In [None]:
# Encode categorical variables as integer codes
from sklearn.preprocessing import LabelEncoder

# Both processed frames, so each fitted encoder can be applied to them in turn.
combine = [train_processed, test_processed]

# One fitted encoder per column, kept so the integer codes can be mapped back later.
label_encoders = {}
categorical_features = ['Sex', 'Embarked', 'Title', 'AgeBand', 'FareBand']

for feature in categorical_features:
    encoder = LabelEncoder()
    # Fit on the string form of train + test values so both frames share one code table.
    encoder.fit(pd.concat([train_processed[feature], test_processed[feature]]).astype(str))

    # Overwrite the column in each frame with its integer codes.
    for frame in combine:
        frame[feature] = encoder.transform(frame[feature].astype(str))

    label_encoders[feature] = encoder

print("Encoded Training Data:")
print(train_processed.head())
print("\nData types:")
print(train_processed.dtypes)

In [None]:
# Prepare data for modeling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Target column, and the feature matrix without the target or the row identifier.
y = train_processed['Survived']
X = train_processed.drop(columns=['Survived', 'PassengerId'])

# Hold out 20% of the rows for validation, with a fixed random_state for the split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

for heading, part in (("Training set size:", X_train), ("Validation set size:", X_val)):
    print(heading, part.shape)
print("\nFeatures used:")
print(list(X.columns))

In [None]:
# Train Random Forest Model and score it on the held-out validation rows
rf_settings = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train, y_train)

# Validation-set predictions and the metrics derived from them.
y_pred = rf_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

# 5-fold cross-validation over the full feature matrix and target.
cv_scores = cross_val_score(rf_model, X, y, cv=5)
print(f"\nCross-Validation Scores: {cv_scores}")
print(f"Mean CV Score: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

In [None]:
# Feature Importance: pair each feature name with the fitted model's importance score.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

print("Feature Importance:")
print(feature_importance)

# Horizontal bar chart, largest importance first.
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance in Random Forest Model')
plt.tight_layout()
plt.show()

In [None]:
# Fit a fresh forest on every training row, using the listed hyperparameters.
final_settings = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)
final_model = RandomForestClassifier(**final_settings)
final_model.fit(X, y)

print("Final model trained on all training data")

In [None]:
# Predict on the processed test rows; the identifier column is not a model feature.
X_test = test_processed.drop(columns=['PassengerId'])
predictions = final_model.predict(X_test)

# Pair each original PassengerId with its predicted label.
submission_df = pd.DataFrame(
    {'PassengerId': test_data['PassengerId'], 'Survived': predictions}
)

print("Submission DataFrame:")
print(submission_df.head(10))

# Summing the predictions counts the rows predicted as 1.
survivor_count = predictions.sum()
print(f"\nTotal predictions: {len(submission_df)}")
print(f"Predicted survivors: {survivor_count}")
print(f"Predicted non-survivors: {len(predictions) - survivor_count}")

In [None]:
# Write the predictions to CSV without the index column, then print upload instructions.
submission_df.to_csv('titanic_submission.csv', index=False)

closing_messages = (
    "Submission file saved as 'titanic_submission.csv'",
    "\nTo submit:",
    "1. Go to https://www.kaggle.com/c/titanic/submit",
    "2. Upload the 'titanic_submission.csv' file",
    "3. Add a description and submit!",
)
for message in closing_messages:
    print(message)