# Titanic DataAnalysis

In [None]:
import pandas as pd

# Function to load data
def load_data(file_path):

    try:
        df = pd.read_csv(file_path)
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None


file_path = "/Users/pratigyajamakatel/Downloads/Titanic-Dataset.csv"
df = load_data(file_path)

# Check if data is loaded
if df is not None:
    print(df.head())


# Display Basic info

In [None]:
def basic_info(df):
    print("\nDataset Info:")
    print(df.info())


# If data loaded successfully, display basic info
if df is not None:
    basic_info(df)


# Checking Missing Values

In [None]:
def missing_values(df):
    print("\n Missing Values:")
    print(df.isnull().sum())

# Check missing values in the dataset
if df is not None:
    missing_values(df)


# Fill missing values

In [None]:
def handle_missing_values(df):
    df["Age"].fillna(df["Age"].median())
    df["Embarked"].fillna(df["Embarked"].mode()[0])
    df["Cabin"].fillna("Unknown")
    return df

# Handle missing values
if df is not None:
    df = handle_missing_values(df)
    print("\n Missing values handled!")


# Statistical Summary

In [None]:
def summary_statistics(df):
    print("\n Summary Statistics:")
    print(df.describe())

# Display statistical summary
if df is not None:
    summary_statistics(df)


# Visualizing missing data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def visualize_missing_data(df):
    plt.figure(figsize=(10, 5))
    sns.heatmap(df.isnull(), cmap="viridis", cbar=False, yticklabels=False)
    plt.title("Missing Data Heatmap")
    plt.show()

# Visualize missing data
if df is not None:
    visualize_missing_data(df)


# Data Distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def visualize_missing_data(df):
    plt.figure(figsize=(10, 5))
    sns.heatmap(df.isnull(), cmap="viridis", cbar=False, yticklabels=False)
    plt.title("Missing Data Heatmap")
    plt.show()

# Visualize missing data
if df is not None:
    visualize_missing_data(df)


# Survival vs NotSurvioval

In [None]:
# Define the survival_count function correctly
def survival_count(df):
    plt.figure(figsize=(6, 4))
    sns.countplot(x="Survived",hue="Sex",data=df, palette="Set2")
    plt.title("Survival Count")
    plt.xlabel("Survived (0 = No, 1 = Yes)")
    plt.ylabel("Count")
    plt.show()

# Visualize survival count
if df is not None:
    survival_count(df)



# Survival Rate By Gender

In [None]:
def survival_by_gender(df):
    plt.figure(figsize=(6, 4))
    sns.countplot(x="Sex", hue="Survived", data=df, palette="Set1")
    plt.title("Survival by Gender")
    plt.xlabel("Gender")
    plt.ylabel("Count")
    plt.legend(["Not Survived", "Survived"])
    plt.show()

# Visualize survival rate by gender
if df is not None:
    survival_by_gender(df)


# Survival Rate by Passenger Class

In [None]:
def survival_by_class(df):
    plt.figure(figsize=(6, 4))
    sns.countplot(x="Pclass", hue="Survived", data=df, palette="coolwarm")
    plt.title("Survival by Passenger Class")
    plt.xlabel("Passenger Class")
    plt.ylabel("Count")
    plt.legend(["Not Survived", "Survived"])
    plt.show()

# Visualize survival rate by passenger class
if df is not None:
    survival_by_class(df)


# Correlation Heatmap

In [None]:
# Define the correlation_heatmap function
def correlation_heatmap(df):
    # Filter numeric columns only
    df_numeric = df.select_dtypes(include=[np.number])  # Keep only numeric columns

    # Plotting the correlation heatmap
    plt.figure(figsize=(8, 6))
    sns.heatmap(df_numeric.corr(), annot=True, cmap="coolwarm", linewidths=0.5)
    plt.title("Feature Correlation Heatmap")
    plt.show()


# Display correlation heatmap if df is not None
if df is not None:
    correlation_heatmap(df)


# Feature Engineering

In [None]:
def feature_engineering(df):
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    print("\n Feature 'FamilySize' added!")
    return df

# Add family size feature
if df is not None:
    df = feature_engineering(df)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Feature Engineering: Creating new column "FamilySize
def feature_engineering(df):
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    print("\n Feature 'FamilySize' added!")
    return df

# Visualize FamilySize distribution
def visualize_family_size(df):
    plt.figure(figsize=(12, 6))
    sns.histplot(df["FamilySize"], bins=20, kde=True, color='purple')
    plt.title("Family Size Distribution")
    plt.xlabel("Family Size")
    plt.ylabel("Count")
    plt.show()

# Run feature engineering and visualization
if df is not None:
    df = feature_engineering(df)  # Perform feature engineering
    visualize_family_size(df)    # Visualize Family Size Distribution


# Age Distribution

In [None]:
def visualize_age_distribution(df):
    plt.figure(figsize=(12, 6))
    sns.histplot(df["Age"], bins=30, kde=True, color='green')
    plt.title("Age Distribution")
    plt.xlabel("Age")
    plt.ylabel("Count")
    plt.show()

# Run the age distribution visualization
if df is not None:
    visualize_age_distribution(df)


# Survival vs Fare

In [None]:
def survival_by_fare(df):
    plt.figure(figsize=(12, 6))
    sns.boxplot(x="Survived",hue="Sex", y="Fare", data=df, palette="Set2")
    plt.title("Survival vs. Fare")
    plt.xlabel("Survival (0 = No, 1 = Yes)")
    plt.ylabel("Fare")
    plt.show()

# Run the survival vs fare visualization
if df is not None:
    survival_by_fare(df)


In [None]:
def drop_unnecessary_columns(df):
    # Example: Drop columns with too many missing values or irrelevant columns
    columns_to_drop = ['Fare']  # Adjust based on visualizations
    df = df.drop(columns=columns_to_drop, axis=1)
    print("\n Unnecessary columns dropped!")
    return df

# Drop unnecessary columns
if df is not None:
    df = drop_unnecessary_columns(df)


In [1]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Example: Loading and splitting data (replace with your dataset)
from sklearn.datasets import load_iris
data = load_iris()
X = data.data
y = data.target

# Introduce some missing values artificially
import numpy as np
X[::5] = np.nan  # introduce missing values in every 5th row

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Mean Imputation
mean_imputer = SimpleImputer(strategy='mean')
X_train_mean_imputed = mean_imputer.fit_transform(X_train)
X_test_mean_imputed = mean_imputer.transform(X_test)

# 2. KNN Imputation
knn_imputer = SimpleImputer(strategy='most_frequent')
X_train_knn_imputed = knn_imputer.fit_transform(X_train)
X_test_knn_imputed = knn_imputer.transform(X_test)

# 3. Train Classifiers on both imputed data
# You can use any classifier you like, here RandomForest as an example:
rf = RandomForestClassifier()

# Evaluate using Cross Validation for Mean Imputation
cv_mean = cross_val_score(rf, X_train_mean_imputed, y_train, cv=5)
print("Mean Imputation CV Accuracy:", cv_mean.mean())

# Evaluate using Cross Validation for KNN Imputation
cv_knn = cross_val_score(rf, X_train_knn_imputed, y_train, cv=5)
print("KNN Imputation CV Accuracy:", cv_knn.mean())


Mean Imputation CV Accuracy: 0.775
KNN Imputation CV Accuracy: 0.7583333333333334


In [15]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/Users/pratigyajamakatel/Downloads/data.csv'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [17]:
training = pd.read_csv('/Users/pratigyajamakatel/Downloads/data.csv')
test = pd.read_csv('/Users/pratigyajamakatel/Downloads/data.csv')

training['train_test'] = 1
test['train_test'] = 0
test['Survived'] = np.nan
all_data = pd.concat([training,test])

%matplotlib inline
all_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'train_test'],
      dtype='object')