<a href="https://colab.research.google.com/github/Sayed-Hossein-Hosseini/Saving_Titanic_Passengers_From_Disaster/blob/master/Saving_Titanic_Passengers_From_Disaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Saving Titanic Passengers From Disaster**

## **Libraries**

In [1]:
!pip install pandas openpyxl

import io
import numpy as np
import pandas as pd



## **Data Loading**

In [2]:
# --- Settings ---
# Path to the training data CSV file
file_path_train = '/content/Titanic/train.csv'
# Path to the test data CSV file
file_path_test = '/content/Titanic/test.csv'
# --- End of Settings ---


def load_titanic_csv(file_path, label):
    """
    Read one CSV file into a DataFrame and report the outcome on stdout.

    Args:
        file_path (str): Location of the CSV file to read.
        label (str): Capitalised dataset name used in the messages
            (e.g., 'Training' or 'Test').

    Returns:
        pd.DataFrame or None: The loaded DataFrame, or None when the file is
        missing or could not be read (an error message is printed instead
        of raising, so the rest of the cell keeps running).
    """
    try:
        loaded = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: {label} file '{file_path}' not found. Please check the file path.")
        return None
    except Exception as e:
        # Deliberately broad: any read/parse problem is reported and the
        # caller receives None rather than a traceback.
        print(f"Error reading the {label.lower()} file: {e}")
        return None

    print(f"{label} DataFrame successfully read from file '{file_path}'.")
    print(f"First few rows of the {label.lower()} DataFrame:")
    print(loaded.head())
    print("-" * 30)
    return loaded


# Each variable ends up as a DataFrame on success or None on failure, so
# later cells can test for None before using it.
df_train = load_titanic_csv(file_path_train, 'Training')
df_test = load_titanic_csv(file_path_test, 'Test')

# Report the combined status unconditionally, so a failed test-file read is
# summarised too instead of being skipped.
if df_train is not None and df_test is not None:
    print("Both training and test DataFrames loaded successfully.")
    # Add your data processing steps here...
else:
    print("One or both DataFrames failed to load. Check previous error messages.")

Training DataFrame successfully read from file '/content/Titanic/train.csv'.
First few rows of the training DataFrame:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN 

## **Data Description**

In [3]:
# ==================================================
# Assume df_train and df_test are already loaded
# ==================================================
# Optional toy data for trying this cell without the real files.
# Keep it commented out when df_train/df_test were loaded in the previous
# step, because uncommenting it would overwrite those DataFrames:
# data_train = {'col_num1': [1, 2, 3, 4, 5],
#               'col_cat1': ['A', 'B', 'A', 'C', 'B'],
#               'col_num2': [10.1, 11.2, np.nan, 13.4, 14.5],
#               'col_mixed': [1, 'X', 3, 'Y', 5]}
# df_train = pd.DataFrame(data_train)

# data_test = {'col_num1': [6, 7, 8],
#              'col_cat1': ['C', 'A', 'C'],
#              'col_num2': [15.6, 16.7, 17.8],
#              'col_mixed': ['Z', 8, 9]}
# df_test = pd.DataFrame(data_test)
# --- End of Example Section ---


# Helper function to print DataFrame information
def describe_dataframe(df, df_name):
    """
    Prints descriptive and general information about a DataFrame.

    After a banner naming the dataset, five sections are printed: shape,
    `df.info()` output, summary statistics of the numerical columns,
    summary statistics of the object/category columns, and the per-column
    count of missing values (only columns that have any).

    Args:
        df (pd.DataFrame): The DataFrame to describe. If it is None or not a
            DataFrame, a short explanatory message is printed instead.
        df_name (str): The name of the DataFrame for display purposes (e.g., 'Training' or 'Test').

    Returns:
        None. Everything is written to stdout.
    """
    banner = "=" * 60
    print("\n" + banner)
    print(f"          {df_name} Data Description")
    print(banner)

    # Guard clauses: bail out early when there is nothing to describe.
    if df is None:
        print(f"'{df_name}' variable is None. Cannot display description.")
        return
    if not isinstance(df, pd.DataFrame):
        print(f"'{df_name}' variable is not a pandas DataFrame. Type: {type(df)}. Cannot display description.")
        return

    print(f"\n1. Shape (Rows, Columns) of {df_name} data:")
    print(df.shape)

    print(f"\n2. Basic Information for {df_name} data (Columns, Non-Null Counts, Dtypes):")
    # df.info() writes to a stream rather than returning text, so collect it
    # in a buffer and print it after the section header.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    print(info_buffer.getvalue())

    print(f"\n3. Numerical Features Summary Statistics for {df_name} data:")
    try:
        numeric_part = df.select_dtypes(include=np.number)
        if numeric_part.empty:
            print("No numerical columns found.")
        else:
            # Rounded to two decimals for cleaner output.
            print(numeric_part.describe().round(2))
    except Exception as e:
        print(f"Could not generate numerical description: {e}")

    print(f"\n4. Categorical/Object Features Summary Statistics for {df_name} data:")
    try:
        # Select both 'object' and 'category' dtypes so the section matches
        # its "Categorical/Object" heading; selecting only 'object' would
        # silently leave out columns stored as pandas categoricals.
        categorical_part = df.select_dtypes(include=['object', 'category'])
        if categorical_part.empty:
            print(f"No object/categorical columns found in the {df_name} data.")
        else:
            print(categorical_part.describe())
    except Exception as e:
        print(f"Could not generate categorical description: {e}")

    print(f"\n5. Missing Values per Column in {df_name} data:")
    null_counts = df.isnull().sum()
    # Show only columns that actually have missing values.
    null_counts = null_counts[null_counts > 0]
    if null_counts.empty:
        print(f"No missing values found in the {df_name} data.")
    else:
        print(null_counts)


# --- Check and Display Descriptive Information ---

# Describe each dataset only when its variable is defined in this session.
# A missing name (for example, the loading cell was never run) is reported
# with a message rather than raising a NameError.
if 'df_train' not in globals():
    print("\nError: DataFrame 'df_train' does not exist in the current environment.")
else:
    describe_dataframe(df_train, 'Training')

if 'df_test' not in globals():
    print("\nError: DataFrame 'df_test' does not exist in the current environment.")
else:
    describe_dataframe(df_test, 'Test')


# Closing banner: blank line, rule, caption, rule.
print(
    "\n" + "="*60,
    "          End of Data Description",
    "="*60,
    sep="\n",
)


          Training Data Description

1. Shape (Rows, Columns) of Training data:
(891, 12)

2. Basic Information for Training data (Columns, Non-Null Counts, Dtypes):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


3. Numerical Features Summary Statistics for Training data:
       Passenge