In [None]:
# Import necessary libraries for data manipulation and analysis
import pandas as pd  # pandas: powerful data manipulation library
import numpy as np   # numpy: numerical computing library

In [None]:
# ========================================
# DATA SCIENCE WORKFLOW - STEP BY STEP GUIDE
# ========================================
# 1. Import the dataset
# 2. Understand the dataset (shape, columns, data types)
# 3. Understand the nature of the dataset:
#    - Is it for analysis or Machine Learning?
#    - Check distribution (bell curve, right skewed, left skewed)
# 4. Check for null values
# 5. Identify the column with most null values
# 6. Handle null values by:
#    - Removing rows/columns (recommended if >50% missing)
#    - Replacing with mean (for numerical data with normal distribution)
#    - Replacing with median (for skewed numerical data)
#    - Replacing with mode (for categorical data)
# 7. Adjust the dataframe as needed:
#    - Rename columns for clarity
#    - Change data types
#    - Format dates
# 8. Split dataset into train/test sets if needed for ML
# 9. Save the cleaned file with proper naming and documentation
# ========================================

In [None]:
# Location of the dataset on disk.
# A raw string (r'...') keeps the Windows backslashes literal, so they do not
# have to be doubled; forward slashes (/) would also work and are cross-platform.
data = r'C:\Users\vicky\OneDrive\Desktop\Profound\tested.csv'

In [None]:
# Read the CSV file located at `data` into a pandas DataFrame
# (a 2D labeled table whose columns may each hold a different dtype).
df1 = pd.read_csv(filepath_or_buffer=data)

In [None]:
# Peek at the end of the dataset to get a feel for its structure.
# tail(n) returns the last n rows (5 here, which is also the default);
# head(n) is the counterpart for the first n rows.
df1.tail(n=5)

In [None]:
# List every column label in the DataFrame so we know which
# features/variables are available to work with.
# Note: `columns` is an attribute (an Index of labels), not a method,
# so it is accessed without parentheses.
df1.columns

In [None]:
# Print a concise summary of the DataFrame:
# - Number of entries (rows) and the index range
# - Each column's name and data type
# - Non-null count per column (a count below the row total means missing values)
# - Approximate memory usage
# info() prints its report and returns None, so the cell shows only the printed text.
df1.info()

In [None]:
# Count missing values per column.
# isna() (alias of isnull()) yields a boolean DataFrame marking missing cells;
# sum() then adds up the True values column by column.
# Expected for this dataset: Age (86 nulls), Fare (1 null), Cabin (327 nulls)
df1.isna().sum()

In [None]:
# DECISION: Handle missing values
# - Cabin column has 327 out of 418 missing (78% missing) - will DROP this column
# - Age column has 86 out of 418 missing (~21% missing) - will FILL with mean age
# - Fare has only 1 missing - will FILL with mean fare

In [None]:
# Drop the 'Cabin' column due to excessive missing values (78%).
# Parameters:
#   - columns='Cabin': names the column to remove (equivalent to 'Cabin', axis=1)
#   - errors='ignore': keeps the cell safe to re-run; without it a second run
#     raises KeyError because 'Cabin' has already been removed
# The result is assigned back instead of using inplace=True, so running this
# cell any number of times leaves df1 in the same state.
df1 = df1.drop(columns='Cabin', errors='ignore')
df1

In [None]:
# Display the updated DataFrame to verify that the 'Cabin' column is gone.
# A bare expression on the last line of a cell is rendered as the cell's output.
df1

In [None]:
# Next step: Fill missing Age values
# Strategy: Use the average (mean) age as it's a reasonable estimate
# for missing age data in a passenger dataset

In [None]:
# Average passenger age, rounded to the nearest whole year.
# mean() skips missing values (skipna=True is the default, spelled out here);
# round() with no digits argument rounds the result to the nearest integer.
avg_age = round(df1['Age'].mean(skipna=True))
print(f"Average age of passengers: {avg_age} years")
df1

In [None]:
# Fill missing Age values with the calculated average age (avg_age).
# fillna() returns a new Series with every NaN replaced; assigning it back to
# df1['Age'] is what actually updates the DataFrame.
# Why not df1['Age'].fillna(..., inplace=True)? That call acts on an
# intermediate object selected from df1: recent pandas 2.x emits a
# FutureWarning for it, and with Copy-on-Write (the default from pandas 3.0)
# df1 is left unchanged.
df1['Age'] = df1['Age'].fillna(avg_age)
df1

In [None]:
# Re-count missing values per column after cleaning Age.
# Expected now: Age (0 nulls), Fare (1 null remaining)
df1.isna().sum()

In [None]:
# Generate descriptive statistics for the numerical columns.
# describe() reports: count, mean, std, min, 25%, 50% (median), 75%, max.
# Useful for judging each column's distribution and spotting outliers
# (e.g. a max far above the 75% quartile).
df1.describe()

In [None]:
# Fill the single missing Fare value with the mean fare.
# mean() is computed inline (it ignores the NaN), fillna() returns the filled
# Series, and the assignment writes it back into df1.
# Assigning back replaces df1['Fare'].fillna(..., inplace=True), which acts on
# an intermediate object: it triggers a FutureWarning in recent pandas 2.x and
# leaves df1 unchanged under Copy-on-Write (the default from pandas 3.0).
df1['Fare'] = df1['Fare'].fillna(df1['Fare'].mean())

In [None]:
# Final verification that every missing value has been handled:
# each column's count below should be 0.
df1.isna().sum()

In [None]:
# ========================================
# PREPARE DATA FOR MACHINE LEARNING
# ========================================
# Split the dataset into:
# - Features (X): All columns except the target variable
# - Target (y): The 'Survived' column (what we want to predict)
# Note: Convention is X for features, y for target

In [None]:
# Pull out the target variable 'Survived'.
# Selecting with a list of labels (['Survived']) yields a one-column DataFrame
# rather than a Series; .loc[:, ...] means "all rows, these columns".
y = df1.loc[:, ['Survived']]
print(f"Target variable type: {type(y)}")

In [None]:
# Start the feature matrix as an independent (deep) copy of the full dataset,
# so later changes to X do not affect df1. deep=True is the default.
X = df1.copy(deep=True)
X

In [None]:
# Remove the target variable so that X holds only the features
# (y holds only the target).
# X is rebuilt from df1 with a non-mutating drop(): drop() returns a new
# DataFrame and leaves df1 untouched, so this cell can be re-run safely.
# An in-place drop on X would raise KeyError on a second run because
# 'Survived' would already be gone.
X = df1.drop(columns='Survived')

In [None]:
# Display the features DataFrame (X).
# It should contain every column of df1 except 'Survived'.
X

In [None]:
# Verify the target variable (y).
# It should be a DataFrame containing only the 'Survived' column:
# the print shows its type, and the bare `y` renders its contents.
print(f"Target variable type: {type(y)}")
y

In [None]:
# Sanity-check the features DataFrame (X): report its type and shape
# (one item per line via sep), then display it.
# With 'Survived' removed it should contain 10 columns.
print(
    f"Features type: {type(X)}",
    f"Features shape: {X.shape}",
    sep="\n",
)
X

In [None]:
# Summary: Data is now ready for Machine Learning!
# X (Features): 418 rows × 10 columns
# y (Target): 418 rows × 1 column
# No missing values
# Next steps would be:
# 1. Encode categorical variables (Sex, Embarked, etc.)
# 2. Scale numerical features if needed
# 3. Split into train/test sets
# 4. Train a model

# "\n" (single backslash) prints a blank line before the text; a doubled
# backslash ("\\n") would print the two characters backslash + n literally.
# Strings without placeholders are plain literals rather than f-strings.
print("\nDataset Summary:")
print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print("\nData cleaning complete! Ready for ML pipeline.")

# Data Cleaning Complete! ✓

**What we accomplished:**
1. ✓ Loaded Titanic dataset (418 passengers)
2. ✓ Explored data structure and identified issues
3. ✓ Handled missing values:
   - Dropped Cabin column (78% missing)
   - Filled Age with mean (30 years)
   - Filled Fare with mean
4. ✓ Split data into Features (X) and Target (y)
5. ✓ Dataset ready for machine learning!

**Next steps for ML:**
- Encode categorical variables
- Feature engineering
- Train/test split
- Model training