In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# Location of the training CSV, resolved relative to the notebook's working directory.
data_path = "train.csv"

# Read the whole file into a DataFrame; the cells below inspect and transform `data`.
data = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Display the first few rows of the dataset
print("Dataset Overview:")
print(data.head())

# Display dataset information.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
print("\nDataset Info:")
data.info()

# Step 3: Exploratory Data Analysis (EDA)
# Count the missing values in each column
print("\nMissing Values:\n", data.isnull().sum())

# Summary statistics as reported by DataFrame.describe()
print("\nBasic Statistics:\n", data.describe())

In [None]:
# Visualize the distribution of key variables
# Example: Visualize a target variable, replace 'Target' with the actual column name.
# The plot is skipped when the placeholder column is not present.
if 'Target' in data.columns:
    sns.countplot(x='Target', data=data)
    plt.title("Target Variable Distribution")
    plt.show()

# Visualize correlations (numerical features)
# Only numeric/boolean columns are correlated. Calling data.corr() on a frame that
# still contains string columns raises ValueError on pandas >= 2.0 (numeric_only
# defaults to False there); older versions dropped those columns silently, so
# selecting them explicitly gives the same matrix on every version.
numeric_data = data.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(10, 6))
sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Step 4: Preprocess the Data
# 'PassengerId' and 'Name' are removed; errors='ignore' makes the drop a no-op for
# any of them that is not present.
columns_to_drop = ['PassengerId', 'Name']
data = data.drop(columns=columns_to_drop, errors='ignore')

# Derive three features from the 'Cabin' string, then discard the raw column:
#   CabinDeck   - first letter found in the value
#   CabinNumber - first run of digits, stored as float so missing values stay NaN
#   CabinSide   - letter at the very end of the value
if 'Cabin' in data.columns:
    cabin_text = data['Cabin'].str
    data = data.assign(
        CabinDeck=cabin_text.extract(r'([A-Za-z])', expand=False),
        CabinNumber=cabin_text.extract(r'(\d+)', expand=False).astype(float),
        CabinSide=cabin_text.extract(r'([A-Za-z])$', expand=False),
    ).drop(columns=['Cabin'])

# Preview the result (printed whether or not a 'Cabin' column existed).
print("\nData after splitting 'Cabin':", data.head(), sep="\n")

In [None]:
# Step 5: Feature Encoding
# Every column still stored with the object dtype is treated as a nominal feature.
nominal_features = data.select_dtypes(include=['object']).columns
print("\nNominal Features:", nominal_features)

# One-hot encode those columns; drop_first=True omits the first level of each
# feature so the remaining indicator columns are not redundant.
data = pd.get_dummies(
    data=data,
    columns=nominal_features,
    drop_first=True,
)

print("\nData after encoding nominal features:", data.head(), sep="\n")

In [None]:
# Step 6: Handle Missing Values (Optional)
# Replace each remaining NaN with the median of its own column.
# NOTE(review): the medians are computed on the full dataset before the train/test
# split, so test rows influence the imputed values - confirm this is acceptable.
column_medians = data.median()
data = data.fillna(column_medians)

In [None]:
# Step 7: Split the Dataset
# Replace 'Target' with the actual target column name in your dataset
target_column = 'Target'  # Replace with your actual target column name

# Fail fast with an actionable message. Previously the drop below ignored a missing
# column, and the error only surfaced on the following line as a bare KeyError.
if target_column not in data.columns:
    raise KeyError(
        f"Target column {target_column!r} not found in the dataset; "
        f"set target_column to one of: {list(data.columns)}"
    )

# X holds every column except the target; y is the target column itself.
X = data.drop(columns=[target_column])
y = data[target_column]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the shuffle
# (and therefore the split) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the shapes of the four arrays produced by the split as a sanity check.
print("\nFinal Dataset Shapes:")
print("X_train: {}, X_test: {}".format(X_train.shape, X_test.shape))
print("y_train: {}, y_test: {}".format(y_train.shape, y_test.shape))

In [None]:
from sklearn.ensemble import RandomForestClassifier  # Example model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Example: fit a 100-tree random forest on the training split (replace with your
# actual model and data). random_state is fixed so the fitted model is reproducible.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict on both splits so train and test performance can be compared.
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# Accuracy on the data the model was fitted on.
train_accuracy = accuracy_score(y_train, y_train_pred)

# Held-out metrics. average='weighted' is passed to precision, recall and F1;
# adjust it as needed.
weighted_average = {"average": "weighted"}
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, **weighted_average)
test_recall = recall_score(y_test, y_test_pred, **weighted_average)
test_f1score = f1_score(y_test, y_test_pred, **weighted_average)

# Print every metric with four decimal places.
metric_report = (
    ("Train Accuracy", train_accuracy),
    ("Test Accuracy", test_accuracy),
    ("Test Precision", test_precision),
    ("Test Recall", test_recall),
    ("Test F1 Score", test_f1score),
)
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value:.4f}")
