In [None]:
# Import python packages
import numpy as np
import pandas as pd
import streamlit as st
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
session = get_active_session()


In [None]:
-- Fetch every row and column of the full Titanic table.
-- NOTE(review): later Python cells read a result named `cell2`; presumably
-- that is this cell's name -- confirm in the notebook.
SELECT * from titanic_sample.public.titanic_full

In [None]:
# Pull the SQL cell's result into pandas for a quick look at the raw rows.
# Missing values are handled later, inside prepare_titanic_data.
test = cell2.to_pandas()

# Preview the frame loaded above. `train_data` is only created in a later
# cell, so referencing it here would fail on a fresh kernel.
test.head()

In [None]:


def prepare_titanic_data(combined_data=None, test_path=""):
    """
    Prepares the Titanic passenger table for machine learning.

    Adds engineered features (Title, Ticket_FamilySize, Ticket_number,
    Has_Cabin), imputes missing values, drops the raw NAME / TICKET / CABIN
    columns and replaces every remaining text column with integer codes.

    Args:
        combined_data (pandas.DataFrame, optional): Raw passenger data with at
            least the columns NAME, TICKET and CABIN. When omitted, the result
            of the notebook's ``cell2`` SQL cell is loaded at call time.
            The frame passed in is never modified.
        test_path (str, optional): Unused; kept only so existing calls that
            pass it keep working.

    Returns:
        pandas.DataFrame: The numeric/boolean columns (missing values filled
        with the column mean) followed by the integer-encoded text columns.

    Raises:
        KeyError: If NAME, TICKET or CABIN is missing from ``combined_data``.
    """
    if combined_data is None:
        # Resolve the default lazily: a default of `cell2.to_pandas()` in the
        # signature is evaluated once at definition time and then shared (and
        # mutated) across calls.
        combined_data = cell2.to_pandas()

    # Work on a copy so re-running the cell always starts from the raw data.
    prepared = combined_data.copy()

    # --- Create new features from existing columns ---
    # "Braund, Mr. Owen Harris" -> "Mr": the text between the comma and the
    # first period. Names that do not match are left missing and become
    # "Unknown" below.
    prepared["Title"] = (
        prepared["NAME"].str.extract(r",\s*([^.]+)\.", expand=False).str.strip()
    )
    # Number of rows sharing the same first ticket token. value_counts() is
    # indexed by token, so it has to be mapped back onto the rows; assigning
    # it directly aligns on the row index and yields only NaN.
    ticket_prefix = prepared["TICKET"].str.split().str[0]
    prepared["Ticket_FamilySize"] = ticket_prefix.map(ticket_prefix.value_counts())
    # Purely numeric tickets keep their number; anything else becomes NaN and
    # is mean-imputed with the other numeric columns.
    prepared["Ticket_number"] = pd.to_numeric(prepared["TICKET"], errors="coerce")
    prepared["Has_Cabin"] = prepared["CABIN"].notna()

    # Drop the raw columns that the engineered features replace.
    prepared = prepared.drop(columns=["NAME", "TICKET", "CABIN"])

    # --- Handle missing values, per column kind ---
    categorical_cols = prepared.select_dtypes(exclude=["number", "bool"]).columns
    features = prepared.drop(columns=categorical_cols)
    # Mean imputation; a column that is entirely missing has no mean, so fall
    # back to 0 rather than leaving NaN in the feature matrix.
    # NOTE(review): a numeric target column with missing values would be
    # mean-imputed here as well -- confirm that is intended.
    features = features.fillna(features.mean(numeric_only=True)).fillna(0)
    categorical = prepared[categorical_cols].fillna("Unknown")

    # --- Label-encode (not one-hot) the text columns ---
    # Sorted category codes give the same integers as a LabelEncoder fitted
    # per column, without refitting one shared encoder object.
    encoded = categorical.apply(
        lambda column: column.astype(str).astype("category").cat.codes.astype("int64")
    )

    return pd.concat([features, encoded], axis=1)

# Example usage: build the model-ready frame from the function's default
# input; later cells split train_data into features and target.
train_data = prepare_titanic_data()


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import \
train_test_split,KFold, cross_val_score
from sklearn.metrics import \
accuracy_score, classification_report,confusion_matrix

In [None]:
# Separate the label column from the feature columns
y = train_data['SURVIVED']
X = train_data.drop(columns=['SURVIVED'])

In [None]:
# Show the feature matrix to check which columns go into the model
X

In [None]:
# Shuffled 5-fold splitter; the cross-validation cell further down reuses it.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Use a single fold as the hold-out split. Looping over every fold while
# overwriting these variables keeps only the final split, so select that
# split directly instead of iterating.
train_index, test_index = list(kfold.split(X))[-1]
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Fit on the training fold only, so X_test / y_test stay unseen for evaluation
rf_classifier.fit(X_train, y_train)

# Forest-level importance per feature, plus its spread across the individual trees
importances = rf_classifier.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf_classifier.estimators_], axis=0)

feature_names = X_train.columns
forest_importances = pd.Series(importances, index=feature_names)
forest_importances



In [None]:
# Predict labels for the held-out rows (X_test / y_test come from the split cell above)
y_pred = rf_classifier.predict(X_test)

# Report accuracy, per-class metrics and the confusion matrix
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy on test set:", accuracy)
print(classification_report(y_true=y_test, y_pred=y_pred))
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters; the search tries every combination
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
)

# 5-fold search over the grid, fitted on the full feature matrix and target
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5)
grid_search.fit(X, y)

# Keep the estimator that scored best in the search
best_rf_classifier = grid_search.best_estimator_

In [None]:
# Score the classifier on each fold produced by the kfold splitter
cv_results = cross_val_score(
    estimator=rf_classifier,
    X=X,
    y=y,
    scoring='accuracy',
    cv=kfold,
)
print("Cross-Validation Accuracy Scores:", cv_results)
print("Mean Cross-Validation Accuracy:", cv_results.mean())