<a href="https://colab.research.google.com/github/PremaSanthi/Kaggle-SpaceshipTitanic/blob/main/Kaggle_SpaceshipTitanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Imports for this cell, grouped ahead of the statements that use them.
import pandas as pd
from google.colab import drive

# Mount Google Drive so the CSV stored there is reachable from the Colab VM.
drive.mount('/content/drive')

# Path of train_clean.csv inside the mounted Drive.
file_path = '/content/drive/MyDrive/Colab Notebooks/COMP2026_Visual Analytics/train_clean.csv'
train_clean = pd.read_csv(file_path)

# Show the first rows as the cell output.
train_clean.head()

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt


from collections import defaultdict

# Draw a fixed-size, seeded sample of rows for the graph.
df_sampled = train_clean.sample(200, random_state=42)

features = ['Destination', 'CryoSleep']

# Bipartite graph: passenger nodes on one side, "feature:value" nodes on the other.
G = nx.Graph()

for _, row in df_sampled.iterrows():
    passenger = row['PassengerId']
    # Add the passenger once per row instead of once per feature.
    G.add_node(passenger, bipartite=0)  # Passenger node
    for feature in features:
        feature_value = f"{feature}:{row[feature]}"
        G.add_node(feature_value, bipartite=1,
                   destination_name=row['Destination'] if feature == 'Destination' else None,
                   cryo_sleep_value=row['CryoSleep'] if feature == 'CryoSleep' else None)  # Feature node
        G.add_edge(passenger, feature_value)

# Colour per known feature node; every other node (passengers, and any
# feature value not listed here) falls back to gray below.
node_colors = {
    'Destination:55 Cancri e': 'lightblue',
    'Destination:PSO J318.5-22': 'blue',
    'Destination:TRAPPIST-1e': 'darkblue',
    'CryoSleep:True': 'lightcoral',  # Light red for True
    'CryoSleep:False': 'red'        # Red for False
}

colors = [node_colors.get(node, 'gray') for node in G.nodes()]

plt.figure(figsize=(12, 8))
# Seed the layout: spring_layout starts from random positions, so without a
# seed the figure differs on every run even though the sample is fixed.
pos = nx.spring_layout(G, k=0.15, seed=42)  # Adjust k for node spacing
nx.draw(G, pos, node_size=100, edge_color='gray', with_labels=False, node_color=colors)
plt.title("Feature-Based Network of Passengers", fontsize=20)

# Group the coloured (non-gray) nodes by colour for the legend. Graph nodes
# are unique, so no duplicate check is needed when appending.
legend_labels = defaultdict(list)
for node, color in zip(G.nodes(), colors):
    if color != 'gray':  # Exclude gray nodes (passengers / unmapped values)
        legend_labels[color].append(node)

legend_handles = [plt.Line2D([0], [0], marker='o', color='w', label=f'{", ".join(labels)}',
                              markerfacecolor=color, markersize=10)
                   for color, labels in legend_labels.items()]
plt.legend(handles=legend_handles, loc='upper right', fontsize=16)

plt.show()

In [None]:
# NOTE(review): no earlier visible cell imports seaborn, so `sns` would raise
# NameError on a fresh "Restart & Run All"; import it here where it is used.
import seaborn as sns

# Passenger counts for every HomePlanet -> Destination pair, drawn as an
# annotated heatmap.
route_counts = pd.crosstab(train_clean['HomePlanet'], train_clean['Destination'])
ax = sns.heatmap(route_counts, annot=True, cmap="Blues", fmt='.0f')

ax.set_xlabel("Destination", fontsize=14, fontweight='bold')  # Modify x label
ax.set_ylabel("HomePlanet", fontsize=14, fontweight='bold')  # Modify y label
plt.show()

In [None]:
!pip install scikit-learn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder


# --- 1. Select predictors and target -------------------------------------
features = ['CryoSleep', 'Deck']
target = 'Transported'

y = train_clean[target]
X = train_clean[features]

# --- One-hot encode the categorical predictors ----------------------------
# Dense output so the encoded array can be wrapped directly in a DataFrame.
categorical_cols = ['Deck', 'CryoSleep']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_array = encoder.fit_transform(X[categorical_cols])
encoded_features = pd.DataFrame(
    encoded_array,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=X.index,
)

# Swap the raw categorical columns for their encoded counterparts.
X = pd.concat([X.drop(columns=categorical_cols), encoded_features], axis=1)

# --- 2. Hold out 20% of the rows for evaluation ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- 3. Fit a logistic regression on the training split -------------------
model = LogisticRegression()
model.fit(X_train, y_train)

# --- 4. Predicted probabilities (second column of predict_proba) ----------
test_probabilities = model.predict_proba(X_test)
y_probs = test_probabilities[:, 1]

# --- 5. ROC curve points and the area under it ----------------------------
auc_score = roc_auc_score(y_test, y_probs)
fpr, tpr, thresholds = roc_curve(y_test, y_probs)

# --- 6. Plot the ROC curve against the chance diagonal --------------------
roc_label = f'ROC Curve (AUC = {auc_score:.2f})'
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=roc_label)
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Transported Prediction')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Imports for this cell, grouped ahead of the statements that use them.
import pandas as pd
from google.colab import drive

# Mount Google Drive and read train_clean.csv from it, so this cell does not
# rely on `train_clean` already existing in the kernel.
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/Colab Notebooks/COMP2026_Visual Analytics/train_clean.csv'
train_clean = pd.read_csv(file_path)

!pip install scikit-learn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder



# Predict `Transported` from cabin Side and Deck only, then draw the ROC curve.
features = ['Side', 'Deck']
target = 'Transported'

X = train_clean[features]
y = train_clean[target]

# One-hot encode both categorical predictors; dense output so the encoded
# array can be wrapped directly in a DataFrame.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_features = pd.DataFrame(encoder.fit_transform(X[['Deck', 'Side']]),
                                 columns=encoder.get_feature_names_out(['Deck', 'Side']),
                                 index=X.index)

# Every selected feature is categorical, so the design matrix is exactly the
# encoded frame (dropping the originals and concatenating gave the same result).
X = encoded_features

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

# Second column of predict_proba. The original line ended in a stray `]`,
# which made the whole cell a SyntaxError.
y_probs = model.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, y_probs)
auc_score = roc_auc_score(y_test, y_probs)

# ROC curve with the chance diagonal for reference.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Transported Prediction')
plt.legend(loc='lower right')
plt.show()

In [None]:
import pandas as pd
from IPython.display import display_html



def highlight_max(data):
    """Return a CSS frame that marks every cell equal to the overall maximum.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose values are compared against its single largest value
        (the maximum of the per-column maxima).

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``data``; cells equal to the overall maximum
        hold ``'background-color: yellow'`` and all other cells hold ``''``.
    """
    overall_max = data.max().max()
    blank_styles = pd.DataFrame('', index=data.index, columns=data.columns)
    # Fill the style string only where the value matches the overall maximum.
    return blank_styles.mask(data == overall_max, 'background-color: yellow')


def _bordered_highlight(table, caption):
    """Style ``table``: apply ``highlight_max``, add black borders, set ``caption``."""
    border = ('border', '1px solid black')
    highlighted = table.style.apply(highlight_max, axis=None)
    bordered = highlighted.set_table_styles([
        {'selector': 'table', 'props': [border, ('border-collapse', 'collapse')]},
        {'selector': 'th, td', 'props': [border, ('padding', '8px')]},
    ])
    return bordered.set_caption(caption)


# NOTE(review): `table_true` / `table_false` are not defined in any cell
# visible here — confirm they are created before this cell runs.
styled_table_true = _bordered_highlight(table_true, "Transported = True")
styled_table_false = _bordered_highlight(table_false, "Transported = False")

# Render each styled table to HTML so both can be placed in one flex container.
table_true_html = styled_table_true.to_html()
table_false_html = styled_table_false.to_html()

side_by_side_html = f"<div style='display: flex; gap: 20px;'>{table_true_html}{table_false_html}</div>"
display_html(side_by_side_html, raw=True)

In [None]:
import pandas as pd

# Count Transported outcomes for every (Deck, Side) combination.
deck_and_side = [train_clean['Deck'], train_clean['Side']]
table = pd.crosstab(deck_and_side, train_clean['Transported'])

# Black borders on the table and on every header/data cell, plus cell padding.
border_rule = ('border', '1px solid black')
table_css = [
    dict(selector='table', props=[border_rule, ('border-collapse', 'collapse')]),
    dict(selector='th, td', props=[border_rule, ('padding', '8px')]),
]
styled_table = table.style.set_table_styles(table_css)

display(styled_table)