<a href="https://colab.research.google.com/github/Oelebrashy/Machine-Learning/blob/main/Spaceship_Competition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
from IPython.display import display, HTML
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [11]:
# Function to display DataFrame in a scrollable format
def show_scrollable(df, height=400, width='100%'):
    return display(HTML(f"""
    <div style="height: {height}px; width: {width}; overflow: auto;">
        {df.to_html(index=False)}
    </div>
    """))


In [12]:
# Load the data
train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')
sample_submission = pd.read_csv('/content/sample_submission.csv')

# Amenity spending columns; a missing value is treated as "spent nothing".
spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Imputation values are learned from the training set only and reused for the
# test set, so both frames are filled with the same statistics (the scaler
# further down is likewise fitted on the training set only).
impute_values = {
    'Age': train_data['Age'].median(),
    'HomePlanet': train_data['HomePlanet'].mode()[0],
    'Destination': train_data['Destination'].mode()[0],
}


def _clean_passengers(df, impute_values):
    """Return a cleaned copy of a raw passenger frame.

    The same steps are applied to the training and the test frame:

    * ``GroupID``: the part of ``PassengerId`` before the underscore.
    * ``InGroup``: True when another row of ``df`` shares the same ``GroupID``.
    * Missing spending values become 0; ``TotalSpending`` is their row sum.
    * Missing ``CryoSleep`` / ``VIP`` are treated as False.
    * ``Age``, ``HomePlanet`` and ``Destination`` are filled from ``impute_values``.
    * ``Cabin`` (missing -> 'Unknown/0/Unknown') is split on '/' into ``Deck``
      and ``Side``; the middle part (room number) is discarded.
    * ``Name`` and ``Cabin`` are dropped.

    Args:
        df: Raw frame as read from the competition CSV.
        impute_values: Mapping of column name -> fill value for missing entries.

    Returns:
        A new DataFrame; ``df`` itself is left untouched.
    """
    df = df.copy()

    # Group membership derived from the passenger id.
    df['GroupID'] = df['PassengerId'].str.split('_').str[0]
    df['InGroup'] = df.duplicated(subset=['GroupID'], keep=False)

    # Spending features.
    df[spending_cols] = df[spending_cols].fillna(0)
    df['TotalSpending'] = df[spending_cols].sum(axis=1)

    # Boolean flags: a missing value is treated as False.
    for flag_col in ('CryoSleep', 'VIP'):
        df[flag_col] = df[flag_col].fillna(False).astype(bool)

    # Remaining gaps are filled with the supplied (training-set) statistics.
    for col, fill_value in impute_values.items():
        df[col] = df[col].fillna(fill_value)

    # Cabin has three '/'-separated parts; only the first and last are kept.
    cabin_parts = df['Cabin'].fillna('Unknown/0/Unknown').str.split('/', expand=True)
    df['Deck'] = cabin_parts[0]
    df['Side'] = cabin_parts[2]

    return df.drop(columns=['Name', 'Cabin'])


train_data = _clean_passengers(train_data, impute_values)
test_data = _clean_passengers(test_data, impute_values)

# Display cleaned up data
show_scrollable(train_data.head(15))
show_scrollable(test_data.head(15))

PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,GroupID,InGroup,TotalSpending,Deck,Side
0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,False,0.0,B,P
0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,2,False,736.0,F,S
0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,3,True,10383.0,A,S
0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,3,True,5176.0,A,S
0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,4,False,1091.0,F,S
0005_01,Earth,False,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,True,5,False,774.0,F,P
0006_01,Earth,False,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,True,6,True,1584.0,F,S
0006_02,Earth,True,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,0.0,True,6,True,0.0,G,S
0007_01,Earth,False,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,True,7,False,1018.0,F,S
0008_01,Europa,True,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,True,8,True,0.0,B,P


PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,GroupID,InGroup,TotalSpending,Deck,Side
0013_01,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,13,False,0.0,G,S
0018_01,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,18,False,2832.0,F,S
0019_01,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,19,False,0.0,C,S
0021_01,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,21,False,7418.0,C,S
0023_01,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,23,False,645.0,F,S
0027_01,Earth,False,TRAPPIST-1e,31.0,False,0.0,1615.0,263.0,113.0,60.0,27,False,2051.0,F,P
0029_01,Europa,True,55 Cancri e,21.0,False,0.0,0.0,0.0,0.0,0.0,29,False,0.0,B,P
0032_01,Europa,True,TRAPPIST-1e,20.0,False,0.0,0.0,0.0,0.0,0.0,32,True,0.0,D,S
0032_02,Europa,True,55 Cancri e,23.0,False,0.0,0.0,0.0,0.0,0.0,32,True,0.0,D,S
0033_01,Earth,False,55 Cancri e,24.0,False,0.0,639.0,0.0,0.0,0.0,33,False,639.0,F,S


In [13]:
# Standardise the numeric columns. The scaler is fitted on the training set
# only and then applied unchanged to both frames.
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpending']
scaler = StandardScaler().fit(train_data[numerical_cols])
train_data[numerical_cols] = scaler.transform(train_data[numerical_cols])
test_data[numerical_cols] = scaler.transform(test_data[numerical_cols])

# One-hot encode the categorical columns of both frames.
categorical_cols = ['HomePlanet', 'Destination', 'Deck', 'Side']
train_data, test_data = [
    pd.get_dummies(frame, columns=categorical_cols)
    for frame in (train_data, test_data)
]

# Align the test frame with the training frame: columns present only in the
# training frame are added as all-zero columns, then the test columns are put
# in training order with the target ('Transported') left out.
missing_cols = set(train_data.columns).difference(test_data.columns)
test_data = test_data.assign(**dict.fromkeys(missing_cols, 0))
test_data = test_data[train_data.columns.drop('Transported')]

# Show the encoded frames
for encoded_frame in (train_data, test_data):
    show_scrollable(encoded_frame.head(15))

PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,GroupID,InGroup,TotalSpending,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown,Side_P,Side_S,Side_Unknown
0001_01,False,0.711945,False,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,False,1,False,-0.514066,False,True,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False
0002_01,False,-0.334037,False,-0.168073,-0.275387,-0.241771,0.217158,-0.224205,True,2,False,-0.251479,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False
0003_01,False,2.036857,True,-0.268001,1.959998,-0.283579,5.695623,-0.219796,False,3,True,3.190333,False,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,True,False
0003_02,False,0.293552,False,-0.333105,0.52301,0.336851,2.687176,-0.092818,False,3,True,1.332604,False,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,True,False
0004_01,False,-0.891895,False,0.125652,-0.237159,-0.031059,0.231374,-0.26124,True,4,False,-0.124824,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False
0005_01,False,1.060606,False,-0.333105,0.021662,-0.283579,-0.012074,-0.263003,True,5,False,-0.237921,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False
0006_01,False,-0.194573,False,-0.269515,0.683441,-0.278562,-0.270626,-0.263003,True,6,True,0.051067,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False
0006_02,True,-0.055109,False,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,True,6,True,-0.514066,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False
0007_01,False,0.433017,False,-0.333105,0.210921,-0.255149,-0.078711,-0.263003,True,7,False,-0.150868,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False
0008_01,True,-1.031359,False,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,True,8,True,-0.514066,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False


PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,GroupID,InGroup,TotalSpending,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown,Side_P,Side_S,Side_Unknown
0013_01,True,-0.124841,False,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,13,False,-0.514066,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False
0018_01,False,-0.682698,False,-0.333105,-0.275387,-0.283579,2.237598,-0.263003,18,False,0.496322,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False
0019_01,True,0.154088,False,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,19,False,-0.514066,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,True,False
0021_01,False,0.642213,False,-0.333105,3.88768,-0.283579,-0.109808,0.252842,21,False,2.132494,False,True,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False
0023_01,False,-0.612966,False,-0.317964,-0.281027,0.778343,-0.270626,-0.263003,23,False,-0.283945,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False
0027_01,False,0.154088,False,-0.333105,0.731069,0.156241,-0.170226,-0.210096,27,False,0.217681,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False
0029_01,True,-0.543234,False,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,29,False,-0.514066,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False
0032_01,True,-0.612966,False,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,32,True,-0.514066,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False
0032_02,True,-0.40377,False,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,32,True,-0.514066,False,True,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False
0033_01,False,-0.334037,False,-0.333105,0.119425,-0.283579,-0.270626,-0.263003,33,False,-0.286086,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False


In [14]:
# Final feature selection: identifier columns carry no predictive signal here
# and the target must not be part of the features.
id_cols = ['PassengerId', 'GroupID']
features = train_data.drop(columns=id_cols + ['Transported'])
target = train_data['Transported']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate on the held-out validation split
y_pred = model.predict(X_val)
print(f'Accuracy: {accuracy_score(y_val, y_pred)}')

# Build the test feature matrix WITHOUT overwriting test_data, so this cell can
# be re-run and PassengerId stays available for the submission below.
X_test = test_data.drop(columns=id_cols)
assert list(X_test.columns) == list(features.columns), "test features do not match training features"
test_predictions = model.predict(X_test)

# Prepare the submission. Ids are taken from the same frame the predictions
# were made on, so each prediction is paired with its own passenger rather than
# relying on sample_submission having the same row order.
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Transported': test_predictions,
})
submission_path = '/content/submission.csv'
submission.to_csv(submission_path, index=False)

# Display the submission file
show_scrollable(submission.head(15))

Accuracy: 0.7826336975273146


PassengerId,Transported
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True
0027_01,False
0029_01,True
0032_01,True
0032_02,True
0033_01,True
