<a href="https://colab.research.google.com/github/VIJAYARAGUL362/SPACE-TITANIC-CHALLENGE/blob/main/SPACESHIP_TITANIC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORTING THE LIBRARIES

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# IMPORTING THE DATASET

## GETTING THE DATA FROM KAGGLE

In [None]:
# GETTING THE KAGGLE API KEY
from google.colab import files
files.upload()

In [None]:
# INSTALLING KAGGLE
!pip install kaggle

In [None]:
# Create the .kaggle directory if it doesn't exist
!mkdir -p ~/.kaggle/

# Move the uploaded kaggle.json to the .kaggle directory
!mv kaggle.json ~/.kaggle/

# Set read/write permissions for the owner only (secure)
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# DOWNLOADING THE DATASET
!kaggle competitions download -c spaceship-titanic

In [None]:
import os
import zipfile

# Archive to extract from the current working directory
# (presumably the file written by the `kaggle competitions download` cell above).
zip_file_name = 'spaceship-titanic.zip' # Spaceship Titanic competition archive

if os.path.exists(zip_file_name):
    with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
        zip_ref.extractall('.') # Extract every member into the current directory
    print(f"'{zip_file_name}' unzipped successfully.")
else:
    # Only reports the problem; later cells that read the CSV files will still run.
    print(f"Error: '{zip_file_name}' not found. Check the filename.")

# List the working directory to confirm the extracted files are present
!ls

## LOADING THE DATASET

In [None]:
# Load the training split; pd.read_csv already returns a DataFrame.
train_dataset = pd.read_csv('train.csv')
# NOTE(review): wrapping an existing DataFrame in pd.DataFrame(...) looks redundant;
# both names are kept because later cells use `train_dataframe`.
train_dataframe = pd.DataFrame(train_dataset)

In [None]:
train_dataframe

In [None]:
train_dataframe.info()

In [None]:
# SEGREGATING THE DATASET BY COLUMN DTYPE
# object columns -> categorical inputs, float columns -> numerical inputs,
# bool columns -> target.
categorical_feature, numerical_feature, target_feature = (
    train_dataframe.select_dtypes(include=dtype_name)
    for dtype_name in ('object', 'float', 'bool')
)

In [None]:
# Remove the identifier columns from the categorical inputs.
categorical_feature = categorical_feature.drop(columns=["PassengerId","Name"])

In [None]:
categorical_feature

In [None]:
numerical_feature

# DATA PROCESSING

## CATEGORICAL VARIABLE

In [None]:
# MISSING VALUES
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment, which pandas deprecates
# and which leaves the parent frame unchanged under Copy-on-Write.
categorical_feature['HomePlanet'] = categorical_feature['HomePlanet'].fillna('Earth')

In [None]:
# MISSING VALUES
# Treat a missing CryoSleep flag as False. The result is assigned back because
# an inplace fillna on a column selection is chained assignment and does not
# reliably update the parent frame.
categorical_feature['CryoSleep'] = categorical_feature['CryoSleep'].fillna(False)

In [None]:
# MISSING VALUES
# Missing cabins become "G". The result is assigned back because an inplace
# fillna on a column selection is chained assignment and does not reliably
# update the parent frame; the next cell indexes every value with x[0] and
# would fail on a leftover NaN.
categorical_feature['Cabin'] = categorical_feature['Cabin'].fillna("G")

In [None]:
# DATA MANIPULATION
# Reduce every cabin string to its first character.
categorical_feature['Cabin'] = categorical_feature['Cabin'].map(lambda cabin: cabin[0])

In [None]:
index_id_cabin,unique_values_cabin = categorical_feature['Cabin'].factorize()

In [None]:
index_id_cabin,unique_values_cabin

In [None]:
categorical_feature['Cabin'] = index_id_cabin

In [None]:
# MISSING VALUES
# Missing destinations get the placeholder "T", which becomes its own category
# when the column is factorized. The result is assigned back because an inplace
# fillna on a column selection is chained assignment and does not reliably
# update the parent frame.
categorical_feature['Destination'] = categorical_feature['Destination'].fillna("T")

In [None]:
categorical_feature['Destination'].unique()

In [None]:
# DATA MANIPULATION
index_id_dest,unique_values_dest = categorical_feature['Destination'].factorize()

In [None]:
categorical_feature['Destination'] = index_id_dest

In [None]:
# MISSING VALUES
# Treat a missing VIP flag as False. The result is assigned back because an
# inplace fillna on a column selection is chained assignment and does not
# reliably update the parent frame.
categorical_feature['VIP'] = categorical_feature['VIP'].fillna(False)

In [None]:
categorical_feature.isna().sum()

### ONE HOT ENCODING THE CATEGORICAL DATASET

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the listed categorical columns, dropping the first level of
# each; any column not listed is passed through unchanged.
categorical_columns = ['HomePlanet','CryoSleep','Cabin','Destination','VIP']
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop="first"), categorical_columns)],
    remainder="passthrough",
)

In [None]:
encoded_cat = ct.fit_transform(categorical_feature)

In [None]:
encoded_cat

### CREATING A NEW CATEGORICAL DATAFRAME

In [None]:
cat_column_names = list(ct.get_feature_names_out())

In [None]:
new_cat_dataframe = pd.DataFrame(encoded_cat.toarray(),columns=cat_column_names)

In [None]:
new_cat_dataframe

## NUMERICAL VARIABLE

In [None]:
numerical_feature

In [None]:
# DATAFRAME INFO
numerical_feature.info()

In [None]:
numerical_feature.isna().sum()

In [None]:
# MISSING VALUES
# Fill each numerical column with its OWN mean. Passing the scalar Age mean to
# DataFrame.fillna wrote that single value into the gaps of every column
# (RoomService, ShoppingMall, Spa, VRDeck, ...), not just Age, and left nothing
# for the per-column fills in the following cells to do. Passing the Series of
# column means makes pandas match each mean to its column.
numerical_feature.fillna(numerical_feature.mean(),inplace=True)

In [None]:
# MISSING VALUES
numerical_feature['RoomService'].fillna(numerical_feature['RoomService'].mean(),inplace=True)

In [None]:
# MISSING VALUES
numerical_feature['ShoppingMall'].fillna(numerical_feature['ShoppingMall'].mean(),inplace=True)

In [None]:
# MISSING VALUES
numerical_feature['Spa'].fillna(numerical_feature['Spa'].mean(),inplace=True)

In [None]:
# MISSING VALUES
numerical_feature['VRDeck'].fillna(numerical_feature['VRDeck'].mean(),inplace=True)

In [None]:
numerical_feature

In [None]:
numerical_feature.isna().sum()

### SCALING THE NUMERICAL VARIABLE

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [None]:
scaled_num = sc.fit_transform(numerical_feature)

### CREATING A NEW NUMERICAL DATAFRAME

In [None]:
num_column_names = sc.get_feature_names_out()
num_column_names

In [None]:
new_num_dataframe = pd.DataFrame(scaled_num,columns=num_column_names)

In [None]:
new_num_dataframe

# CREATING A TRAINING DATAFRAME

In [None]:
training_dataframe = pd.concat([new_cat_dataframe,new_num_dataframe],axis=1)

In [None]:
training_dataframe

# TARGET VARIABLE

In [None]:
# Encode the boolean target as integer codes. sort=True makes the mapping
# deterministic (False -> 0, True -> 1); without it the codes follow order of
# first appearance, so the meaning of 0/1 would depend on the first row.
index_target,unique_values_target = target_feature['Transported'].factorize(sort=True)

In [None]:
index_target,unique_values_target

In [None]:
target_feature['Transported'] = index_target

In [None]:
target_feature

# SPLITTING THE DATASET

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. The shuffle seed is fixed so the
# split, and every metric computed from it, is reproducible across runs.
X_train,X_test,y_train,y_test = train_test_split(training_dataframe,target_feature,test_size=0.2,random_state=42)

In [None]:
X_train.shape,X_test.shape

In [None]:
y_train.shape,y_test.shape

# CREATING THE MODEL

In [None]:
from sklearn.svm import SVC
svc = SVC(kernel="rbf")

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=300)

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()

# TUNING THE MODEL

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# RANDOM FOREST
param_grid = {
    'n_estimators': [100, 200],  # Number of trees
    'max_depth': [10, 20, None],  # Maximum depth of trees
    'min_samples_split': [2, 5],  # Minimum samples to split a node
    'min_samples_leaf': [1, 2],   # Minimum samples at a leaf
    'max_features': ['sqrt', 'log2']  # Number of features to consider at each split
}

tuned_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='f1_macro',  # Optimize for F1-score (macro average)
    n_jobs=-1,  # Use all available CPU cores
    verbose=1
)

# TRAINING THE MODEL

In [None]:
# y_train is a one-column DataFrame; scikit-learn classifiers expect a 1-D
# target and emit a DataConversionWarning otherwise, so flatten it explicitly.
svc.fit(X_train,y_train.values.ravel())

In [None]:
# y_train is a one-column DataFrame; scikit-learn classifiers expect a 1-D
# target and emit a DataConversionWarning otherwise, so flatten it explicitly.
rfc.fit(X_train,y_train.values.ravel())

In [None]:
# y_train is a one-column DataFrame; scikit-learn classifiers expect a 1-D
# target and emit a DataConversionWarning otherwise, so flatten it explicitly.
lr.fit(X_train,y_train.values.ravel())

In [None]:
# y_train is a one-column DataFrame; scikit-learn classifiers expect a 1-D
# target and emit a DataConversionWarning otherwise, so flatten it explicitly.
dtc.fit(X_train,y_train.values.ravel())

In [None]:
# tuned_rfc.fit(X_train,y_train)

# EVALUATING THE MODEL

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Evaluate the SVC on the held-out split.
y_pred_svc = svc.predict(X_test)
score_svc = classification_report(y_test,y_pred_svc)
# classification_report returns pre-formatted multi-line text; print it so the
# table renders with real line breaks instead of an escaped string repr.
print(score_svc)

In [None]:
# Evaluate the random forest on the held-out split.
y_pred_rfc = rfc.predict(X_test)
score_rfc = classification_report(y_test,y_pred_rfc)
# classification_report returns pre-formatted multi-line text; print it so the
# table renders with real line breaks instead of an escaped string repr.
print(score_rfc)

In [None]:
# Evaluate the logistic regression on the held-out split.
y_pred_lr = lr.predict(X_test)
score_lr = classification_report(y_test,y_pred_lr)
# classification_report returns pre-formatted multi-line text; print it so the
# table renders with real line breaks instead of an escaped string repr.
print(score_lr)

In [None]:
# Evaluate the decision tree on the held-out split.
y_pred_dtc = dtc.predict(X_test)
score_dtc = classification_report(y_test,y_pred_dtc)
# classification_report returns pre-formatted multi-line text; print it so the
# table renders with real line breaks instead of an escaped string repr.
print(score_dtc)

In [None]:
# Evaluation of the grid-searched random forest; disabled because the
# corresponding `tuned_rfc.fit` call is commented out as well.
# y_pred_trfc = tuned_rfc.predict(X_test)
# score_t_rfc = classification_report(y_test,y_pred_trfc)
# score_t_rfc

# DEEP LEARNING

## IMPORTING LIBRARIES

In [None]:
import torch
from torch import nn

## DEVICE AGNOSTIC CODE

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
device

## MODEL ARCHITECTURE

In [None]:
class classification_model(nn.Module):
    """Feed-forward binary classifier that returns one raw logit per sample.

    Architecture: Linear(in_features, 32) -> BatchNorm1d -> ReLU -> Dropout ->
    Linear(32, 64) -> BatchNorm1d -> ReLU -> Dropout -> Linear(64, 1).
    No sigmoid is applied inside the network, so the output is meant to be
    paired with a logit-based loss such as ``nn.BCEWithLogitsLoss`` and passed
    through a sigmoid at prediction time.

    Args:
        in_features: Number of input columns per sample. Defaults to 20, the
            width that was previously hard-coded, so existing
            ``classification_model()`` calls behave as before.
    """

    def __init__(self, in_features: int = 20) -> None:
        super().__init__()

        self.layers = nn.Sequential(
            nn.Linear(in_features=in_features,out_features=32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(in_features=32,out_features=64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(in_features=64,out_features=1)
        )

    def forward(self,x):
        """Return logits of shape (batch, 1) for an input of shape (batch, in_features)."""
        return self.layers(x)

In [None]:
# Place the network on the same device the train/test tensors are moved to;
# leaving it on the CPU while the inputs go to CUDA raises a device-mismatch
# error on the first forward pass.
model = classification_model().to(device)

## OPTIMIZER AND COST FUNCTION

In [None]:
loss = nn.BCEWithLogitsLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

## CONVERTING THE DATASET TO TENSORS

In [None]:
# Convert the pandas train/test splits into float32 tensors on `device`.
# NOTE(review): this rebinds X_train / X_test / y_train / y_test from pandas
# objects to tensors, so the cell cannot be re-run without re-creating the split.
X_train = torch.tensor(X_train.values,dtype=torch.float).to(device)
X_test = torch.tensor(X_test.values,dtype=torch.float).to(device)
y_train = torch.tensor(y_train.values,dtype=torch.float).to(device)
y_test = torch.tensor(y_test.values,dtype=torch.float).to(device)

In [None]:
X_train.shape,X_test.shape

## TRAINING THE MODEL

In [None]:
epochs = 300

# Loss history, recorded every 100 epochs as plain Python floats.
training_loss = []
testing_loss = []
epoch_list = []

for epoch in range(epochs):

    # TRAINING MODE (dropout active, BatchNorm uses batch statistics)
    model.train()

    # FORWARD PROPAGATION (full-batch)
    y_train_pred = model(X_train)
    train_loss = loss(y_train_pred,y_train)

    # OPTIMIZER ZERO GRAD
    optimizer.zero_grad()

    # BACK PROPAGATION
    train_loss.backward()

    # GRADIENT DESCENT
    optimizer.step()

    # EVALUATING THE MODEL on the held-out split, without tracking gradients
    model.eval()
    with torch.inference_mode():
        y_test_pred = model(X_test)
        test_loss = loss(y_test_pred,y_test)

    if epoch%100 == 0:
        # .item() extracts the scalar value; appending the loss tensors
        # themselves would keep graph-attached tensors alive in the history
        # lists and print tensor reprs instead of numbers.
        train_loss_value = train_loss.item()
        test_loss_value = test_loss.item()
        epoch_list.append(epoch)
        training_loss.append(train_loss_value)
        testing_loss.append(test_loss_value)
        print(f"Epoch is {epoch}:The training loss is {train_loss_value} and the test loss is {test_loss_value}")

In [None]:
# MODEL PREDICTION
with torch.inference_mode():
    y_pred_logits = model(X_test)
    sigmoid = nn.Sigmoid()
    y_pred_nn = torch.round(sigmoid(y_pred_logits))

In [None]:
y_pred_nn

In [None]:
# scikit-learn works on host NumPy arrays: move both tensors to the CPU and
# flatten the (N, 1) columns to 1-D so the report also works when the tensors
# live on a CUDA device.
score_nn = classification_report(y_test.cpu().numpy().ravel(),y_pred_nn.cpu().numpy().ravel())

In [None]:
score_nn

# TEST DATASET

## IMPORTING THE DATASET

In [None]:
test_dataset = pd.read_csv('test.csv')
test_dataframe = pd.DataFrame(test_dataset)

In [None]:
test_dataframe

In [None]:
test_dataframe.info()

In [None]:
# SEGREGATING THE DATASET
categorical_feature_test = test_dataframe.select_dtypes('object')
numerical_feature_test = test_dataframe.select_dtypes('float')

## DATA PREPROCESSING

### CATEGORICAL FEATURE

In [None]:
# MISSING VALUES
# Each filled column is assigned back: fillna(inplace=True) on a column
# selection is chained assignment, which pandas deprecates and which leaves the
# parent frame unchanged under Copy-on-Write (the x[0] step below would then
# fail on a leftover NaN).
categorical_feature_test['Cabin'] = categorical_feature_test['Cabin'].fillna("G")
categorical_feature_test['HomePlanet'] = categorical_feature_test['HomePlanet'].fillna('Earth')
categorical_feature_test['CryoSleep'] = categorical_feature_test['CryoSleep'].fillna(False)
# Reduce every cabin string to its first character, as done for the training data.
categorical_feature_test['Cabin'] = categorical_feature_test['Cabin'].apply(lambda x: x[0])

In [None]:
# Treat a missing VIP flag as False. The result is assigned back because an
# inplace fillna on a column selection is chained assignment and does not
# reliably update the parent frame.
categorical_feature_test['VIP'] = categorical_feature_test['VIP'].fillna(False)

In [None]:
unique_values_cabin

In [None]:
unique_values_dest

In [None]:
# Use the same placeholder ("T") the training data used for a missing
# Destination, so both splits are preprocessed identically. "T" is one of the
# values in unique_values_dest because the training column was factorized after
# that fill; filling with "TRAPPIST-1e" here would instead merge missing test
# rows into a real destination that the model saw as a separate category.
categorical_feature_test['Destination'] = categorical_feature_test['Destination'].fillna("T")

In [None]:
categorical_feature_test

In [None]:
# Set the passenger ids aside in their own one-column frame, then remove the
# identifier columns from the categorical inputs.
test_passenger_id = categorical_feature_test[['PassengerId']].copy()
categorical_feature_test = categorical_feature_test.drop(columns=["PassengerId","Name"])

In [None]:
# Replace each cabin value with its position in unique_values_cabin, i.e. the
# integer code the same value received when the training column was factorized.
cabin_levels = list(unique_values_cabin)
categorical_feature_test['Cabin'] = categorical_feature_test['Cabin'].apply(cabin_levels.index)

In [None]:
# Replace each destination with its position in unique_values_dest, i.e. the
# integer code the same value received when the training column was factorized.
destination_levels = list(unique_values_dest)
categorical_feature_test['Destination'] = categorical_feature_test['Destination'].apply(destination_levels.index)

In [None]:
categorical_feature_test

In [None]:
## ENCODING THE CATEGORICAL TEST FEATURES
# Apply the ColumnTransformer (one-hot encoder) already fitted on the training
# data; it is only transformed here, not re-fitted.
enc_test_cat_dataset_sparse = ct.transform(categorical_feature_test)

In [None]:
enc_cat_test = pd.DataFrame(enc_test_cat_dataset_sparse.toarray(),columns=ct.get_feature_names_out())

In [None]:
enc_cat_test

### NUMERICAL DATASET

In [None]:
numerical_feature_test

In [None]:
numerical_feature_test.info()

In [None]:
numerical_feature_test.isna().sum()

In [None]:
# MISSING VALUES
# Fill each numerical column with its OWN mean in a single call. Passing the
# scalar Age mean to DataFrame.fillna wrote that one value into the gaps of
# every column, which also turned the per-column fills that used to follow into
# no-ops. Passing the Series of column means matches each mean to its column.
numerical_feature_test.fillna(numerical_feature_test.mean(),inplace=True)

In [None]:
numerical_feature_test

In [None]:
scaled_test_num_array = sc.transform(numerical_feature_test)

In [None]:
scaled_test_num = pd.DataFrame(scaled_test_num_array,columns=sc.get_feature_names_out())

In [None]:
scaled_test_num

### COMBINING BOTH FEATURE SETS

In [None]:
new_test_dataset = pd.concat([enc_cat_test,scaled_test_num],axis=1)

In [None]:
new_test_dataset

## INFERENCE ON THE TEST DATASET

In [None]:
# MODEL PREDICTION
tensor_test_dataset = torch.tensor(new_test_dataset.values,dtype=torch.float).to(device)
# Switch to eval mode explicitly so Dropout is disabled and BatchNorm uses its
# running statistics regardless of which cell last touched the model.
model.eval()
with torch.inference_mode():
    y_pred_logits_test = model(tensor_test_dataset)
    sigmoid = nn.Sigmoid()
    # Sigmoid turns logits into probabilities; rounding thresholds them at 0.5.
    y_pred_nn_test = torch.round(sigmoid(y_pred_logits_test))

In [None]:
y_pred_nn_test

In [None]:
# Convert the (N, 1) prediction tensor to a 1-D host NumPy array before storing
# it, rather than relying on pandas to interpret a raw torch tensor (which may
# fail outright when the tensor lives on a CUDA device).
test_passenger_id['Transported'] = y_pred_nn_test.cpu().numpy().ravel()

In [None]:
test_passenger_id['Transported'] = test_passenger_id['Transported'].apply(lambda x:True if x == 1 else False)

In [None]:
test_passenger_id.to_csv('NEUREL_NETWORK_PREDICTION.csv',index=False)

# TRAINING THE MODEL ON THE WHOLE DATASET

In [None]:
model2 = classification_model()

In [None]:
loss1 = nn.BCEWithLogitsLoss()

optimizer1 = torch.optim.Adam(model2.parameters(), lr=0.1)

In [None]:
epochs = 300

# NOTE(review): these three lists are reset here but never filled in this cell,
# because the logging code below is commented out.
training_loss = []
testing_loss = []
epoch_list = []

# Full training set as float32 tensors; they are created on the CPU (no
# `.to(device)` call), matching model2, which is not moved to `device` either.
torch_training_dataset = torch.tensor(training_dataframe.values,dtype=torch.float)
torch_target_variable = torch.tensor(target_feature.values,dtype=torch.float)


for epoch in range(epochs):

    # TRAINING MODE
    model2.train()

    # FORWARD PROPAGATION (full-batch, on every labelled row)
    y_train_pred = model2(torch_training_dataset)
    train_loss = loss1(y_train_pred,torch_target_variable)

    # OPTIMIZER ZERO GRAD
    optimizer1.zero_grad()

    # BACK PROPAGATION
    train_loss.backward()

    # GRADIENT DESCENT
    optimizer1.step()

    # EVALUATING THE MODEL
    # Disabled: the block below is carried over from the first training loop and
    # still refers to `model` / `loss` rather than `model2` / `loss1`. Because it
    # is commented out, model2 is still in train() mode when this loop ends.
    # model.eval()
    # with torch.inference_mode():
    #     y_test_pred = model(X_test)
    #     test_loss = loss(y_test_pred,y_test)

    #     if epoch%100 == 0:
    #         training_loss.append(train_loss)
    #         testing_loss.append(test_loss)
    #         print(f"Epoch is {epoch}:The training loss is {train_loss} and the test loss is {test_loss}")

In [None]:
# MODEL PREDICTION
# Put the test tensor on whatever device model2's parameters live on. model2 is
# never moved to `device`, so sending the inputs to `device` would raise a
# device-mismatch error on a CUDA machine.
model2_device = next(model2.parameters()).device
tensor_test_dataset = torch.tensor(new_test_dataset.values,dtype=torch.float).to(model2_device)
# The training loop for model2 only ever calls model2.train(), so without this
# switch Dropout would stay active and BatchNorm would normalise with the test
# batch's own statistics, making the predictions noisy and non-deterministic.
model2.eval()
with torch.inference_mode():
    y_pred_logits_test = model2(tensor_test_dataset)
    sigmoid = nn.Sigmoid()
    # Sigmoid turns logits into probabilities; rounding thresholds them at 0.5.
    y_pred_nn_test = torch.round(sigmoid(y_pred_logits_test))

In [None]:
# Convert the (N, 1) prediction tensor to a 1-D host NumPy array before storing
# it, rather than relying on pandas to interpret a raw torch tensor (which may
# fail outright when the tensor lives on a CUDA device).
test_passenger_id['Transported'] = y_pred_nn_test.cpu().numpy().ravel()

In [None]:
test_passenger_id['Transported'] = test_passenger_id['Transported'].apply(lambda x:True if x == 1 else False)

In [None]:
test_passenger_id.to_csv('NEUREL_NETWORK_PREDICTION.csv',index=False)