<a href="https://colab.research.google.com/github/ShanthiniJoshitha/Spaceship-Titanic/blob/main/Spaceship_Titanic_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive for loading data from Colab repository
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


**PRE PROCESSING**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Read data from your CSV
data = pd.read_csv('/content/train.csv')

# Drop unnecessary columns
data.drop(['PassengerId', 'Name'], axis=1, inplace=True)

# Fill missing values forward for simplicity
data = data.ffill()

# Features and target
X = data.drop('Transported', axis=1)  # Transported is the target
y = data['Transported'].astype(int)  # Converting boolean to int (1 or 0)

# Split the data into training, validation, and test sets (70% train, 15% validation, 15% test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Identify numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object', 'bool']).columns

# Preprocessing pipeline: impute missing values, scale numerical, and encode categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean for numerical columns
            ('scaler', StandardScaler())  # Scale numerical data
        ]), numerical_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with most frequent value for categorical columns
            ('encoder', OneHotEncoder(handle_unknown='ignore'))  # Encode categorical data and handle unknown categories
        ]), categorical_cols)
    ]
)

# Feature Engineering: Add Polynomial Features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Build a common preprocessing pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', poly)
])

# Apply transformations to the training, validation, and test sets
X_train_processed = pipeline.fit_transform(X_train)
X_val_processed = pipeline.transform(X_val)
X_test_processed = pipeline.transform(X_test)


  data = data.ffill()


In [None]:
data.head(10)

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True
5,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,True
6,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,True
7,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,0.0,True
8,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,True
9,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,True


In [None]:
data.shape

(8693, 12)

In [None]:
X.shape

(8693, 11)

In [None]:
y.shape

(8693,)

**LOGISTIC REGRESSION**

In [None]:
# Model training and evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train a Logistic Regression model as an example
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train_processed, y_train)

# Evaluate on validation set
y_val_pred = model_lr.predict(X_val_processed)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(classification_report(y_val, y_val_pred))

# Evaluate on test set
y_test_pred = model_lr.predict(X_test_processed)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.4f}')
print(classification_report(y_test, y_test_pred))


Validation Accuracy: 0.7822
              precision    recall  f1-score   support

           0       0.80      0.76      0.78       663
           1       0.77      0.80      0.78       641

    accuracy                           0.78      1304
   macro avg       0.78      0.78      0.78      1304
weighted avg       0.78      0.78      0.78      1304

Test Accuracy: 0.7899
              precision    recall  f1-score   support

           0       0.80      0.74      0.77       626
           1       0.78      0.83      0.80       678

    accuracy                           0.79      1304
   macro avg       0.79      0.79      0.79      1304
weighted avg       0.79      0.79      0.79      1304



**RANDOM FOREST**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train a Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train_processed, y_train)

# Evaluate on validation set
y_val_pred = model.predict(X_val_processed)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(classification_report(y_val, y_val_pred))

# Evaluate on test set
y_test_pred = model.predict(X_test_processed)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.4f}')
print(classification_report(y_test, y_test_pred))


Validation Accuracy: 0.7914
              precision    recall  f1-score   support

           0       0.79      0.80      0.80       663
           1       0.79      0.78      0.79       641

    accuracy                           0.79      1304
   macro avg       0.79      0.79      0.79      1304
weighted avg       0.79      0.79      0.79      1304

Test Accuracy: 0.7860
              precision    recall  f1-score   support

           0       0.78      0.77      0.77       626
           1       0.79      0.80      0.80       678

    accuracy                           0.79      1304
   macro avg       0.79      0.79      0.79      1304
weighted avg       0.79      0.79      0.79      1304



**ENSEMBLE OF LOGISTIC REGRESSION AND RANDOM FOREST**

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the models
log_reg = LogisticRegression(max_iter=1000)
random_forest = RandomForestClassifier(n_estimators=20, random_state=42)

# Create a voting classifier
voting_classifier = VotingClassifier(
    estimators=[
        ('log_reg', log_reg),
        ('random_forest', random_forest)
    ],
    voting='hard'  # 'hard' for majority voting, 'soft' for average probability
)

# Train the voting classifier
voting_classifier.fit(X_train_processed, y_train)

# Evaluate on validation set
y_val_pred = voting_classifier.predict(X_val_processed)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(classification_report(y_val, y_val_pred))

# Evaluate on test set
y_test_pred = voting_classifier.predict(X_test_processed)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.4f}')
print(classification_report(y_test, y_test_pred))


Validation Accuracy: 0.7799
              precision    recall  f1-score   support

           0       0.76      0.83      0.79       663
           1       0.81      0.73      0.76       641

    accuracy                           0.78      1304
   macro avg       0.78      0.78      0.78      1304
weighted avg       0.78      0.78      0.78      1304

Test Accuracy: 0.7776
              precision    recall  f1-score   support

           0       0.75      0.80      0.77       626
           1       0.80      0.76      0.78       678

    accuracy                           0.78      1304
   macro avg       0.78      0.78      0.78      1304
weighted avg       0.78      0.78      0.78      1304



**RANDOM FOREST WITH 50 ESTIMATORS**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train a Random Forest model
model = RandomForestClassifier(n_estimators=50,random_state=42)
model.fit(X_train_processed, y_train)

# Evaluate on validation set
y_val_pred = model.predict(X_val_processed)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(classification_report(y_val, y_val_pred))

# Evaluate on test set
y_test_pred = model.predict(X_test_processed)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.4f}')
print(classification_report(y_test, y_test_pred))


Validation Accuracy: 0.7914
              precision    recall  f1-score   support

           0       0.79      0.81      0.80       663
           1       0.80      0.77      0.78       641

    accuracy                           0.79      1304
   macro avg       0.79      0.79      0.79      1304
weighted avg       0.79      0.79      0.79      1304

Test Accuracy: 0.7814
              precision    recall  f1-score   support

           0       0.78      0.76      0.77       626
           1       0.79      0.80      0.79       678

    accuracy                           0.78      1304
   macro avg       0.78      0.78      0.78      1304
weighted avg       0.78      0.78      0.78      1304



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train a Random Forest model
model1 = RandomForestClassifier(n_estimators=20,random_state=42)
model1.fit(X_train_processed, y_train)

# Evaluate on validation set
y_val_pred = model1.predict(X_val_processed)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(classification_report(y_val, y_val_pred))

# Evaluate on test set
y_test_pred = model1.predict(X_test_processed)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.4f}')
print(classification_report(y_test, y_test_pred))


Validation Accuracy: 0.7860
              precision    recall  f1-score   support

           0       0.78      0.80      0.79       663
           1       0.79      0.77      0.78       641

    accuracy                           0.79      1304
   macro avg       0.79      0.79      0.79      1304
weighted avg       0.79      0.79      0.79      1304

Test Accuracy: 0.7807
              precision    recall  f1-score   support

           0       0.77      0.77      0.77       626
           1       0.79      0.79      0.79       678

    accuracy                           0.78      1304
   macro avg       0.78      0.78      0.78      1304
weighted avg       0.78      0.78      0.78      1304



In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Assuming model_lr and model_rf are already fitted

# Create a voting classifier with fitted models
voting_classifier = VotingClassifier(
    estimators=[
        ('log_reg', model_lr),   # Use the fitted logistic regression model
        ('random_forest', model)  # Use the fitted random forest model
    ],
    voting='hard'  # 'hard' for majority voting, 'soft' for average probability
)

# Fit the voting classifier using dummy data (we will only use it to set the state)
dummy_y_train = np.zeros(X_train_processed.shape[0])  # Create a dummy target array
voting_classifier.fit(X_train_processed, dummy_y_train)  # Fit with dummy data

# Now you can evaluate on validation set
y_val_pred = voting_classifier.predict(X_val_processed)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(classification_report(y_val, y_val_pred))

# Evaluate on test set
y_test_pred = voting_classifier.predict(X_test_processed)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.4f}')
print(classification_report(y_test, y_test_pred))


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0