# Imports

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Loading

In [None]:
# Read the train and test CSV files from the Kaggle input directory
train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
# Keep the test PassengerId column in its own variable; it is used later
# when the submission frame is built
test_ids = test["PassengerId"]

In [None]:
train.shape

In [None]:
train.head()

# Data Cleaning 

**Column drop**

In [None]:
# Columns removed from both frames before modelling
cols_to_drop = ["Cabin", "Name", "PassengerId"]

# errors="ignore" skips any label that is not in the frame, so only the
# columns that actually exist are dropped
train = train.drop(columns=cols_to_drop, errors="ignore")
test = test.drop(columns=cols_to_drop, errors="ignore")

train.head()

In [None]:
def conv(df):
    """Replace boolean values with integers (True -> 1, False -> 0).

    The frame is modified in place, so existing callers that ignore the
    return value keep working. The same frame is also returned so the call
    can be used in an assignment or a chain.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose True/False values are replaced.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # replace(..., inplace=True) returns None, so its result must not be
    # bound to a name; mutate first, then hand the frame back explicitly.
    df.replace({True: 1, False: 0}, inplace=True)
    return df
    
# Convert True/False to 1/0 in both frames (conv mutates its argument)
for frame in (train, test):
    conv(frame)

train.head()

# Exploratory Data Analysis

**Encoder**

In [None]:
# Encode categorical variables as integer codes.
# One encoder is fitted per column, on the non-missing TRAINING values only:
#   * missing values stay NaN instead of becoming a class of their own, so
#     the imputation steps further down can still see and fill them;
#   * a label that appears only in the test set maps to NaN instead of
#     raising "unseen label" from LabelEncoder.transform;
#   * every fitted encoder is kept, not just the one for the last column.
label_encoders = {}
cols = ["HomePlanet", "Destination"]
for col in cols:
    le = preprocessing.LabelEncoder().fit(train[col].dropna())
    label_encoders[col] = le
    # Series.map leaves anything absent from the mapping (NaN, unseen) as NaN
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    train[col] = train[col].map(code_by_label)
    test[col] = test[col].map(code_by_label)
    print(f"Classes for {col}: {le.classes_}")

**Missing Values**

In [None]:
train.isnull().sum()

In [None]:
def fill_missing_with_median(df, columns):
    """Fill missing values in the given columns with each column's median.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill; it is modified and also returned.
    columns : iterable of str
        Column names to process. Names that are not in ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        The same frame, with NaNs in the listed columns replaced by the
        median computed from that frame's own column.
    """
    for col in columns:
        if col in df.columns:
            median = df[col].median()
            # Assign the result back instead of calling fillna(inplace=True)
            # on df[col]: that form acts on an intermediate object and does
            # not update df under pandas Copy-on-Write.
            df[col] = df[col].fillna(median)
    return df

# Numeric columns whose missing values are replaced by the column median
columns_to_fill = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

# Each frame is filled using its own medians
train, test = (
    fill_missing_with_median(frame, columns_to_fill) for frame in (train, test)
)

train.isnull().sum()

In [None]:
from sklearn.impute import SimpleImputer

# Impute 'CryoSleep' and 'VIP' with the most frequent value.
# The imputer is fitted on the training frame only and then applied unchanged
# to the test frame, so both frames are filled with the same (training) modes
# rather than the test set being re-fitted on its own distribution.
mode_cols = ['CryoSleep', 'VIP']
imputer = SimpleImputer(strategy='most_frequent')

train[mode_cols] = imputer.fit_transform(train[mode_cols])
test[mode_cols] = imputer.transform(test[mode_cols])

train.isnull().sum()

In [None]:
train.head()

In [None]:
# Separate the feature matrix from the target column
X = train.drop(columns="Transported")
y = train["Transported"]

# Preprocessing

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

# Feature groups, each routed to its own preprocessing branch
numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

# Numeric branch: median imputation followed by standardisation
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories not seen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column group
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing -> degree-2 polynomial features -> logistic regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Model Training

**Data Splitting**

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

**Model Definition and Training**

In [None]:
# Fit every pipeline step (preprocessing, polynomial features, classifier)
# on the training split only
pipeline.fit(X_train, y_train)

**Accuracy Score**

In [None]:
# Score the fitted pipeline on the hold-out split.
# The fitted model in this notebook is `pipeline`; no object named `clf` is
# ever created, so referring to it fails on a fresh kernel.
pred = pipeline.predict(X_test)
accuracy_score(y_test, pred)

# Predictions

In [None]:
# Predict on the competition test set with the fitted pipeline (the notebook
# never defines `clf`), then pair each prediction with its PassengerId.
predictions = pipeline.predict(test)
df = pd.DataFrame({
    "PassengerId": test_ids.values,
    "Transported": predictions
})

def convb(df):
    """Replace integer flags with booleans (1 -> True, 0 -> False).

    The frame is modified in place, so callers that ignore the return value
    keep working. The same frame is also returned for use in assignments.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 1/0 values are replaced.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # replace(..., inplace=True) returns None, so do not rebind its result;
    # mutate first, then return the frame explicitly.
    df.replace({1: True, 0: False}, inplace=True)
    return df
    
# Turn the 0/1 predictions back into booleans, then write the submission CSV
convb(df)
df.to_csv("spacetsubmission_1.csv", index=False)