## Import

In [101]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Path

In [102]:
## Paths
data_folder = 'resources/data/'
output_folder = 'output/'
plots_folder = 'output/plots/'

## Load data

In [103]:
train = pd.read_csv(data_folder + "train.csv")
test = pd.read_csv(data_folder + "test.csv")

## Data cleaning

In [104]:
id = test["PassengerId"].copy()
train = train.drop(['PassengerId', 'Name'], axis=1)
test = test.drop(['PassengerId', 'Name'], axis=1)
id.shape

(4277,)

In [105]:
train[["Deck", "Cabin_num", "Side"]] = train["Cabin"].str.split("/", expand=True)
train = train.drop('Cabin', axis=1)

test[["Deck", "Cabin_num", "Side"]] = test["Cabin"].str.split("/", expand=True)
test = test.drop('Cabin', axis=1)

In [106]:
test['Cabin_num'] = test['Cabin_num'].astype(float)
train['Cabin_num'] = train['Cabin_num'].astype(float)
testFloat64 = test.select_dtypes(include=['float64'])
testObject = test.select_dtypes(include=['object'])

In [107]:
for column in testFloat64:
    test[column].fillna(train[column].mean(), inplace=True)
    train[column].fillna(train[column].mean(), inplace=True)

for column in testObject:
    test[column].fillna(train[column].mode()[0], inplace=True)
    train[column].fillna(train[column].mode()[0], inplace=True)

## Dummies data

In [108]:
objTrain = train.select_dtypes(exclude=['float64']).columns
objTest = test.select_dtypes(exclude=['float64']).columns
train = pd.get_dummies(train, columns=objTrain, drop_first=True).astype(int)
test = pd.get_dummies(test, columns=objTest, drop_first=True).astype(int)

In [109]:
X_train = train.drop('Transported_True', axis=1)
y_train = train['Transported_True']

## Scaler

In [110]:
scaler = StandardScaler()
scaler.fit(X_train)
train_scaled = scaler.transform(X_train)
test_scaled = scaler.transform(test)

## Model

In [111]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(test)

In [112]:
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)

In [113]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [114]:
test

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin_num,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,...,Destination_TRAPPIST-1e,VIP_True,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_S
0,27,0,0,0,0,0,3,0,0,1,...,1,0,0,0,0,0,0,1,0,1
1,19,0,9,0,2823,0,4,0,0,0,...,1,0,0,0,0,0,1,0,0,1
2,31,0,0,0,0,0,0,1,0,1,...,0,0,0,1,0,0,0,0,0,1
3,38,0,6652,0,181,585,1,1,0,0,...,1,0,0,1,0,0,0,0,0,1
4,20,10,0,635,0,0,5,0,0,0,...,1,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,34,0,0,0,0,0,1496,0,0,1,...,1,0,0,0,0,0,0,1,0,1
4273,42,0,847,17,10,144,600,0,0,0,...,1,0,0,0,0,0,1,0,0,1
4274,28,0,0,0,0,0,296,0,1,1,...,0,0,0,0,1,0,0,0,0,0
4275,28,0,2680,0,0,523,297,1,0,0,...,1,0,0,0,1,0,0,0,0,0


## Prediction

In [116]:
predictions = model.predict(test)
filename = output_folder + 'submissionLogistic.csv'
submission = pd.DataFrame({
    'PassengerId': id,
    'Transported': predictions
})

submission.to_csv(filename, index=False)
print(f"Saved: {filename}")
print(submission.head())

Saved: output/submissionLogistic.csv
  PassengerId  Transported
0     0013_01            1
1     0018_01            0
2     0019_01            1
3     0021_01            1
4     0023_01            1
