In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize


In [2]:
train = pd.read_csv("D:\\vs code\\spaceship-titanic\\train.csv")
test = pd.read_csv("D:\\vs code\\spaceship-titanic\\test.csv")
y_test = pd.read_csv("D:\\vs code\\spaceship-titanic\\sample_submission.csv")

In [3]:
train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
# remove target and unimportant columns on our dataset:

y_train = train["Transported"]
train_name = train["Name"]
train_id = train["PassengerId"]
train = train.drop(["Name","PassengerId","Cabin","Transported"],axis=1)

In [6]:
print("HomePlanet",train["HomePlanet"].unique())
print("Destination",train["Destination"].unique())


HomePlanet ['Europa' 'Earth' 'Mars' nan]
Destination ['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e' nan]


In [7]:
print("HomePlanet: ",train["HomePlanet"].value_counts())
print("Destination",train["Destination"].value_counts())

HomePlanet:  HomePlanet
Earth     4602
Europa    2131
Mars      1759
Name: count, dtype: int64
Destination Destination
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: count, dtype: int64


In [8]:
# now we want to change type of object to int or float (expect: boolian type:)
def change_type(df):
    df["HomePlanet"] = df["HomePlanet"].map({"Earth":3,"Europa":2,"Mars":1})
    df["Destination"] = df["Destination"].map({"TRAPPIST-1e":3,"55 Cancri e":2,"PSO J318.5-22":1})
    return df


In [9]:
# fill nan in datasets
def fill_nan(df):
    df["VIP"] = df["VIP"].fillna(False)
    df["CryoSleep"] = df["CryoSleep"].fillna(False)
    
    df["VRDeck"] = df["VRDeck"].fillna(df["VRDeck"].median())
    df["Spa"] = df["Spa"].fillna(df["Spa"].median())
    df["ShoppingMall"] = df["ShoppingMall"].fillna(df["ShoppingMall"].median())
    df["FoodCourt"] = df["FoodCourt"].fillna(df["FoodCourt"].median())
    df["RoomService"] = df["RoomService"].fillna(df["RoomService"].median())
    
    df["HomePlanet"] = df["HomePlanet"].fillna(method="ffill")
    df["Destination"] = df["Destination"].fillna(method="ffill")
    return df


In [10]:
# convert boolian types to int or float:
def change_bool(df):
    df["CryoSleep"] = df["CryoSleep"].astype(int)
    df["VIP"] = df["VIP"].astype(int)
    return df
 

In [11]:
change_type(train)
fill_nan(train)
change_bool(train)


  df["VIP"] = df["VIP"].fillna(False)
  df["CryoSleep"] = df["CryoSleep"].fillna(False)
  df["HomePlanet"] = df["HomePlanet"].fillna(method="ffill")
  df["Destination"] = df["Destination"].fillna(method="ffill")


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,2.0,0,3.0,39.0,0,0.0,0.0,0.0,0.0,0.0
1,3.0,0,3.0,24.0,0,109.0,9.0,25.0,549.0,44.0
2,2.0,0,3.0,58.0,1,43.0,3576.0,0.0,6715.0,49.0
3,2.0,0,3.0,33.0,0,0.0,1283.0,371.0,3329.0,193.0
4,3.0,0,3.0,16.0,0,303.0,70.0,151.0,565.0,2.0
...,...,...,...,...,...,...,...,...,...,...
8688,2.0,0,2.0,41.0,1,0.0,6819.0,0.0,1643.0,74.0
8689,3.0,1,1.0,18.0,0,0.0,0.0,0.0,0.0,0.0
8690,3.0,0,3.0,26.0,0,0.0,0.0,1872.0,1.0,0.0
8691,2.0,0,2.0,32.0,0,0.0,1049.0,0.0,353.0,3235.0


In [12]:
train["Transported"] = y_train
train["Transported"] = train["Transported"].astype(int)

# remove the nan value:
train = train.dropna()
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8514 entries, 0 to 8692
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8514 non-null   float64
 1   CryoSleep     8514 non-null   int32  
 2   Destination   8514 non-null   float64
 3   Age           8514 non-null   float64
 4   VIP           8514 non-null   int32  
 5   RoomService   8514 non-null   float64
 6   FoodCourt     8514 non-null   float64
 7   ShoppingMall  8514 non-null   float64
 8   Spa           8514 non-null   float64
 9   VRDeck        8514 non-null   float64
 10  Transported   8514 non-null   int32  
dtypes: float64(8), int32(3)
memory usage: 698.4 KB


In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8514 entries, 0 to 8692
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8514 non-null   float64
 1   CryoSleep     8514 non-null   int32  
 2   Destination   8514 non-null   float64
 3   Age           8514 non-null   float64
 4   VIP           8514 non-null   int32  
 5   RoomService   8514 non-null   float64
 6   FoodCourt     8514 non-null   float64
 7   ShoppingMall  8514 non-null   float64
 8   Spa           8514 non-null   float64
 9   VRDeck        8514 non-null   float64
 10  Transported   8514 non-null   int32  
dtypes: float64(8), int32(3)
memory usage: 698.4 KB


In [14]:
train.groupby(["HomePlanet","Transported"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
HomePlanet,Transported,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1.0,0,836,836,836,836,836,836,836,836,836
1.0,1,917,917,917,917,917,917,917,917,917
2.0,0,727,727,727,727,727,727,727,727,727
2.0,1,1402,1402,1402,1402,1402,1402,1402,1402,1402
3.0,0,2663,2663,2663,2663,2663,2663,2663,2663,2663
3.0,1,1969,1969,1969,1969,1969,1969,1969,1969,1969


In [15]:
train.groupby(["Destination","Transported"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,HomePlanet,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
Destination,Transported,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1.0,0,399,399,399,399,399,399,399,399,399
1.0,1,403,403,403,403,403,403,403,403,403
2.0,0,707,707,707,707,707,707,707,707,707
2.0,1,1098,1098,1098,1098,1098,1098,1098,1098,1098
3.0,0,3120,3120,3120,3120,3120,3120,3120,3120,3120
3.0,1,2787,2787,2787,2787,2787,2787,2787,2787,2787


In [16]:
train.groupby(["VIP","Transported"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,HomePlanet,CryoSleep,Destination,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
VIP,Transported,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,4104,4104,4104,4104,4104,4104,4104,4104,4104
0,1,4212,4212,4212,4212,4212,4212,4212,4212,4212
1,0,122,122,122,122,122,122,122,122,122
1,1,76,76,76,76,76,76,76,76,76


In [17]:
y_train = train["Transported"]
train = train.drop("Transported",axis=1)
norm = normalize(train.values)

train = pd.DataFrame(norm,columns=train.columns)
train

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0.051064,0.000000,0.076596,0.995754,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.005332,0.000000,0.005332,0.042659,0.000000,0.193744,0.015997,0.044437,0.975828,0.078208
2,0.000263,0.000000,0.000394,0.007623,0.000131,0.005652,0.470011,0.000000,0.882586,0.006440
3,0.000557,0.000000,0.000835,0.009186,0.000000,0.000000,0.357157,0.103278,0.926715,0.053727
4,0.004528,0.000000,0.004528,0.024148,0.000000,0.457303,0.105648,0.227897,0.852726,0.003019
...,...,...,...,...,...,...,...,...,...,...
8509,0.000285,0.000000,0.000285,0.005845,0.000143,0.000000,0.972108,0.000000,0.234224,0.010549
8510,0.163908,0.054636,0.054636,0.983445,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8511,0.001602,0.000000,0.001602,0.013888,0.000000,0.000000,0.000000,0.999901,0.000534,0.000000
8512,0.000585,0.000000,0.000585,0.009359,0.000000,0.000000,0.306792,0.000000,0.103239,0.946114


In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score


In [40]:
# svm 
svm = SVC(kernel="poly")
svm.fit(train,y_train)
y_pred = svm.predict(train)

accuracy_score(y_true=y_train,y_pred=y_pred)
scores = cross_val_score(svm, train,y_train, cv=6)
scores

array([0.79069767, 0.78083157, 0.77307963, 0.80902044, 0.80831572,
       0.79985906])

In [41]:
# DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=40,)
tree.fit(train,y_train)
y_pred = tree.predict(train)

print("acc",accuracy_score(y_true=y_train,y_pred=y_pred))
scores = cross_val_score(tree, train,y_train, cv=5)
scores

acc 0.9304674653511863


array([0.74221961, 0.73576042, 0.72871403, 0.73517322, 0.73678026])

In [21]:
# KNN
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(train,y_train)
y_pred = knn.predict(train)

print("acc",accuracy_score(y_true=y_train,y_pred=y_pred))
scores = cross_val_score(knn, train,y_train, cv=5)
scores

acc 0.8185341789992953


array([0.78860834, 0.77099237, 0.76981797, 0.79095713, 0.79553467])

In [22]:
# test:
test_name = test["Name"]
test_id = test["PassengerId"]
test = test.drop(["Name","PassengerId","Cabin",],axis=1)

In [23]:
change_type(test)
fill_nan(test)
change_bool(test)


  df["VIP"] = df["VIP"].fillna(False)
  df["CryoSleep"] = df["CryoSleep"].fillna(False)
  df["HomePlanet"] = df["HomePlanet"].fillna(method="ffill")
  df["Destination"] = df["Destination"].fillna(method="ffill")


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,3.0,1,3.0,27.0,0,0.0,0.0,0.0,0.0,0.0
1,3.0,0,3.0,19.0,0,0.0,9.0,0.0,2823.0,0.0
2,2.0,1,2.0,31.0,0,0.0,0.0,0.0,0.0,0.0
3,2.0,0,3.0,38.0,0,0.0,6652.0,0.0,181.0,585.0
4,3.0,0,3.0,20.0,0,10.0,0.0,635.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
4272,3.0,1,3.0,34.0,0,0.0,0.0,0.0,0.0,0.0
4273,3.0,0,3.0,42.0,0,0.0,847.0,17.0,10.0,144.0
4274,1.0,1,2.0,,0,0.0,0.0,0.0,0.0,0.0
4275,2.0,0,2.0,,0,0.0,2680.0,0.0,0.0,523.0


In [24]:
test["Transported"] = y_test["Transported"]
test["Transported"] = test["Transported"].astype(int)

# remove the nan value:
test = test.dropna()
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4186 entries, 0 to 4276
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    4186 non-null   float64
 1   CryoSleep     4186 non-null   int32  
 2   Destination   4186 non-null   float64
 3   Age           4186 non-null   float64
 4   VIP           4186 non-null   int32  
 5   RoomService   4186 non-null   float64
 6   FoodCourt     4186 non-null   float64
 7   ShoppingMall  4186 non-null   float64
 8   Spa           4186 non-null   float64
 9   VRDeck        4186 non-null   float64
 10  Transported   4186 non-null   int32  
dtypes: float64(8), int32(3)
memory usage: 343.4 KB


In [25]:
y_test = test["Transported"]
test = test.drop("Transported",axis=1)
norm_test = normalize(test.values)

test = pd.DataFrame(norm_test,columns=test.columns)
test

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0.109691,0.036564,0.109691,0.987218,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.001063,0.000000,0.001063,0.006730,0.0,0.000000,0.003188,0.000000,0.999971,0.000000
2,0.064216,0.032108,0.064216,0.995350,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000299,0.000000,0.000449,0.005688,0.0,0.000000,0.995773,0.000000,0.027095,0.087572
4,0.004721,0.000000,0.004721,0.031476,0.0,0.015738,0.000000,0.999358,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
4181,0.000260,0.000000,0.000779,0.011164,0.0,0.012203,0.000000,0.999863,0.000000,0.000000
4182,0.003464,0.000000,0.003464,0.046193,0.0,0.000000,0.998915,0.000000,0.003464,0.000000
4183,0.087519,0.029173,0.087519,0.991882,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
4184,0.003487,0.000000,0.003487,0.048814,0.0,0.000000,0.984407,0.019758,0.011622,0.167361


In [39]:
y_pred = svm.predict(test)
y_test[0] = 1

accuracy_score(y_true=y_test,y_pred=y_pred)
#scores = cross_val_score(svm, test,y_test, cv=2)
#scores

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test[0] = 1
1 fits failed out of a total of 2.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Asus\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Asus\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs

In [42]:
y_pred = tree.predict(test)

print("acc",accuracy_score(y_true=y_test,y_pred=y_pred))
scores = cross_val_score(tree, test,y_test, cv=5)
scores

acc 0.4519827998088868




array([0.99880668, 1.        , 1.        , 1.        , 1.        ])

In [43]:
y_pred = knn.predict(train)

print("acc",accuracy_score(y_true=y_train,y_pred=y_pred))
scores = cross_val_score(knn, train,y_train, cv=5)
scores

acc 0.8185341789992953


array([0.78860834, 0.77099237, 0.76981797, 0.79095713, 0.79553467])