In [1]:
import pandas as pd

# Load the training data. A forward-slash relative path resolves on Windows,
# Linux and macOS alike, and avoids backslash sequences such as "\T" that
# Python reports as invalid string escapes.
train_set = pd.read_csv("data/Train.csv")
train_set.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [2]:
# Summarise each column's dtype and non-null count to spot missing values.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [3]:
# Correlate only the numeric/boolean columns. The frame also holds string
# columns (PassengerId, Cabin, Name, ...); pandas >= 2.0 raises on those
# instead of silently skipping them, so they are filtered out explicitly.
corr_matrix = train_set.select_dtypes(include=["number", "bool"]).corr()

# Rank every feature by its linear correlation with the target.
print(corr_matrix["Transported"].sort_values(ascending=False))

Transported     1.000000
FoodCourt       0.046566
ShoppingMall    0.010141
Age            -0.075026
VRDeck         -0.207075
Spa            -0.221131
RoomService    -0.244611
Name: Transported, dtype: float64


In [4]:
# Frequency of each home planet (missing values are not counted).
train_set["HomePlanet"].value_counts()

Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64

In [5]:
# Frequency of each destination (missing values are not counted).
train_set["Destination"].value_counts()

TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: Destination, dtype: int64

In [6]:
# Encode the boolean target as 0/1 integers. An explicit astype() states the
# dtype conversion directly (no reliance on replace() downcasting the result),
# and item assignment guarantees the DataFrame column itself is updated.
# Re-running this cell is harmless: an integer column converts to itself.
train_set["Transported"] = train_set["Transported"].astype("int64")

In [7]:
# Pick the exploration subsets by column label rather than by position, so the
# selection stays correct (or fails loudly) if the CSV column order changes.
# Categorical-looking columns:
obj_set = train_set[["HomePlanet", "CryoSleep", "Destination", "VIP"]]
# Numeric columns plus the 0/1 target:
num_set = train_set[
    ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Transported"]
]

obj_set.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,VIP
0,Europa,False,TRAPPIST-1e,False
1,Earth,False,TRAPPIST-1e,False
2,Europa,False,TRAPPIST-1e,True
3,Europa,False,TRAPPIST-1e,False
4,Earth,False,TRAPPIST-1e,False


In [8]:
# Preview the numeric subset together with the encoded target column.
num_set.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,39.0,0.0,0.0,0.0,0.0,0.0,0
1,24.0,109.0,9.0,25.0,549.0,44.0,1
2,58.0,43.0,3576.0,0.0,6715.0,49.0,0
3,33.0,0.0,1283.0,371.0,3329.0,193.0,0
4,16.0,303.0,70.0,151.0,565.0,2.0,1


In [9]:
# Count how often each combination of the four categorical columns occurs.
obj_set.value_counts()

HomePlanet  CryoSleep  Destination    VIP  
Earth       False      TRAPPIST-1e    False    2163
Mars        False      TRAPPIST-1e    False     803
Earth       True       TRAPPIST-1e    False     787
Europa      False      TRAPPIST-1e    False     647
Mars        True       TRAPPIST-1e    False     546
Earth       False      55 Cancri e    False     459
Europa      True       TRAPPIST-1e    False     429
                       55 Cancri e    False     420
            False      55 Cancri e    False     365
Earth       True       PSO J318.5-22  False     348
            False      PSO J318.5-22  False     330
            True       55 Cancri e    False     197
Mars        False      55 Cancri e    False     110
            True       55 Cancri e    False      74
            False      TRAPPIST-1e    True       55
Europa      False      55 Cancri e    True       49
                       TRAPPIST-1e    True       47
Mars        False      PSO J318.5-22  False      27
Europa      True    

In [10]:
# Separate the target from the model inputs: the label is an independent copy
# of the "Transported" column, the features are every remaining column.
train_set_label = train_set["Transported"].copy()
training_set = train_set.drop(columns=["Transported"])

In [11]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Columns routed to each branch of the preprocessing transformer. Columns not
# listed here (PassengerId, Cabin, Name) are discarded, because
# ColumnTransformer's `remainder` defaults to "drop".
num_attribs = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
cat_attribs = ["HomePlanet", "CryoSleep", "Destination", "VIP"]

# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy= "median")),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: one-hot encode. handle_unknown="ignore" makes transform()
# encode a category that was absent during fit as all zeros instead of raising,
# so the fitted pipeline can safely be applied to unseen data.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs)
])

# Fit the preprocessing on the training features and transform them in one step.
train_set_prepared = full_pipeline.fit_transform(training_set)

In [12]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares fitted against the 0/1 target.
# fit() returns the estimator itself, so construction and fitting are chained.
lin_reg = LinearRegression().fit(train_set_prepared, train_set_label)
lin_reg

In [13]:
from sklearn.metrics import mean_squared_error
import numpy as np

# RMSE of the linear model on the same rows it was fitted on (a training
# error, not an estimate of generalisation).
train_set_prediction = lin_reg.predict(train_set_prepared)
lin_mse = mean_squared_error(
    y_true=train_set_label,
    y_pred=train_set_prediction,
)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.4047023796927919

In [14]:
from sklearn.tree import DecisionTreeRegressor

# Fix the seed: split selection can depend on a random feature ordering, so
# without random_state the fitted tree (and every score derived from it) can
# change between runs.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(train_set_prepared, train_set_label)

In [15]:
# RMSE of the decision tree on the rows it was trained on.
train_set_prediction = tree_reg.predict(train_set_prepared)
tree_mse = mean_squared_error(
    y_true=train_set_label,
    y_pred=train_set_prediction,
)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.20584380110721287

In [16]:
from sklearn.model_selection import cross_val_score


def display_scores(scores):
    """Print the per-fold scores followed by their mean and standard deviation.

    `scores` must expose `.mean()` and `.std()` (e.g. a NumPy array).
    """
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary:
        print(label, value)


# 5-fold cross-validation of the linear model. The scorer reports *negated*
# MSE, so the sign is flipped before taking the square root.
lin_scores = cross_val_score(
    lin_reg,
    train_set_prepared,
    train_set_label,
    scoring="neg_mean_squared_error",
    cv=5,
)
lin_rmse_scores = np.sqrt(-lin_scores)

display_scores(lin_rmse_scores)

Scores: [0.40603418 0.40557164 0.41015289 0.41379174 0.40151202]
Mean: 0.4074124953088548
Standard deviation: 0.0042025860132368625


In [17]:
# 5-fold cross-validation of the decision tree. The scorer returns negated
# MSE values, hence the sign flip before the square root.
tree_scores = cross_val_score(
    tree_reg,
    train_set_prepared,
    train_set_label,
    scoring="neg_mean_squared_error",
    cv=5,
)
tree_rmse_scores = np.sqrt(-tree_scores)
display_scores(tree_rmse_scores)

Scores: [0.48828106 0.49196119 0.49481569 0.48202951 0.47164662]
Mean: 0.48574681536602843
Standard deviation: 0.008245230464814431


In [18]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest: bootstrap sampling and per-split feature selection are
# random, so without random_state every run produces a different model and
# different scores.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(train_set_prepared, train_set_label)

In [19]:
# RMSE of the random forest on the rows it was trained on.
train_set_prediction = forest_reg.predict(train_set_prepared)
forest_mse = mean_squared_error(
    y_true=train_set_label,
    y_pred=train_set_prediction,
)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

0.2362360698273578

In [42]:
from sklearn.svm import SVC

# Support-vector *classifier* with default hyper-parameters (the variable keeps
# its historical "_reg" suffix). fit() returns the estimator, so construction
# and fitting are chained; the trailing name shows the fitted model.
supvect_reg = SVC().fit(train_set_prepared, train_set_label)
supvect_reg

In [43]:
# Training-set error of the SVC. Labels and predictions are both 0/1, so the
# MSE equals the fraction of misclassified rows and the RMSE is its square root.
train_set_prediction = supvect_reg.predict(train_set_prepared)
supvect_mse = mean_squared_error(
    y_true=train_set_label,
    y_pred=train_set_prediction,
)
supvect_rmse = np.sqrt(supvect_mse)
supvect_rmse

0.449061805583704

In [20]:
# Load the unlabeled test data. A forward-slash relative path resolves on
# Windows, Linux and macOS alike, and avoids backslash sequences such as "\T"
# that Python reports as invalid string escapes.
test_set = pd.read_csv("data/Test.csv")
test_set.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [44]:
# Work on a copy so the loaded test frame stays untouched.
X_test = test_set.copy()

# Apply the preprocessing fitted on the training data (transform only, no
# refit), then classify the prepared rows with the fitted SVC.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = supvect_reg.predict(X_test_prepared)

In [45]:
# Quick look at the predicted class labels for the test set.
print(final_predictions)

[1 0 1 ... 1 1 1]


In [48]:
# Sanity check: there should be one prediction per test-set row.
print(len(final_predictions))

4277


In [46]:
# Build the two submission columns as plain Python lists, in test-set row order.
# Predictions are 0/1 class labels; anything at or above 0.5 maps to True.
# The vectorised comparison replaces a per-element loop and yields built-in bools.
Transported = (final_predictions >= .5).tolist()

# Take the ids straight from the column instead of indexing row by row, which
# was slow and depended on the frame having a default 0..n-1 index.
PassengerId = test_set["PassengerId"].tolist()

In [47]:
# Assemble the submission table: one row per test passenger.
data = {"PassengerId": PassengerId, "Transported": Transported}

df = pd.DataFrame(data= data)

# Write the submission without the index column. to_csv() returns None when
# given a path, so the result is deliberately not bound to a name (binding it
# to `csv` would also shadow the standard-library module of that name).
df.to_csv("eliotcsv_3.csv", index=False)