In [1312]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier

# Random
import random

# Récupération des datasets


In [1313]:
# Load the three CSV files from the working directory
df_train, df_test, df_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Work on a copy so that df_train keeps the raw training data
df_used = df_train.copy()

# Fonctions de remplissage des données

In [1314]:
# Columns holding the passenger expenses
EXPENSE_COLUMNS = [
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]

### Ajout de la colonne total_expenses (somme des dépenses)

In [1315]:
def add_total_expenses(df):
    """Add a ``total_expenses`` column: the row-wise sum of the expense columns.

    The column is written onto ``df`` itself, which is also returned.
    Uses pandas' default ``sum`` settings, so missing expenses are skipped.
    """
    expenses = df[EXPENSE_COLUMNS]
    df['total_expenses'] = expenses.sum(axis='columns')
    return df

### Remplissage des données manquantes de la colonne CryoSleep

In [1316]:
def fill_cryosleep(df):
    """Fill missing ``CryoSleep`` values from the expenses, then zero expenses of sleepers.

    Requires the ``total_expenses`` column. ``df`` is modified in place and returned.
    """
    cryo_unknown = df['CryoSleep'].isnull()
    spent_nothing = df['total_expenses'] == 0

    # Unknown CryoSleep: True when nothing was spent, False otherwise
    df.loc[cryo_unknown & spent_nothing, 'CryoSleep'] = True
    df.loc[cryo_unknown & ~spent_nothing, 'CryoSleep'] = False

    # Passengers in cryosleep get every expense column set to 0
    df.loc[df['CryoSleep'], EXPENSE_COLUMNS] = 0
    return df

### Remplissage des données manquantes de Cabin

In [1317]:
def assign_cabin_deck(home_planet):
    """Draw a cabin deck letter at random according to the passenger's home planet.

    Earth draws from G/F (weights 2:1), Europa from B/C (1:1) and Mars from
    F/E (4:1). Any other value, including a missing one, yields 'F'.
    """
    deck_distributions = (
        ('Earth', ['G', 'F'], [2, 1]),
        ('Europa', ['B', 'C'], [1, 1]),
        ('Mars', ['F', 'E'], [4, 1]),
    )
    for planet, decks, weights in deck_distributions:
        if home_planet == planet:
            return random.choices(decks, weights=weights)[0]
    return 'F'

In [1318]:
def fill_missing_cabins(df):
    """Replace every missing ``Cabin`` with a randomly generated "deck/number/side" string.

    The number is drawn from 1..2000 and the side from P/S. The deck depends on
    ``HomePlanet`` (see ``assign_cabin_deck``) and is 'F' when the home planet is
    missing. ``df`` is modified in place and returned.
    """
    rows_without_cabin = df.index[df['Cabin'].isnull()]

    for row_label in rows_without_cabin:
        origin = df.loc[row_label, 'HomePlanet']
        number = random.randint(1, 2000)
        side = random.choice(['P', 'S'])
        deck = assign_cabin_deck(origin) if pd.notna(origin) else 'F'
        df.loc[row_label, 'Cabin'] = f"{deck}/{number}/{side}"
    return df

### Remplissage des données manquantes de HomePlanet

In [1319]:
def assign_home_planet(cabin_deck):
    """Pick a home planet for a passenger from the deck letter of their cabin.

    Decks A, B and C map to 'Europa' and deck G to 'Earth'. Decks D, E and F
    draw a planet at random with fixed weights. Any other deck yields 'Europa'.
    """
    if cabin_deck in ['A', 'B', 'C']:
        return 'Europa'

    # Decks shared by several planets: (deck, candidate planets, weights)
    mixed_decks = (
        ('D', ['Europa', 'Mars'], [2, 3]),
        ('E', ['Earth', 'Europa', 'Mars'], [4, 1, 3]),
        ('F', ['Earth', 'Mars'], [16, 11]),
    )
    for deck, planets, weights in mixed_decks:
        if cabin_deck == deck:
            return random.choices(planets, weights=weights)[0]

    return 'Earth' if cabin_deck == 'G' else 'Europa'

In [1320]:
def fill_missing_home_planet(df):
    """Fill missing ``HomePlanet`` values from the deck letter of the passenger's cabin.

    The deck is the first character of ``Cabin`` and is passed to
    ``assign_home_planet``. When the cabin itself is missing (or empty) no deck
    is available, so ``None`` is passed and the helper's default branch applies
    instead of raising on a non-string value.

    ``df`` is modified in place and returned.
    """
    missing_home_planet = df['HomePlanet'].isnull()

    for idx in df[missing_home_planet].index:
        cabin = df.loc[idx, 'Cabin']
        # A missing cabin is a float NaN and cannot be indexed like a string
        cabin_deck = cabin[0] if isinstance(cabin, str) and cabin else None
        df.loc[idx, 'HomePlanet'] = assign_home_planet(cabin_deck)
    return df

Solution provenant de "🚀 Spaceship Titanic: A complete guide 🏆" pour préciser les données

In [1321]:
def fill_missing_home_planet_from_internet_solution(df):
    """Fill missing ``HomePlanet`` values using group, cabin deck and surname.

    Three passes are applied in order, each only touching rows still missing:

    1. Group: the travelling group is the part of ``PassengerId`` before the
       underscore; a missing planet takes the most common planet of the group.
    2. Cabin deck: decks A, B, C and T are set to 'Europa', deck G to 'Earth'.
    3. Surname: the last word of ``Name``; a missing planet takes the most
       common planet among passengers with the same surname.

    Rows that match none of the rules keep a missing ``HomePlanet``.
    Adds the helper columns ``Group`` and ``Surname``. ``df`` is modified in
    place and returned.
    """
    # Pass 1: passengers of the same group
    df['Group'] = df['PassengerId'].str.split('_').str[0].astype(int)
    _fill_home_planet_by_majority(df, 'Group')

    # Pass 2: cabin deck, evaluated per row (first character of each row's cabin)
    deck = df['Cabin'].map(
        lambda cabin: cabin[0] if isinstance(cabin, str) and cabin else None
    )
    df.loc[df['HomePlanet'].isna() & deck.isin(['A', 'B', 'C', 'T']), 'HomePlanet'] = 'Europa'
    df.loc[df['HomePlanet'].isna() & deck.isin(['G']), 'HomePlanet'] = 'Earth'

    # Pass 3: passengers sharing a surname
    df['Surname'] = df['Name'].str.split().str[-1]
    _fill_home_planet_by_majority(df, 'Surname')

    return df


def _fill_home_planet_by_majority(df, key):
    """Fill missing ``HomePlanet`` with the most common planet among rows sharing ``key``."""
    known = df['HomePlanet'].notna() & df[key].notna()
    if not known.any():
        # Nothing to learn from: no row has both a key and a home planet
        return

    counts = df.groupby([key, 'HomePlanet'])['HomePlanet'].size().unstack().fillna(0)
    # Computed once; ties resolve to the first planet column
    majority_planet = counts.idxmax(axis=1)

    fillable = df['HomePlanet'].isna() & df[key].isin(majority_planet.index)
    # Label-based selection, so this works with any index, not only 0..n-1
    df.loc[fillable, 'HomePlanet'] = df.loc[fillable, key].map(majority_planet)


### Remplissage des données manquantes de Destination (planète de destination)

In [1322]:
def assign_destination_planet(home_planet):
    """Draw a destination at random according to the passenger's home planet.

    Earth and Europa each have their own weighted list of destinations; any
    other home planet (including a missing one) yields 'TRAPPIST-1e'.
    """
    if home_planet == 'Earth':
        destinations = ['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e']
        weights = [31, 7, 7]
    elif home_planet == 'Europa':
        destinations = ['TRAPPIST-1e', '55 Cancri e']
        weights = [11, 8]
    else:
        return 'TRAPPIST-1e'
    return random.choices(destinations, weights=weights)[0]

In [1323]:
def fill_missing_destination_planet(df):
    """Fill each missing ``Destination`` with one drawn from the row's ``HomePlanet``.

    The draw is delegated to ``assign_destination_planet``. ``df`` is modified
    in place and returned.
    """
    destination_unknown = df['Destination'].isnull()

    # Walk the affected rows in index order, one random draw per row
    for row_label, origin in df.loc[destination_unknown, 'HomePlanet'].items():
        df.loc[row_label, 'Destination'] = assign_destination_planet(origin)
    return df

### Remplissage des données manquantes de Age

In [1324]:
def fill_missing_age(df):
    """Fill missing ages at random and add one indicator column per 10-year bracket.

    Missing ``Age`` values are replaced by a random integer between 20 and 60
    (inclusive), one draw per missing row in index order. Ten 0/1 columns
    ``Age_0_9`` ... ``Age_90_99`` are then added. Each bracket covers
    ``start <= Age < start + 10``, so a fractional age such as 9.5 still falls
    in exactly one bracket. Ages of 100 or more fall in none.

    ``df`` is modified in place and returned.
    """
    missing_age_mask = df['Age'].isnull()
    n_missing = int(missing_age_mask.sum())

    if n_missing:
        # Single assignment instead of a row-by-row loop; same sequence of draws
        df.loc[missing_age_mask, 'Age'] = [random.randint(20, 60) for _ in range(n_missing)]

    for start_age in range(0, 100, 10):
        end_age = start_age + 9
        column_name = f'Age_{start_age}_{end_age}'
        # Exclusive upper bound: no gap between consecutive brackets
        in_bracket = (df['Age'] >= start_age) & (df['Age'] < start_age + 10)
        df[column_name] = in_bracket.astype(int)

    return df

### Remplissage des données manquantes Expenses

In [1325]:
def fill_missing_expenses(df):
    """Replace missing values in the expense columns with 0.

    ``df`` is modified in place and returned.
    """
    df[EXPENSE_COLUMNS] = df[EXPENSE_COLUMNS].fillna(0)
    return df

# Fonctions d'ajout des données

### Ajout des infos cabines

In [1326]:
def add_cabin_info(df):
    """Split ``Cabin`` ("deck/number/side") into deck, side and number-range features.

    Adds ``Cabin_Deck`` (first character), ``Cabin_Side`` (last character) and
    seven 0/1 columns ``Cabin_Number_1_300`` ... ``Cabin_Number_1801_2100``
    telling which inclusive range of 300 the cabin number falls in.

    ``df`` is modified in place and returned.
    """
    df['Cabin_Deck'] = df['Cabin'].str[0]
    df['Cabin_Side'] = df['Cabin'].str[-1]

    # Parse the cabin number once, not once per range; missing cabins give NaN
    cabin_number = df['Cabin'].apply(lambda x: int(x.split('/')[1]) if pd.notna(x) else np.nan)

    # NOTE(review): ranges start at 1, so cabin number 0 (and a missing cabin)
    # is 0 in every range column - confirm this is intended.
    for start in range(1, 2001, 300):
        end = start + 299
        column_name = f'Cabin_Number_{start}_{end}'
        df[column_name] = cabin_number.between(start, end).astype(int)

    return df

In [1327]:
def clean_data(df):
    """Run the full imputation / feature pipeline and return a numeric feature frame.

    Steps: add total expenses, fill CryoSleep, Cabin, HomePlanet, Destination,
    Age and the expense columns, add the cabin features, encode the categorical
    columns as integer codes, then drop the columns not used as features
    (including ``Transported`` when present).

    The input frame is copied first, so the caller's frame is not modified.
    """
    # Copy so repeated calls start from the same raw data and the caller keeps it
    df = df.copy()

    # Fill in the missing values
    df = add_total_expenses(df)
    df = fill_cryosleep(df)
    df = fill_missing_cabins(df)
    df = fill_missing_home_planet_from_internet_solution(df)
    df = fill_missing_destination_planet(df)
    df = fill_missing_age(df)
    df = fill_missing_expenses(df)

    # Add the derived features
    df = add_cabin_info(df)

    # Encode categoricals with fixed, sorted levels so that every frame (train
    # and test) maps a value to the same code even when a level is absent from
    # one of them. Missing or unlisted values are encoded as -1.
    category_levels = {
        'HomePlanet': ['Earth', 'Europa', 'Mars'],
        'Cabin_Deck': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'],
        'Cabin_Side': ['P', 'S'],
        'Destination': ['55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e'],
    }
    for column, levels in category_levels.items():
        df[column] = pd.Categorical(df[column], categories=levels).codes
    df['CryoSleep'] = df['CryoSleep'].astype(int)

    # Drop the columns that are not used as features; the target is only
    # present in the training frame
    columns_to_drop = ['PassengerId', 'Name', 'Cabin', 'Age', 'VIP', 'total_expenses', 'Group', 'Surname']
    if 'Transported' in df.columns:
        columns_to_drop.append('Transported')

    return df.drop(columns=columns_to_drop)

# Nettoyage et affichage des données

In [1328]:
# Seed the `random` module so the random imputations done while cleaning are
# reproducible from one run of this cell to the next
random.seed(42)

df_used_clean = clean_data(df_used)
df_test_clean = clean_data(df_test)

In [1329]:
# Preview the first rows of the cleaned training features
df_used_clean.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Age_0_9,Age_10_19,Age_20_29,Age_30_39,Age_40_49,Age_50_59,Age_60_69,Age_70_79,Age_80_89,Age_90_99,Cabin_Deck,Cabin_Side,Cabin_Number_1_300,Cabin_Number_301_600,Cabin_Number_601_900,Cabin_Number_901_1200,Cabin_Number_1201_1500,Cabin_Number_1501_1800,Cabin_Number_1801_2100
0,1,0,2,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,2,109.0,9.0,25.0,549.0,44.0,0,0,1,0,0,0,0,0,0,0,5,1,0,0,0,0,0,0,0
2,1,0,2,43.0,3576.0,0.0,6715.0,49.0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
3,1,0,2,0.0,1283.0,371.0,3329.0,193.0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,2,303.0,70.0,151.0,565.0,2.0,0,1,0,0,0,0,0,0,0,0,5,1,1,0,0,0,0,0,0


In [1330]:
# Preview the first rows of the cleaned test features
df_test_clean.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Age_0_9,Age_10_19,Age_20_29,Age_30_39,Age_40_49,Age_50_59,Age_60_69,Age_70_79,Age_80_89,Age_90_99,Cabin_Deck,Cabin_Side,Cabin_Number_1_300,Cabin_Number_301_600,Cabin_Number_601_900,Cabin_Number_901_1200,Cabin_Number_1201_1500,Cabin_Number_1501_1800,Cabin_Number_1801_2100
0,0,1,2,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,0,0,6,1,1,0,0,0,0,0,0
1,0,0,2,0.0,9.0,0.0,2823.0,0.0,0,1,0,0,0,0,0,0,0,0,5,1,1,0,0,0,0,0,0
2,1,1,0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0
3,1,0,2,0.0,6652.0,0.0,181.0,585.0,0,0,0,1,0,0,0,0,0,0,2,1,1,0,0,0,0,0,0
4,0,0,2,10.0,0.0,635.0,0.0,0.0,0,0,1,0,0,0,0,0,0,0,5,1,1,0,0,0,0,0,0


# Création et entrainement du modèle

In [1331]:
# Fit the random forest on the cleaned training features, then predict the test set
rf = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=42).fit(
    df_used_clean, df_train['Transported']
)
y_pred = rf.predict(df_test_clean)

# Pair each test passenger id with its prediction and write the submission file
submission = df_test[['PassengerId']].assign(Transported=y_pred)
submission.to_csv('submission.csv', index=False)

# Test rédigé avec Copilot (pour estimation)

In [1332]:
# Cross-validated accuracy estimate for each hyper-parameter setting
from sklearn.model_selection import cross_val_score

# Hyper-parameter settings to evaluate
param_combinations = [{'n_estimators': 1000, 'max_depth': 10}]

# Best mean score seen so far and the settings that produced it
best_score, best_params = 0, None

for params in param_combinations:
    rf_test = RandomForestClassifier(random_state=42, **params)
    scores = cross_val_score(rf_test, df_used_clean, df_train['Transported'], cv=5)
    avg_score = scores.mean()

    # Mean accuracy over the 5 folds, with twice the standard deviation as spread
    print(f"n_estimators={params['n_estimators']}, max_depth={params['max_depth']}: {avg_score:.4f} (+/- {scores.std() * 2:.4f})")

    if avg_score > best_score:
        best_score, best_params = avg_score, params

n_estimators=1000, max_depth=10: 0.8004 (+/- 0.0336)
