## This is `Part 1` of the Analysis

In [1]:
import pandas as pd
import numpy as np
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
from skimpy import skim
skim(train)

## Extracting `Amount` spent for each passenger

In [2]:
train['Amount_Spent']=train['RoomService']+train['FoodCourt']+train['ShoppingMall']+train['Spa']+train['VRDeck']
test['Amount_Spent']=test['RoomService']+test['FoodCourt']+test['ShoppingMall']+test['Spa']+test['VRDeck']

pass_id_test=test['PassengerId'].values

train.drop(columns=['Name','PassengerId','RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],inplace=True)
test.drop(columns=['Name','PassengerId','RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],inplace=True)

## Before NaFill the data had following Attributes

In [3]:
train.columns

Index(['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP',
       'Transported', 'Amount_Spent'],
      dtype='object')

In [4]:
print(test['HomePlanet'].mode())
print(test['CryoSleep'].mode())
print(test['Cabin'].mode())
print(test['Destination'].mode())
print(test['Age'].mean())
print(test['VIP'].mode())
print(test['Amount_Spent'].mean())

0    Earth
Name: HomePlanet, dtype: object
0    False
Name: CryoSleep, dtype: object
0    G/160/P
Name: Cabin, dtype: object
0    TRAPPIST-1e
Name: Destination, dtype: object
28.65814620162446
0    False
Name: VIP, dtype: object
1441.7119309262166


In [5]:
def extract_last_word(text):
    if isinstance(text, str):
        return text.split('/')[-1]
    else:
        return None

# Apply function to 'cabin' column
train['Cabin'] = train['Cabin'].apply(extract_last_word)
test['Cabin'] = test['Cabin'].apply(extract_last_word)

## Mapping the `destination` column

The destination is transformed as  
TR      ->    TRAPPIST-1e      5915  
CAN     ->    55 Cancri e      1800  
PSO     ->    PSO J318.5-22     796

In [6]:
mapping = {
    "TRAPPIST-1e": "TRA",
    "55 Cancri e": "CAN",
    "PSO J318.5-22": "PSO"
}

# Apply the mapping
train['Destination'] = train['Destination'].map(mapping)
test['Destination'] = test['Destination'].map(mapping)

In [7]:
train['CryoSleep'].value_counts()

CryoSleep
False    5439
True     3037
Name: count, dtype: int64

## Imputing `HomePlanet`

In [8]:
train['HomePlanet']=train['HomePlanet'].fillna('Earth')
test['HomePlanet']=test['HomePlanet'].fillna('Earth')

## Imputing `CryoSleep`

In [9]:
train['CryoSleep']=train['CryoSleep'].fillna(False)
test['CryoSleep']=test['CryoSleep'].fillna(False)

  train['CryoSleep']=train['CryoSleep'].fillna(False)
  test['CryoSleep']=test['CryoSleep'].fillna(False)


## Imputing `Cabin`

In [10]:
train['Cabin']=train['Cabin'].fillna('P')
test['Cabin']=test['Cabin'].fillna('P')

## Imputing `Destination`

In [11]:
train['Destination']=train['Destination'].fillna('TRA')
test['Destination']=test['Destination'].fillna('TRA')

## Filling `Amount_Spent` Na Values

In [12]:
condition = (train['HomePlanet'] == 'Earth') & (train['Amount_Spent'].isna())
train.loc[condition, 'Amount_Spent'] = 708
condition = (train['HomePlanet'] == 'Europa') & (train['Amount_Spent'].isna())
train.loc[condition, 'Amount_Spent'] = 3552
condition = (train['HomePlanet'] == 'Mars') & (train['Amount_Spent'].isna())
train.loc[condition, 'Amount_Spent'] = 1074

condition = (test['HomePlanet'] == 'Earth') & (test['Amount_Spent'].isna())
test.loc[condition, 'Amount_Spent'] = 708
condition = (test['HomePlanet'] == 'Europa') & (test['Amount_Spent'].isna())
test.loc[condition, 'Amount_Spent'] = 3552
condition = (test['HomePlanet'] == 'Mars') & (test['Amount_Spent'].isna())
test.loc[condition, 'Amount_Spent'] = 1074

## Imputing `VIP`

In [13]:
import random
train['VIP']=train['VIP'].fillna(random.choice([True,False]))
test['VIP']=test['VIP'].fillna(random.choice([True,False]))

  train['VIP']=train['VIP'].fillna(random.choice([True,False]))
  test['VIP']=test['VIP'].fillna(random.choice([True,False]))


## Age Imputation|

Passengers travelling from `Earth` to `TRA` have mean age of `26`  
Passengers travelling from `Earth` to `CAN` have mean age of `23`  
Passengers travelling from `Earth` to `PSO` have mean age of `26`  


Passengers travelling from `Europa` to `TRA` have mean age of `34`  
Passengers travelling from `Europa` to `CAN` have mean age of `34`  
Passengers travelling from `Europa` to `PSO` have mean age of `36`


Passengers travelling from `Mars` to `TRA` have mean age of `29`  
Passengers travelling from `Mars` to `CAN` have mean age of `27`  
Passengers travelling from `Mars` to `PSO` have mean age of `36`

In [14]:
condition = (train['HomePlanet'] == 'Earth') & (train['Destination'] == 'TRA') & (train['Age'].isna())
train.loc[condition, 'Age'] = 26
condition = (train['HomePlanet'] == 'Earth') & (train['Destination'] == 'CAN') & (train['Age'].isna())
train.loc[condition, 'Age'] = 23
condition = (train['HomePlanet'] == 'Earth') & (train['Destination'] == 'PSO') & (train['Age'].isna())
train.loc[condition, 'Age'] = 26

condition = (train['HomePlanet'] == 'Mars') & (train['Destination'] == 'TRA') & (train['Age'].isna())
train.loc[condition, 'Age'] = 34
condition = (train['HomePlanet'] == 'Mars') & (train['Destination'] == 'CAN') & (train['Age'].isna())
train.loc[condition, 'Age'] = 34
condition = (train['HomePlanet'] == 'Mars') & (train['Destination'] == 'PSO') & (train['Age'].isna())
train.loc[condition, 'Age'] = 36

condition = (train['HomePlanet'] == 'Europa') & (train['Destination'] == 'TRA') & (train['Age'].isna())
train.loc[condition, 'Age'] = 29
condition = (train['HomePlanet'] == 'Europa') & (train['Destination'] == 'CAN') & (train['Age'].isna())
train.loc[condition, 'Age'] = 27
condition = (train['HomePlanet'] == 'Europa') & (train['Destination'] == 'PSO') & (train['Age'].isna())
train.loc[condition, 'Age'] = 36

In [15]:
condition = (test['HomePlanet'] == 'Earth') & (test['Destination'] == 'TRA') & (test['Age'].isna())
test.loc[condition, 'Age'] = 26
condition = (test['HomePlanet'] == 'Earth') & (test['Destination'] == 'CAN') & (test['Age'].isna())
test.loc[condition, 'Age'] = 23
condition = (test['HomePlanet'] == 'Earth') & (test['Destination'] == 'PSO') & (test['Age'].isna())
test.loc[condition, 'Age'] = 26

condition = (test['HomePlanet'] == 'Mars') & (test['Destination'] == 'TRA') & (test['Age'].isna())
test.loc[condition, 'Age'] = 34
condition = (test['HomePlanet'] == 'Mars') & (test['Destination'] == 'CAN') & (test['Age'].isna())
test.loc[condition, 'Age'] = 34
condition = (test['HomePlanet'] == 'Mars') & (test['Destination'] == 'PSO') & (test['Age'].isna())
test.loc[condition, 'Age'] = 36

condition = (test['HomePlanet'] == 'Europa') & (test['Destination'] == 'TRA') & (test['Age'].isna())
test.loc[condition, 'Age'] = 29
condition = (test['HomePlanet'] == 'Europa') & (test['Destination'] == 'CAN') & (test['Age'].isna())
test.loc[condition, 'Age'] = 27
condition = (test['HomePlanet'] == 'Europa') & (test['Destination'] == 'PSO') & (test['Age'].isna())
test.loc[condition, 'Age'] = 36

## The data before and after imputation are somewhat the same

## All Values `Imputed SuccessFully`

In [16]:
test.isnull().sum()

HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
Amount_Spent    0
dtype: int64

In [17]:
train.to_csv('train_imputed.csv',index=False)
test.to_csv('test_imputed.csv',index=False)

## Move to `predictions.ipynb` for prediction Part