## Analysing dataset

In [1]:
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np

In [43]:
train_raw = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

y_train_raw = train_raw.pop('Transported')
y_test = pd.read_csv('sample_submission.csv')
y_test = y_test['Transported']

In [44]:
train_raw['HomePlanet'].fillna('Earth', inplace = True)
train_raw['CryoSleep'].fillna(0, inplace = True)
train_raw['Destination'].fillna('TRAPPIST-1e', inplace = True)
train_raw['Age'].fillna(train_raw['Age'].median(), inplace = True)
train_raw['RoomService'].fillna(train_raw['RoomService'].median(), inplace = True)

#assuming all had food here and price = mean of available bills ie total - 0s - NaNs
tot = train_raw.shape[0]
train_raw['FoodCourt'].fillna((train_raw['FoodCourt'].mean() * tot / (tot - sum(train_raw['FoodCourt']==0) - sum(train_raw['FoodCourt'].isna()==True))), inplace = True)
train_raw['ShoppingMall'].fillna((train_raw['ShoppingMall'].mean() * tot / (tot - sum(train_raw['ShoppingMall']==0) - sum(train_raw['ShoppingMall'].isna()==True))), inplace = True)
train_raw['Spa'].fillna((train_raw['Spa'].mean() * tot / (tot - sum(train_raw['Spa']==0) - sum(train_raw['Spa'].isna()==True))), inplace = True)
train_raw['VRDeck'].fillna((train_raw['VRDeck'].mean() * tot / (tot - sum(train_raw['VRDeck']==0) - sum(train_raw['VRDeck'].isna()==True))), inplace = True)

In [45]:
train_raw['CryoSleep'] = train_raw['CryoSleep'].astype(str)

In [46]:
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8693 non-null   object 
 5   Age           8693 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Name          8493 non-null   object 
dtypes: float64(6), object(7)
memory usage: 883.0+ KB


### Stratified Shuffle Split data to get CV set

In [47]:
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Split the data into training and cross-validation sets
for train_index, cv_index in stratified_split.split(train_raw, y_train_raw):
    train, cv = train_raw.iloc[train_index], train_raw.iloc[cv_index]
    y_train, y_cv = y_train_raw.iloc[train_index], y_train_raw.iloc[cv_index]
    
train = train.reset_index()
y_train = y_train.reset_index()
cv = cv.reset_index()
y_cv = y_cv.reset_index()

In [48]:
cv

Unnamed: 0,index,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,3586,3857_01,Europa,True,E/230/P,55 Cancri e,34.0,False,0.0,0.0,521.127559,0.0,0.0,Eltard Dreamspric
1,7173,7650_05,Earth,True,G/1242/S,TRAPPIST-1e,4.0,False,0.0,0.0,0.000000,0.0,0.0,Tera Palmes
2,8559,9139_01,Mars,False,F/1766/S,TRAPPIST-1e,25.0,False,410.0,32.0,14.000000,1239.0,10.0,Poselk Paste
3,6528,6894_03,Mars,False,F/1319/S,TRAPPIST-1e,12.0,,0.0,0.0,0.000000,0.0,0.0,Risps Hacle
4,7934,8471_01,Europa,False,E/556/S,55 Cancri e,66.0,True,0.0,1828.0,1.000000,1873.0,45.0,Bosonon Metiveal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1734,3749,4008_01,Mars,True,F/760/S,TRAPPIST-1e,33.0,False,0.0,0.0,0.000000,0.0,0.0,Anamse Cha
1735,1637,1733_01,Earth,False,F/344/P,TRAPPIST-1e,15.0,False,1336.0,108.0,0.000000,0.0,0.0,Stevey Chanan
1736,5820,6158_01,Earth,True,G/996/S,PSO J318.5-22,14.0,False,0.0,0.0,0.000000,0.0,0.0,Dawne Jacobbinson
1737,5757,6102_01,Earth,False,F/1165/S,PSO J318.5-22,26.0,False,104.0,0.0,0.000000,280.0,216.0,Jilley Figuez


In [49]:
train = train.drop('Name', axis = 1)
train = train.drop('PassengerId', axis = 1)
train = train.drop('Cabin', axis = 1)
train = train.drop('index', axis = 1)
y_train = y_train.drop('index', axis = 1)
cv = cv.drop('index', axis = 1)
y_cv = y_cv.drop('index', axis = 1)

In [50]:
y_train = y_train.astype(str)
y_train

Unnamed: 0,Transported
0,True
1,True
2,False
3,True
4,False
...,...
6949,True
6950,False
6951,True
6952,True


Useless plot but resembles Earth's Magnetic field if plotted: train.Cabin.value_counts().plot(kind = 'pie')

In [None]:
cols = ["HomePlanet", "CryoSleep", "Destination", "Age", "VIP", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
pd.plotting.scatter_matrix(train[cols], figsize = (10,12))

In [None]:
train.hist(bins = 25, figsize = (10,10))

In [None]:
train.HomePlanet.value_counts().plot(kind = 'pie')

In [None]:
train.CryoSleep.value_counts().plot(kind = 'barh')

In [None]:
train.Destination.value_counts().plot(kind = 'pie')

In [None]:
train.VIP.value_counts().plot(kind = 'barh')

In [None]:
for col in cols:
    print(col)
    print(sum(train[col].isna()==True))

## Feature columns

In [51]:
columns = train.columns.to_numpy()

categorial_columns = ["HomePlanet", "Destination", "VIP"]
numeric_columns = ["CryoSleep", "Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
feature_columns = []

for feature in categorial_columns:
    vocab = train[feature].unique()               #gets the unique values in the columns
    feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature,vocab))
    
for feature in numeric_columns:
    feature_columns.append(tf.feature_column.numeric_column(feature))

print(feature_columns)

[VocabularyListCategoricalColumn(key='HomePlanet', vocabulary_list=('Earth', 'Europa', 'Mars'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='Destination', vocabulary_list=('TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='VIP', vocabulary_list=(False, True, nan), dtype=tf.string, default_value=-1, num_oov_buckets=0), NumericColumn(key='CryoSleep', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='RoomService', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='FoodCourt', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='ShoppingMall', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Spa', shape=(1,)

## Input function

In [56]:
def make_input_fn(data_df, label_df, epochs = 10, shuffle = True, batch_size=32):
    def input_fn():
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df),label_df))    #transforms pandas dataframe to tf.data.Dataset using input parameters and the y-labels.
        if shuffle:
            ds = ds.shuffle(1000)                                             #shuffles the data set
        ds = ds.batch(batch_size).repeat(epochs)                              #splits dataset into batches and repeats the process epoch no of times
        return ds
    return input_fn                                                           #returns the function object itself

## Creating the model

In [57]:
# classifier = tf.estimator.DNNClassifier(feature_columns = feature_columns, hidden_units = [30,10], n_classes = 2)
# There should be 2 Hidden layers with 30 and 10 neurons which need to classify an input among 2 fixed outputs
lin_est = tf.estimator.LinearClassifier(feature_columns = feature_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\NIKSHA~1\\AppData\\Local\\Temp\\tmpns4y102r', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [59]:
train_input_fn = make_input_fn(train, y_train)                      #trains the data set
# eval_input_fn = make_input_fn(cv, y_cv, epochs=1, shuffle = False)  #passing test data so no need to shuffle or have mult epochs

In [60]:
lin_est.train(train_input_fn)

TypeError: Expected binary or unicode string, got False

## Training the model

In [18]:
classifier.train(input_fn = lambda: input_fn(train, y_train), steps = 5000)
# Steps means go through the dataset untill 5000 inputs have been verified

Instructions for updating:
Use tf.keras instead.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.


TypeError: Expected binary or unicode string, got False

In [None]:
result = classifier.evaluate(input_fn = lambda: input_fn(test, y_test,training = False))
# No steps as model needs to traverse only once and no training mode so no shuffling

## Evaluating the model