# This file defines a function that performs feature engineering (FE)
# on the Titanic dataset. The general FE steps applied to the dataset
# are defined and explained in this script.

# The goal is to keep the final script, which predicts whether or not a
# passenger of the Titanic survived, clear and readable, whilst
# providing multiple feature engineering paths to explore. The FE
# process finally retained will be the one providing the best accuracy
# in the classification exercise.

In [None]:
def _deck_letter(cabin):
    """Return the deck letter of the first cabin listed in `cabin`.

    Anything that is not a non-blank string (e.g. a missing value) yields
    NaN so that the entry can be imputed later on.
    """
    if isinstance(cabin, str) and cabin.strip():
        return cabin.split()[0][0]
    return float('nan')


def _explore_cabin_clusters(data, cabin_letters):
    """Print how KMeans clusters of known-cabin passengers map to decks.

    Diagnostics only: an elbow plot is drawn and a table is printed;
    `data` is not modified.

    Parameters
    ----------
    data : DataFrame already processed by `titanic_feature_eng`.
    cabin_letters : Series of deck letters (NaN where unknown) sharing
        the index of `data`.
    """
    # Imported here because they are only needed for this optional step.
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA

    # Only the passengers having a cabin value are considered, and PCA
    # accepts neither text columns nor missing values.
    # NOTE(review): every numeric column is projected, which includes the
    # encoded 'Cabin' and any identifier / target column still present.
    numeric = data.select_dtypes(include='number')
    usable = cabin_letters.notna() & numeric.notna().all(axis=1)
    features = numeric[usable]
    letters = cabin_letters[usable]

    n_samples, n_features = features.shape
    if n_samples < 2 or n_features < 1:
        print('Not enough passengers with a known cabin to perform clustering.')
        return

    # Performing a PCA on the data set, reducing the number of features
    # to 3 (or fewer when the data cannot support 3 components).
    pca = PCA(n_components=min(3, n_samples, n_features))
    X_pca = pca.fit_transform(features)

    # Elbow method: inertia for 1..14 clusters, capped at the number of
    # samples since KMeans cannot build more clusters than points.
    # n_init is set explicitly because its default differs between
    # scikit-learn versions.
    cluster_counts = list(range(1, min(14, n_samples) + 1))
    inertia = [
        KMeans(init='k-means++', n_clusters=k, n_init=10, random_state=42)
        .fit(X_pca)
        .inertia_
        for k in cluster_counts
    ]

    # Plotting the cluster inertia vs number of clusters.
    plt.figure()
    sns.lineplot(x=cluster_counts, y=inertia, marker='o')

    # After having decided upon the number of clusters to retain, a
    # second KMeans clustering is performed.
    KM = KMeans(init='k-means++', n_clusters=2, n_init=10, random_state=42)
    clusters = KM.fit_predict(X_pca)

    # Share of each cluster within every cabin letter (columns sum to 1).
    corrtable_normalized = pd.crosstab(
        clusters,
        letters.to_numpy(),
        rownames=['cluster'],
        colnames=['CabinLetter'],
        normalize='columns',
    )
    print(corrtable_normalized)


def titanic_feature_eng(data, drop_all, cabin_clustering,
                        cabin_group_encoding=False):
    """Apply the feature engineering steps to a Titanic DataFrame.

    The input frame is copied first, so the caller's object is left
    untouched.

    Parameters
    ----------
    data : DataFrame holding at least the 'Sex', 'Age', 'Fare' and
        'Cabin' columns (plus the columns listed below when `drop_all`
        is set).
    drop_all : when truthy, the 'Ticket', 'Name', 'Embarked', 'Parch'
        and 'SibSp' columns are dropped.
    cabin_clustering : when truthy and the frame is the training set
        (it has a 'Survived' column), a PCA + KMeans exploration of the
        known cabins is plotted and printed.
    cabin_group_encoding : when truthy, 'Cabin' is encoded as 0 for
        decks A-B-C and 1 for decks D-E-F-G-T instead of one label per
        deck.

    Returns
    -------
    (data, original_cabins) : the engineered DataFrame and the Series
        of deck letters (NaN where the cabin is unknown). Unknown cabins
        are also left as NaN in the returned 'Cabin' column.
    """
    from sklearn.preprocessing import LabelEncoder, StandardScaler

    data = data.copy()

    # Dropping the 'Ticket', 'Name', 'Embarked', 'Parch' and 'SibSp' columns.
    if drop_all:
        data = data.drop(
            ['Ticket', 'Name', 'Embarked', 'Parch', 'SibSp'], axis=1
        )

    # Binary encoding of the 'Sex' column.
    data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

    # Replacing the missing values in the 'Age' column by the mean of
    # the column.
    data['Age'] = data['Age'].fillna(data['Age'].mean())

    # Keeping only the first letter of the first cabin in which the
    # passenger / family member is located. This choice is justified by
    # the layout of cabins on the Titanic, with cabins in alphabetical
    # order representing decks from the top to the bottom of the ship.
    original_cabins = data['Cabin'].map(_deck_letter)
    known_cabin = original_cabins.notna()

    # Label encoding of the known cabins; unknown ones stay NaN so that
    # they can be imputed afterwards.
    data['Cabin'] = float('nan')
    if known_cabin.any():
        label_encoder = LabelEncoder()
        data.loc[known_cabin, 'Cabin'] = label_encoder.fit_transform(
            original_cabins[known_cabin]
        )

    # Standardising the 'Fare' and 'Age' columns.
    # NOTE(review): the scaler is fitted on whichever frame is passed in,
    # so training and test sets are scaled independently.
    standardized_columns = ['Age', 'Fare']
    scaler = StandardScaler()
    data[standardized_columns] = scaler.fit_transform(
        data[standardized_columns]
    )

    # Only the training set carries the 'Survived' target.
    if 'Survived' in data.columns:

        print('The dataset given is the training set.')

        ### --- PERFORMING CABIN CLUSTERING  --- ###

        if cabin_clustering:
            _explore_cabin_clusters(data, original_cabins)

        # When 2 clusters are created it can be seen that two groups
        # can be formed, one with cabins 'A-B-C' and the other with
        # the rest of the cabins, making it easier to impute the
        # missing cabins thereafter.

    ### --- Encoding the Cabin Groups --- ###

    if cabin_group_encoding:
        cabin_groups = [['A', 'B', 'C'], ['D', 'E', 'F', 'G', 'T']]
        group_of_deck = {
            deck: group
            for group, decks in enumerate(cabin_groups)
            for deck in decks
        }
        # Letters outside both groups, and missing cabins, map to NaN.
        data['Cabin'] = original_cabins.map(group_of_deck)

    ### --- DEFINING A CABIN IMPUTING METHOD --- ###
    # TODO: maybe define a separate function for this. Candidates:
    # 1. Based on the closest average fare price
    # 2. Using a classification model

    return data, original_cabins

SyntaxError: invalid syntax (1118791389.py, line 98)