In [None]:
# preprocessing the test_df dataset

# Fill missing values in 'embarked' with the mode and convert to numeric codes
# (S -> 0, C -> 1, Q -> 2; any other value becomes NaN via map()).
# Whitespace and letter case are normalised first so the mode is computed on
# clean values. The result is assigned back instead of calling
# test_df['embarked'].fillna(..., inplace=True): that chained in-place form can
# act on a temporary object, so the fill may never reach test_df (pandas warns
# about it in 2.x and it has no effect under Copy-on-Write).
embarked = test_df['embarked'].str.strip().str.upper()
embarked = embarked.fillna(embarked.mode()[0])
test_df['embarked'] = embarked.map({'S': 0, 'C': 1, 'Q': 2})

# Fill missing values in 'age' with the median and create age bands.
# The filled column is assigned back instead of using
# test_df['age'].fillna(..., inplace=True): the chained in-place form can act
# on a temporary object, so the fill may never reach test_df (pandas warns
# about it in 2.x and it has no effect under Copy-on-Write).
test_df['age'] = test_df['age'].fillna(test_df['age'].median())
test_df['age_band'] = pd.cut(test_df['age'], 5)
# observed=False is the long-standing default (empty bands are still listed);
# passing it explicitly avoids the FutureWarning newer pandas emits when
# grouping by a categorical column and pins the output across versions.
print(test_df[['age_band', 'survived']].groupby('age_band', observed=False).mean())

# Overwrite 'age' with an ordinal band code 0-4. The masks are built from a
# snapshot of the unmodified ages, so a code that has just been written can
# never be picked up by a later condition.
raw_age = test_df['age'].copy()
test_df.loc[raw_age <= 16, 'age'] = 0
test_df.loc[(raw_age > 16) & (raw_age <= 32), 'age'] = 1
test_df.loc[(raw_age > 32) & (raw_age <= 48), 'age'] = 2
test_df.loc[(raw_age > 48) & (raw_age <= 64), 'age'] = 3
test_df.loc[raw_age > 64, 'age'] = 4
# Recorded output of an age-band survival table (presumably the run the
# rounded cut-offs 16/32/48/64 above were taken from -- confirm):
# ageband
# (0.34, 16.336]    0.550000
# (16.336, 32.252]  0.344168
# (32.252, 48.168]  0.404255
# (48.168, 64.084]  0.434783
# (64.084, 80.0]    0.090909

# Encode 'sex' numerically: trim whitespace, lower-case, then map
# male -> 0 and female -> 1 (any other value becomes NaN via map()).
test_df['sex'] = (
    test_df['sex']
    .str.strip()
    .str.lower()
    .map({'male': 0, 'female': 1})
)

# Create fare bands: quartile bins for inspection, then an ordinal code 0-3.
test_df['fare_band'] = pd.qcut(test_df['fare'], 4)
print(test_df['fare_band'].value_counts(sort=False))
# observed=False is the long-standing default (empty bands are still listed);
# passing it explicitly avoids the FutureWarning newer pandas emits when
# grouping by a categorical column and pins the output across versions.
print(test_df[['fare_band', 'survived']].groupby('fare_band', observed=False).mean())

# Overwrite 'fare' with its band code. The masks are built from a snapshot of
# the unmodified fares, so a code that has just been written can never be
# picked up by a later condition. Missing fares match no mask and stay NaN.
# NOTE(review): the cut-offs are hand-rounded versions of the quartile edges
# recorded below (14.45 vs 14.454) -- confirm they match the training pipeline.
raw_fare = test_df['fare'].copy()
test_df.loc[raw_fare <= 7.91, 'fare'] = 0
test_df.loc[(raw_fare > 7.91) & (raw_fare <= 14.45), 'fare'] = 1
test_df.loc[(raw_fare > 14.45) & (raw_fare <= 31.0), 'fare'] = 2
test_df.loc[raw_fare > 31.0, 'fare'] = 3
# Recorded output of a fare-band value_counts():
# (-0.001, 7.91]    223
# (7.91, 14.454]    224
# (14.454, 31.0]    222
# (31.0, 512.329]   222

# Extract the title from 'name': the run of letters that follows a space and
# is immediately followed by a period. The pattern is a raw string so that
# "\." reaches the regex engine unchanged; in a normal string literal "\." is
# an invalid escape sequence that Python warns about.
test_df['title'] = test_df['name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Collapse spelling variants onto the common titles and group the remaining
# listed titles under "Rare".
title_map = {
    "Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs",
    "Lady": "Rare", "Countess": "Rare", "Capt": "Rare",
    "Col": "Rare", "Don": "Rare", "Dr": "Rare", "Major": "Rare",
    "Rev": "Rare", "Sir": "Rare", "Jonkheer": "Rare", "Dona": "Rare",
}
test_df['title'] = test_df['title'].replace(title_map)
title_mapping = {'Master': 0, 'Miss': 1, 'Mr': 2, 'Mrs': 3, 'Rare': 4}
# Any title that is still not a key of title_mapping (an unlisted title, or a
# name in which no title was found) is treated as "Rare". Without this, map()
# would yield NaN for it and astype(int) would raise.
known_title = test_df['title'].isin(list(title_mapping))
test_df['title'] = test_df['title'].where(known_title, 'Rare')
test_df['title'] = test_df['title'].map(title_mapping).astype(int)
print(test_df['title'].value_counts())


# Family features: family_size = sibsp + parch + 1 (the +1 presumably counts
# the passenger), and is_alone is 1 exactly where family_size equals 1.
test_df['family_size'] = test_df['sibsp'].add(test_df['parch']).add(1)
test_df['is_alone'] = (test_df['family_size'] == 1).astype('int64')
# Mean of 'survived' for each family size.
print(test_df.groupby('family_size')[['survived']].mean())

# Extract the first character of 'cabin' to check its correlation with survival.
# Rows without a cabin value get the placeholder letter 'U' (unknown).
test_df['cabin_letter'] = test_df['cabin'].str[0]
test_df['cabin_letter'] = test_df['cabin_letter'].fillna('U')
# Letter -> integer code (its position in the list). The dict is named
# cabin_letter_codes rather than `map` so the builtin map() is not shadowed
# for the rest of the script/notebook session.
cabin_letter_codes = {
    char: idx
    for idx, char in enumerate(['U', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'])
}
# Letters that are not in the list above become NaN.
test_df['cabin_letter'] = test_df['cabin_letter'].map(cabin_letter_codes)
# NOTE(review): this correlates the list-position codes with 'survived', so
# the result depends on the (arbitrary) ordering of the letters above.
correlation = test_df['cabin_letter'].corr(test_df['survived'])
print("Correlation between cabin letter and survival:", correlation)

# Group size: for every row, the number of rows that share its 'ticket' value.
test_df['ticket_group_size'] = test_df['ticket'].map(test_df['ticket'].value_counts())

# Trying some interaction features built from the encoded columns.
# NOTE(review): when the file runs top to bottom, 'age' and 'fare' have already
# been overwritten with their band codes by this point, so isalone_age,
# title_fare and fare_perperson are computed from codes, not raw values --
# confirm that this is intended (especially for fare_perperson).
test_df['pclass_sex'] = test_df['pclass'] * test_df['sex']
test_df['isalone_age'] = test_df['is_alone'] * test_df['age']
test_df['title_fare'] = test_df['title'] * test_df['fare']
test_df['fare_perperson'] = test_df['fare'] / test_df['ticket_group_size']

# Printed explicitly: a bare expression in the middle of a cell/script is
# evaluated and then discarded, so nothing was shown for this table before.
print(test_df.groupby('pclass_sex')['survived'].mean())
# Bin the existing isalone_age column instead of recomputing the same product.
test_df['isalone_age_bin'] = pd.cut(test_df['isalone_age'], 5)
# observed=False is the long-standing default (empty bins are still listed);
# passing it explicitly avoids the FutureWarning newer pandas emits when
# grouping by a categorical column.
print(test_df.groupby('isalone_age_bin', observed=False)['survived'].mean())  # after binning age
# Recorded output. isalone_age is is_alone * age band code, so 0 means
# "not alone (any age band) or alone in band 0", and k > 0 means "alone in band k":
# (-0.004, 0.8]    0.505405              not alone, or alone in age band 0
# (0.8, 1.6]       0.291209              alone, age band 1
# (1.6, 2.4]       0.336449              alone, age band 2
# (2.4, 3.2]       0.292683              alone, age band 3
# (3.2, 4.0]       0.111111              alone, age band 4

# Quantile-binning experiment, left disabled:
#test_df['isalone_age_qbin'] = pd.qcut(test_df['isalone_age'], q=5, duplicates='drop')
#print(test_df['isalone_age_qbin'].value_counts(sort=False))
#(-0.001, 1.0]    734
#(1.0, 4.0]       157


# Remove the identifier, raw-text and intermediate band/bin columns in place
# so they are not written to the cleaned file.
test_df.drop(
    columns=[
        'passengerid',
        'name',
        'ticket',
        'age_band',
        'fare_band',
        'cabin',
        'isalone_age_bin',
    ],
    inplace=True,
)

# Write the cleaned testing dataset to CSV without the index column.
test_df.to_csv("data/test_cleaned.csv", index=False)