In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import RobustScaler
import numpy as np

test_data=pd.read_csv("../data/titanic/test.csv")
#First, we create dictionaries of the most frequent values with types
#Shape of data
print("Shape of data ",test_data.shape)
#Most freauent values
for col in test_data.columns: print(test_data[col].value_counts())
#Dictionaries
dict_freq = {}
for col in test_data.columns:
    dict_freq[col] = {
        'value_counts': test_data[col].value_counts(),
        'dtype': test_data[col].dtype
    }
#Set of types
print("Set of dtypes: ",set([dict_freq[i]['dtype'] for i in dict_freq])) #type "O" - object

Shape of data  (418, 11)
PassengerId
892     1
893     1
894     1
895     1
896     1
       ..
1305    1
1306    1
1307    1
1308    1
1309    1
Name: count, Length: 418, dtype: int64
Pclass
3    218
1    107
2     93
Name: count, dtype: int64
Name
Kelly, Mr. James                                1
Wilkes, Mrs. James (Ellen Needs)                1
Myles, Mr. Thomas Francis                       1
Wirz, Mr. Albert                                1
Hirvonen, Mrs. Alexander (Helga E Lindqvist)    1
                                               ..
Spector, Mr. Woolf                              1
Oliva y Ocana, Dona. Fermina                    1
Saether, Mr. Simon Sivertsen                    1
Ware, Mr. Frederick                             1
Peter, Master. Michael J                        1
Name: count, Length: 418, dtype: int64
Sex
male      266
female    152
Name: count, dtype: int64
Age
21.0    17
24.0    17
22.0    16
30.0    15
18.0    13
        ..
44.0     1
5.0      1
51.0     1

In [13]:
dict_freq_num = {}
for col in test_data.columns:
    dict_freq_num[col] = {
        'value_counts': test_data[col].nunique(),
        'dtype': test_data[col].dtype
    }
dict_freq_num 

{'PassengerId': {'value_counts': 418, 'dtype': dtype('int64')},
 'Pclass': {'value_counts': 3, 'dtype': dtype('int64')},
 'Name': {'value_counts': 418, 'dtype': dtype('O')},
 'Sex': {'value_counts': 2, 'dtype': dtype('O')},
 'Age': {'value_counts': 79, 'dtype': dtype('float64')},
 'SibSp': {'value_counts': 7, 'dtype': dtype('int64')},
 'Parch': {'value_counts': 8, 'dtype': dtype('int64')},
 'Ticket': {'value_counts': 363, 'dtype': dtype('O')},
 'Fare': {'value_counts': 169, 'dtype': dtype('float64')},
 'Cabin': {'value_counts': 76, 'dtype': dtype('O')},
 'Embarked': {'value_counts': 3, 'dtype': dtype('O')}}

In [14]:
#Since there are many unique values for age, as well as omissions, they should be grouped based on class and gender
test_data['Age'].fillna(test_data.groupby(['Pclass','Sex'])['Age'].transform('median'), inplace=True)
test_data['Embarked'].fillna(test_data['Embarked'].mode()[0], inplace=True)
#since there are many cabins, it is easier to create a sign that there is a booth or not
test_data['Cabin'].value_counts()
#First, let's try with a sign that means that there are relatives in general
test_data['Any_relatives'] = ((test_data['SibSp'] > 0) | (test_data['Parch'] > 0)).astype(int)

In [15]:
#deck
test_data['Deck'] = test_data['Cabin'].str[0].fillna('U')
#groups of deck
deck_groups = {
    1: ['A', 'B', 'C'],
    2: ['D', 'E'],
    3: ['F', 'G', 'T']
}
for group, decks in deck_groups.items():
    test_data[f'Deck_{group}'] = test_data['Deck'].isin(decks).astype(int)
#extract numbers
def extract_numbers(cabin):
    if pd.isna(cabin):
        return []
    return [int(x) for x in re.findall(r'\d+', cabin)]
test_data['CabinNumbers'] = test_data['Cabin'].apply(extract_numbers)

In [16]:
#if the list is empty, then 0. if 1 booth - immediately into the numbers, if not at all - the average
def compute_cabin_value(numbers):
    if len(numbers) == 0:
        return 0
    elif len(numbers) == 1:
        return numbers[0]
    else:
        return sum(numbers) / len(numbers)
test_data['CabinValue'] = test_data['CabinNumbers'].apply(compute_cabin_value)

In [17]:
#we are looking for which category the cabin belongs to
def cabin_zone(value):
    if value == 0:
        return 0
    elif value < 20:
        return 1
    elif value < 50:
        return 2
    elif value < 100:
        return 3
    else:
        return 4
test_data['CabinZone'] = test_data['CabinValue'].apply(cabin_zone)
test_data = test_data.drop(['SibSp','Parch','Cabin','Deck','CabinNumbers','CabinValue','Ticket'],axis=1)

In [18]:
def extract_title(name):
    match = re.search(r' ([A-Za-z]+)\.', name)
    if match:
        return match.group(1)
    return 'Unknown'
test_data['Title'] = test_data['Name'].apply(extract_title)
title_mapping = {
    'Mme': 'Mrs',
    'Mlle': 'Miss',
    'Ms': 'Miss'
}
test_data['Title'] = test_data['Title'].replace(title_mapping)
common_titles = ['Mr', 'Mrs', 'Miss', 'Master']
test_data['Title'] = test_data['Title'].apply(lambda x: x if x in common_titles else 'Rare')
test_data = test_data.drop('Name',axis=1)

In [19]:
test_data['Sex'] = test_data['Sex'].map({'male': 1, 'female': 0})
embarked_dummies = pd.get_dummies(test_data['Embarked'], prefix='Embarked', drop_first=True, dtype=int)
test_data = pd.concat([test_data, embarked_dummies], axis=1)
title_dummies = pd.get_dummies(test_data['Title'], prefix='Title', drop_first=True, dtype=int)
test_data = pd.concat([test_data, title_dummies], axis=1)
test_data.drop(['Embarked', 'Title'], axis=1, inplace=True)

pclass_dummies = pd.get_dummies(test_data['Pclass'], prefix='Pclass', drop_first=True, dtype=int)
test_data = pd.concat([test_data, pclass_dummies], axis=1)
test_data.drop(['Pclass'], axis=1, inplace=True)

In [20]:
scaler_age = RobustScaler()
test_data['Age'] = scaler_age.fit_transform(test_data[['Age']])

test_data['Fare'] = np.log1p(test_data['Fare'])
scaler_fare = RobustScaler()
test_data['Fare'] = scaler_fare.fit_transform(test_data[['Fare']])

In [21]:
from scipy.stats import shapiro

stat_age, p_age = shapiro(test_data['Age'])
print(f"Age: W={stat_age:.4f}, p-value={p_age:.4f}")
if p_age > 0.05:
    print("Age can be considered normally distributed (p > 0.05)")
else:
    print("Age differs significantly from the normal distribution (p <= 0.05)")

stat_fare, p_fare = shapiro(test_data['Fare'])
print(f"Fare: W={stat_fare:.4f}, p-value={p_fare:.4f}")
if p_age > 0.05:
    print("Fare can be considered normally distributed (p > 0.05)")
else:
    print("Fare differs significantly from the normal distribution (p <= 0.05)")

Age: W=0.9387, p-value=0.0000
Age differs significantly from the normal distribution (p <= 0.05)
Fare: W=nan, p-value=nan
Fare differs significantly from the normal distribution (p <= 0.05)


In [22]:
corr_matrix = test_data.corr()
mask = (corr_matrix.abs() >= 0.7) & (corr_matrix != 1)
vars_with_high_corr = mask.any(axis=1) | mask.any(axis=0) #don't write or
corr_filtered = corr_matrix.loc[vars_with_high_corr, vars_with_high_corr]
print(corr_filtered)
print(corr_filtered.columns)
test_data=test_data.drop(corr_filtered.columns, axis=1)

                Sex    Deck_1  CabinZone  Title_Mr
Sex        1.000000 -0.130234  -0.129057  0.877762
Deck_1    -0.130234  1.000000   0.806616 -0.130396
CabinZone -0.129057  0.806616   1.000000 -0.125553
Title_Mr   0.877762 -0.130396  -0.125553  1.000000
Index(['Sex', 'Deck_1', 'CabinZone', 'Title_Mr'], dtype='object')


In [23]:
test_data.to_csv('../data/titanic/new_test_data.csv', index=False)

Without visualization, because the training data doesn't contain survive