In [62]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [29]:
# Build the supervised training table: one row per passenger, pairing the
# anonymized age value with the age from the original file.
original_df = pd.read_csv('titanic.csv')
anonymous_df = pd.read_csv('titanic_anonymized.csv')

# Both source columns are called 'age'. Concatenating them unrenamed yields two
# columns with the same name, which pandas rewrites to 'age' / 'age.1' when the
# CSV is read back, so the later lookups of 'pseudonymous_age' and
# 'original_age' would raise KeyError. Name each column explicitly here.
# NOTE(review): the pairing relies on both files listing passengers in the same
# row order (concat aligns on the default index) — confirm.
labeled_df = pd.concat(
    [
        anonymous_df['age'].rename('pseudonymous_age'),
        original_df['age'].rename('original_age'),
    ],
    axis=1,
)

# Persist the labeled pairs; the file is read back as 'labeled_dataset.csv'
labeled_df.to_csv('labeled_dataset.csv', index=False)

In [63]:
# Read the paired pseudonymous/original 'age' table back from disk
labeled_dataset = pd.read_csv(filepath_or_buffer='labeled_dataset.csv')

In [64]:
# Replace each distinct 'pseudonymous_age' value with an integer code.
# The fitted encoder is kept so the same mapping can be applied again later.
label_encoder = LabelEncoder().fit(labeled_dataset['pseudonymous_age'])
labeled_dataset['pseudonymous_age'] = label_encoder.transform(labeled_dataset['pseudonymous_age'])

In [72]:
# Fill missing values in the target 'original_age' with the column mean.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: that chained in-place form
# is deprecated in pandas 2.x and, under Copy-on-Write, only modifies a
# temporary copy, leaving the NaNs in labeled_dataset.
mean_original_age = labeled_dataset['original_age'].mean()
labeled_dataset['original_age'] = labeled_dataset['original_age'].fillna(mean_original_age)

In [73]:
# Feature matrix: the 'pseudonymous_age' column, kept two-dimensional (DataFrame)
X = labeled_dataset.loc[:, ['pseudonymous_age']]
# Target vector: the 'original_age' column (Series)
Y = labeled_dataset.loc[:, 'original_age']

In [74]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

In [76]:
# Fit ordinary least squares on the training split (fit returns the estimator itself)
model = LinearRegression().fit(X_train, Y_train)
# Leave the fitted estimator as the cell's last expression so it is still displayed
model

In [77]:
# Read the anonymized passenger table that will receive the predicted ages
anonymous_df = pd.read_csv(filepath_or_buffer='titanic_anonymized.csv')

In [79]:
# Apply the already-fitted encoder to the anonymized 'age' column and store the
# integer codes in a helper column named 'pseudonymous_age'
anonymous_df['pseudonymous_age'] = label_encoder.transform(
    anonymous_df.loc[:, 'age']
)

In [80]:
# Run the fitted model over the encoded column of 'titanic_anonymized.csv'
# to estimate each row's original age
anonymous_df['predicted_original_age'] = model.predict(
    anonymous_df.loc[:, ['pseudonymous_age']]
)

In [81]:
# Turn the float predictions into whole numbers: round to nearest, then cast
# to int (this treats 'age' as an integer quantity)
anonymous_df['predicted_original_age'] = (
    anonymous_df['predicted_original_age']
    .round(decimals=0)
    .astype(int)
)

In [82]:
# Overwrite the anonymized 'age' values with the model's integer predictions
anonymous_df['age'] = anonymous_df.loc[:, 'predicted_original_age']

In [83]:
# Remove the two helper columns in place so only the source columns remain
anonymous_df.drop(
    labels=['pseudonymous_age', 'predicted_original_age'],
    axis='columns',
    inplace=True,
)

In [84]:
# Write the table with predicted ages to disk, omitting the index column
anonymous_df.to_csv(path_or_buf='titanic_age_deanonymized.csv', index=False)