In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

In [2]:
train_data = pd.read_csv('Titanic/train.csv')
columns_to_eliminate = ['PassengerId','Name','Ticket','Cabin']
train_data.drop(columns_to_eliminate, axis=1, inplace=True)
train_data['Sex'] = train_data['Sex'].apply(lambda x: 0 if x=='male' else 1)
train_data['Embarked'] = train_data['Embarked'].apply(lambda x: 0 if x == 'S' else 1 if x == 'C' else 2)

In [83]:
#for column in train_data.columns:
#    print(column, train_data[column].isna().sum())

## Data cleaning and preprocessing

### Data Cleaning

In [3]:
train_data['Embarked'].dropna(inplace=True)
train_data.query('Parch <= 2', inplace=True)
train_data.query('Fare <= 100', inplace=True)

In [4]:
age_na_train = train_data[train_data['Age'].isna()]
X = train_data[['Sex', 'Pclass', 'Fare' ,'Age']].dropna()
y = X['Age'].astype(int)
X = X.drop('Age', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Filling the NA using a model

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [6]:
linear_regression_age = LinearRegression().fit(X_train, y_train)
random_forest_age = RandomForestRegressor().fit(X_train, y_train)

In [7]:
LR_predictions = linear_regression_age.predict(X_test)
RF_predictions = random_forest_age.predict(X_test)

In [8]:
rmse_lr = np.sqrt(mean_squared_error(y_test, LR_predictions))
rmse_rf = np.sqrt(mean_squared_error(y_test, RF_predictions))

In [9]:
rmse_lr, rmse_rf

(15.165118157681931, 15.843187422267643)

### Filling the NA using mean

In [10]:
sexes = [0, 1]
classes = [1,2,3]

aux_dataframe = pd.DataFrame()
not_na_train = train_data.dropna()
for sex in sexes:
    for class_ in classes:
        #Need to find out the index of the 
        aux_data = not_na_train.query('Pclass == '+str(class_)+' & Sex == '+str(sex))
        mean = aux_data['Age'].mean(skipna=True)
        aux_data.fillna(int(mean), inplace=True)
        aux_dataframe = pd.concat([aux_dataframe, aux_data])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aux_data.fillna(int(mean), inplace=True)


In [11]:
rules_rmse = np.sqrt(mean_squared_error(not_na_train['Age'], aux_dataframe['Age']))

In [12]:
rules_rmse

20.461619623269367

### Using the LR predicted values to fill the missing ages

In [13]:
age_na_train = train_data[train_data['Age'].isna()]
X = age_na_train[['Sex', 'Pclass', 'Fare']]
age_na_train['Age'] = linear_regression_age.predict(X).astype(int)
clean_train = pd.concat([age_na_train,train_data.dropna()])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_na_train['Age'] = linear_regression_age.predict(X).astype(int)


### Addressing the imbalance of Survivors/Deceased:

In [14]:
y = clean_train['Survived']
X = clean_train.drop('Survived', axis=1)
oversample = RandomOverSampler(sampling_strategy='minority')
# fit and apply the transform
X_over, y_over = oversample.fit_resample(X, y)

### Scaling the data

In [15]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_over)
MinMaxScaler()
scaled_features = scaler.transform(X_over)

## Training the binary classification models

In [16]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features, y_over, test_size=0.2, random_state=42)
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [45]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter = 1000)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print('Accuracy:',accuracy_score(y_test, y_pred)*100,'%')
print('Recall:', recall_score(y_test, y_pred)*100,'%')
print('Precision:', precision_score(y_test, y_pred)*100,'%')
print('F1 Score:', f1_score(y_test, y_pred)*100,'%')

Accuracy: 77.61904761904762 %
Recall: 72.11538461538461 %
Precision: 80.64516129032258 %
F1 Score: 76.14213197969544 %


In [47]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print('Accuracy:',accuracy_score(y_test, y_pred)*100,'%')
print('Recall:', recall_score(y_test, y_pred)*100,'%')
print('Precision:', precision_score(y_test, y_pred)*100,'%')
print('F1 Score:', f1_score(y_test, y_pred)*100,'%')

Accuracy: 86.19047619047619 %
Recall: 88.46153846153845 %
Precision: 84.40366972477065 %
F1 Score: 86.3849765258216 %


In [48]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2) 
knn.fit(X_train, y_train)
y_predict = knn.predict(X_test)
print('Accuracy:',accuracy_score(y_test, y_pred)*100,'%')
print('Recall:', recall_score(y_test, y_pred)*100,'%')
print('Precision:', precision_score(y_test, y_pred)*100,'%')
print('F1 Score:', f1_score(y_test, y_pred)*100,'%')

Accuracy: 86.19047619047619 %
Recall: 88.46153846153845 %
Precision: 84.40366972477065 %
F1 Score: 86.3849765258216 %


## Preparing the testing data

In [59]:
test_data = pd.read_csv('Titanic/test.csv')
columns_to_eliminate = ['Name','Ticket','Cabin']
test_data.drop(columns_to_eliminate, axis=1, inplace=True)
test_data['Sex'] = test_data['Sex'].apply(lambda x: 0 if x=='male' else 1)
test_data['Embarked'] = train_data['Embarked'].apply(lambda x: 0 if x == 'S' else 1 if x == 'C' else 2)
test_data['Embarked'].fillna('2', inplace=True)
test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)

  test_data['Embarked'].fillna('2', inplace=True)


In [60]:
age_na_test = test_data[test_data['Age'].isna()]
age_na_test_passenger_id = age_na_test['PassengerId']
age_na_test = age_na_test[['Sex', 'Pclass', 'Fare' ,'Age']]
full_age_test = test_data.dropna()


X = age_na_test.drop('Age', axis=1)
age_na_test['Age'] = linear_regression_age.predict(X).astype(int)
age_na_test['PassengerId'] = age_na_test_passenger_id
prepared_test = pd.concat([age_na_test, full_age_test])
prepared_test['Age'] = prepared_test['Age'].astype(int)


prepared_test = prepared_test.set_index('PassengerId').join(
    test_data[['PassengerId', 'SibSp', 'Parch', 'Embarked']].set_index('PassengerId'), 
    lsuffix='_l', 
    rsuffix='', 
    on='PassengerId')


prepared_test.drop(['SibSp_l', 'Parch_l', 'Embarked_l'], axis = 1, inplace=True)


scaled_test = scaler.transform(prepared_test[clean_train.columns[1:]])


final_dataset = pd.DataFrame()
final_dataset['PassengerId'] = prepared_test.index
final_dataset['Survived'] = rf.predict(scaled_test)


final_dataset.sort_values(by=['PassengerId'], inplace=True)


final_dataset.to_csv('Titanic/results.csv', index=False)