In [None]:
import os
import warnings
from pathlib import Path


import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
pd.plotting.register_matplotlib_converters()
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score

# Set Matplotlib defaults.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*"
# and newer releases no longer accept the old spelling, so use whichever name
# this installation actually provides (falling back to the default style).
_whitegrid_style = next(
    (
        style_name
        for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if style_name in plt.style.available
    ),
    "default",
)
plt.style.use(_whitegrid_style)
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Mute warnings (note: this silences every warning category for the session)
warnings.filterwarnings('ignore')

# Data Loading and Cleaning

In [None]:
# Directory containing the Titanic train/test CSV files.
data_dir = Path("../input/titanic/")

# Read both splits in one pass; the generator yields train first, then test.
train_data, test_data = (
    pd.read_csv(data_dir / csv_name) for csv_name in ("train.csv", "test.csv")
)

def label_encoding(df):
    """Return a copy of ``df`` with object/category columns replaced by integer codes.

    Codes are assigned in sorted order of each column's unique values
    (``factorize(sort=True)``) rather than in order of first appearance.
    This makes the mapping independent of row order, so two frames that
    contain the same set of categories (e.g. the train and test splits)
    receive the same code for the same value.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where every ``object``/``category`` column holds
        integer codes. Missing values are encoded as ``-1``.

    Notes
    -----
    The mapping is still derived per call: if a category is absent from one
    frame, the codes of the categories sorted after it shift in that frame.
    """
    X = df.copy()
    for colname in X.select_dtypes(["object", "category"]):
        # sort=True -> codes follow the sorted uniques, not first-seen order.
        codes, _ = X[colname].factorize(sort=True)
        X[colname] = codes
    return X



def corrplot(df, method="pearson", annot=True, **kwargs):
    """Draw a hierarchically clustered heatmap of the correlations in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to correlate. Only numeric and boolean columns are used;
        other columns (e.g. raw string columns) are left out.
    method : str, default "pearson"
        Correlation method forwarded to ``DataFrame.corr``.
    annot : bool or None, default True
        Forwarded to ``sns.clustermap`` to control cell annotations.
    **kwargs
        Any further keyword arguments for ``sns.clustermap``.
    """
    # Select numeric columns explicitly: pandas >= 2.0 no longer drops
    # non-numeric columns silently in ``corr`` and raises instead.
    numeric_df = df.select_dtypes(include=["number", "bool"])
    sns.clustermap(
        numeric_df.corr(method=method),
        vmin=-1.0,
        vmax=1.0,
        cmap="icefire",
        method="complete",  # linkage method for the clustering, not the correlation
        annot=annot,
        **kwargs,
    )

    
#corrplot(train_data, annot=None)

# Select the model features and integer-encode the string columns.
X = train_data[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
X = label_encoding(X)

# Impute missing ages with random integers drawn from [mean - std, mean + std).
mean = X["Age"].mean()
std = X["Age"].std()
age_missing = X["Age"].isnull()
is_null = age_missing.sum()

# A dedicated, seeded generator keeps the imputed ages (and therefore the
# fitted model) identical across Restart & Run All. Bounds are cast to int
# because randint works on integer ranges.
age_rng = np.random.RandomState(0)
rand_age = age_rng.randint(int(mean - std), int(mean + std), size=is_null)

# Fill only the missing entries, then store ages as whole years.
X.loc[age_missing, "Age"] = rand_age
X["Age"] = X["Age"].astype(int)

y = train_data['Survived']


In [None]:
# Select the same feature columns as for training and integer-encode strings.
X_test = test_data[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
X_test = label_encoding(X_test)

# Impute missing ages with random integers drawn from [mean - std, mean + std).
# NOTE(review): the statistics come from the test split itself, as before;
# consider reusing the training statistics instead.
mean = X_test["Age"].mean()
std = X_test["Age"].std()
age_missing = X_test["Age"].isnull()
is_null = age_missing.sum()

# Seeded generator so the imputed ages, and the resulting submission, are
# reproducible. Bounds are cast to int because randint works on integer ranges.
age_rng = np.random.RandomState(1)
rand_age = age_rng.randint(int(mean - std), int(mean + std), size=is_null)

# Fill only the missing entries, then store ages as whole years.
X_test.loc[age_missing, "Age"] = rand_age
X_test["Age"] = X_test["Age"].astype(int)

# Any remaining missing values are replaced by their column mean.
X_test = X_test.fillna(X_test.mean())

In [None]:
# Remove three rows by index label — presumably the fare outliers spotted via
# X.nlargest(10, ['Fare']); confirm against the data.
# errors="ignore" keeps this cell idempotent: re-running it after the rows are
# already gone no longer raises KeyError.
X = X.drop([258, 679, 737], errors="ignore")


In [None]:
# Keep exactly the targets whose rows are still present in X. This removes the
# same rows that were dropped from X, guarantees X and y stay aligned, and is
# safe to re-run (a second run is a no-op instead of a KeyError).
y = y.loc[X.index]

# Feature Engineering

In [None]:
# sns.pairplot creates its own figure, so a preceding plt.figure(...) call only
# produced an extra empty figure; it has been removed.
# The trailing semicolon suppresses the PairGrid repr in the cell output.
sns.pairplot(data=X);

**Grouping Age by SibSp**

In [None]:



# For each row, attach the mean Age of all rows in the same frame that share
# its SibSp value. Each frame uses its own group means.
X['Average_Age_by_SibSp'] = X['SibSp'].map(X.groupby('SibSp')['Age'].mean())
X_test['Average_Age_by_SibSp'] = X_test['SibSp'].map(
    X_test.groupby('SibSp')['Age'].mean()
)




# Model training

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest; random_state is fixed so the fit is repeatable.
model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

# Build the submission table. The id column must be spelled "PassengerId"
# (matching the column in the input data); it was previously written as
# "PassengerID", which produced a wrong header in the CSV.
output = pd.DataFrame({'PassengerId': test_data['PassengerId'],
                       'Survived': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")