In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the training split and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

## Descriptive Statistics

### Age

In [None]:
df['Age'].describe()

In [None]:
df['Age'].plot(kind='hist', bins=30, title='Age Distribution')

In [None]:
df['Age'].plot(kind='hist', title='Age Distribution')

In [None]:
df['Age'].plot(kind='kde', title='Age Distribution')

In [None]:
df['Age'].skew()

In [None]:
df['Age'].plot(kind='box', title='Age Distribution')

In [None]:
df[df['Age'] > 65]

In [None]:
df['Age'].isnull().sum()/len(df['Age'])


### Fare

In [None]:
df['Fare'].describe()

In [None]:
df['Fare'].plot(kind='hist', bins=30, title='Fare Distribution')

In [None]:
df['Fare'].plot(kind='kde', title='Fare Distribution')

In [None]:
df['Fare'].plot(kind='box', title='Fare Distribution')

In [None]:
df['Fare'].skew()

In [None]:
df[df['Fare'] > 200]

## Univariate Analysis

### Survived

In [None]:
df.head()

In [None]:
df['Survived'].value_counts()

In [None]:
df['Survived'].isnull().sum()

In [None]:
df['Survived'].value_counts().plot(kind='bar', title='Survival Count')

In [None]:
df['Survived'].value_counts().plot(kind='pie', autopct='%1.2f%%', title='Survival Count')

In [None]:
sns.countplot(x='Survived', data=df)


### Pclass

In [None]:
df['Pclass'].value_counts()

In [None]:
df['Pclass'].value_counts().plot(kind='bar', title='Passenger Class Count')

In [None]:
df['Pclass'].value_counts().plot(kind='pie',autopct='%0.1f%%', title='Passenger Class Count')

### Sex

In [None]:
df['Sex'].value_counts()

In [None]:
df['Sex'].value_counts().plot(kind="bar", title='Sex Count')

In [None]:
df['Sex'].value_counts().plot(kind="pie",autopct ="%0.1f%%" ,title='Sex Count')

### Siblings/Spouses Aboard (SibSp)

In [None]:
df['SibSp'].value_counts()

In [None]:
df['SibSp'].value_counts().plot(kind='bar', title='Sibling/Spouse Aboard Count')

In [None]:
df['SibSp'].value_counts().plot(kind='pie',autopct="%0.1f%%", title='Sibling/Spouse Aboard Count')

### Parents/Children Aboard (Parch)

In [None]:
df['Parch'].value_counts()

In [None]:
df['Parch'].value_counts().plot(kind='bar', title='Parents/Children Aboard Count')

In [None]:
df['Parch'].value_counts().plot(kind='pie',autopct='%1.2f%%',figsize=(8,8), title='Parents/Children Aboard Count')

### Embarked

In [None]:
df['Embarked'].value_counts()

In [None]:
df['Embarked'].value_counts().plot(kind='bar', title='Port of Embarkation Count')

In [None]:
df['Embarked'].value_counts().plot(kind='pie',autopct='%1.2f%%', title='Port of Embarkation Count')

## Bivariate Analysis

### Survived Column With Other Columns

In [None]:
df.head()

In [None]:
df['Survived'] 

In [None]:
pd.crosstab( df['Survived'],df['Pclass'])

In [None]:
# Pclass 1  2   3 survivial percentage
pd.crosstab( df['Survived'],df['Pclass'],normalize='columns') * 100

In [None]:
sns.heatmap(pd.crosstab( df['Survived'],df['Pclass'],normalize='columns') * 100, annot=True, fmt='.2f')

In [None]:
pd.crosstab( df['Survived'],df['Sex'],normalize='columns') * 100


In [None]:
pd.crosstab( df['Survived'],df['Embarked'],normalize='columns') * 100

In [None]:
pd.crosstab( df['Sex'],df['Embarked'],normalize='columns') * 100

In [None]:
pd.crosstab( df['Pclass'],df['Embarked'],normalize='columns') * 100

In [None]:
# Age density for survivors vs. non-survivors, drawn in the same cell so the
# two curves can be compared directly.
df.loc[df['Survived'] == 1, 'Age'].plot.kde(label='Survived', legend=True)
df.loc[df['Survived'] == 0, 'Age'].plot.kde(label='Not Survived', legend=True)

### Feature Engineering Ideas

In [None]:
df1=pd.read_csv('test.csv')
df1.head()

In [None]:
df.describe()

In [None]:
df['SibSp'].value_counts()

In [None]:
df[df['SibSp'] == 8]

In [None]:
df[df["Ticket"] == 'CA. 2343']

In [None]:
# Stack the train and test rows into one frame for feature engineering.
# ignore_index=True gives the result a fresh 0..n-1 row index. Without it df and
# df1 each keep the default 0-based labels assigned by read_csv, so row labels
# repeat and label-based operations (reindexing, some plotting helpers) fail.
df3 = pd.concat([df, df1], ignore_index=True)

In [None]:
df3[df3["Ticket"] == 'CA. 2343']

In [None]:
df3['Individual_Fare'] = df3['Fare']/(df3['SibSp'] + df3['Parch'] + 1)

In [None]:
df3['Individual_Fare'].plot(kind='box', title='Individual Fare Distribution')

In [None]:
df3['Family_size'] = df3['SibSp'] + df3['Parch'] + 1

In [None]:
df3.sample()

In [None]:
# Family type buckets: alone, small, large

def transform_family_size(size):
    """Bucket a family size into a coarse family-type label.

    Parameters
    ----------
    size : int or float
        Number of people in the travelling party (this notebook computes it
        as SibSp + Parch + 1).

    Returns
    -------
    str
        'Alone' when size is exactly 1, 'Small' for any other size up to and
        including 4, 'Large' for anything bigger. A missing size (NaN/None)
        is returned unchanged so it stays missing instead of being labelled.
    """
    # NaN fails both comparisons below and would otherwise fall through to 'Large'.
    if pd.isna(size):
        return size
    if size == 1:
        return 'Alone'
    if size <= 4:
        return 'Small'
    return 'Large'

In [None]:
df3['Family_type'] = df3['Family_size'].apply(transform_family_size)


In [None]:
df3.sample(6)

## Bivariate Analysis

In [None]:
pd.crosstab( df3['Survived'],df3['Family_type'],normalize='columns') * 100

In [None]:
# df3["Name"].str.split(',').str[0]
df3["surname"] = df3["Name"].str.split(',').str.get(0)

In [None]:
df3["title"] = df3["Name"].str.split(',').str.get(1).str.strip().str.split(' ').str.get(0)


In [None]:
df3['title'].value_counts()

In [None]:
# Collapse rare honorifics into the main groups (Miss., Mrs., Mr., other).
#
# Series.replace with a dict matches whole cell values literally, so the result
# does not depend on:
#   * the pandas version's default for `regex` in Series.str.replace (where the
#     trailing '.' would act as a wildcard), or
#   * statement order (a substring replace of 'Don.' would also hit 'Dona.'
#     unless 'Dona.' had been rewritten first).
# Titles not listed here are left untouched.
TITLE_GROUPS = {
    # unmarried women
    'Mlle.': 'Miss.',
    'Ms.': 'Miss.',
    # married women
    'Mme.': 'Mrs.',
    'Dona.': 'Mrs.',
    'Lady.': 'Mrs.',
    # men
    'Don.': 'Mr.',
    'Sir.': 'Mr.',
    'Jonkheer.': 'Mr.',
    # everything else; a bare 'the' is what the title extraction yields for
    # names whose title starts with "the ..."
    'the': 'other',
    'Rev.': 'other',
    'Dr.': 'other',
    'Col.': 'other',
    'Major.': 'other',
    'Capt.': 'other',
}

df3['title'] = df3['title'].replace(TITLE_GROUPS)

In [None]:
df3['Cabin'].isnull().sum()/len(df3['Cabin'])

In [None]:
df3['Cabin'].fillna('M', inplace=True)

In [None]:
# A cell only renders its last expression, so the summary is shown explicitly
# with display() (provided by the notebook kernel); otherwise it is discarded.
display(df3['Cabin'].describe())
df3['Cabin'].value_counts()

In [None]:
df3['deck'] = df3['Cabin'].str[0]

In [None]:
df3['deck'].value_counts()

In [None]:
pd.crosstab(df3['deck'], df3['Pclass'])

In [None]:
# Correlate the numeric columns once and reuse the matrix for both outputs.
numeric_corr = df3.select_dtypes(include='number').corr()

# A cell only renders its last expression, so the Survived correlations are
# shown explicitly with display() (provided by the notebook kernel).
display(numeric_corr['Survived'])
sns.heatmap(numeric_corr, annot=True, fmt='.2f')

In [None]:
# Make sure row labels are a unique 0..n-1 range (concatenated frames can carry
# repeated labels).
df3 = df3.reset_index(drop=True)

# Sanity check: names of any duplicated columns. Kept as the cell's last
# expression so the result is actually rendered.
df3.columns[df3.columns.duplicated()]

In [None]:
sns.pairplot(df3.select_dtypes(include='number'))

In [None]:
df3.to_csv("titanic_cleaned.csv", index=False)
