In [47]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the competition input folder and print the full path of every file found.
for folder, _subdirs, files in os.walk('../input/titanic'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Importing the essential modules**

In [48]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from IPython.display import display

In [49]:
# Directory holding the competition files; kept in one place so it is easy to retarget.
DATA_DIR = "../input/titanic"

# Labelled training set and unlabelled competition test set.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

# **Display the first 10 rows of train and test**

In [50]:
# Render the first 10 rows of the training frame.
display(train.head(10))

In [51]:
# Render the first 10 rows of the test frame.
display(test.head(10))

In [52]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [53]:
# Column dtypes and non-null counts of the test frame.
test.info()

In [54]:
# Summary statistics for every training column (include="all" also covers non-numeric columns).
train.describe(include = "all")

In [55]:
# Summary statistics for every test column (include="all" also covers non-numeric columns).
test.describe(include = "all")

In [56]:
# Compare the number of columns in the two frames (shape[1] is the column count).
train_cols, test_cols = train.shape[1], test.shape[1]
print("Number of column in train:", train_cols, " Number of column in test:", test_cols)

In [57]:
# Compare the number of rows in the two frames.
train_rows, test_rows = len(train), len(test)
print("Number of rows in train:", train_rows, "Number of rows in test:", test_rows)

# **Data Cleaning**
1. Removing null values
2. Removing unwanted column
3. Formatting the column
4. Adding new columns as needed

In [58]:
def null_value(df):
    """Summarise the missing values of every column of ``df``.

    Returns a DataFrame indexed by column name with two columns:
    ``Count`` (number of missing cells) and ``Percentage`` (missing cells as a
    percentage of the number of rows). Each series is sorted in descending
    order before the two are combined side by side.
    """
    missing = df.isna()
    per_column = missing.sum()
    count = per_column.sort_values(ascending=False)
    # missing.count() is the row count per column, because the boolean mask has no NaN.
    percentage = (per_column / missing.count() * 100).sort_values(ascending=False)
    return pd.concat([count, percentage], axis=1, keys=["Count", "Percentage"])

In [59]:
# Missing-value count and percentage per column of the training frame.
null_value(train)

In [60]:
# Missing-value count and percentage per column of the test frame.
null_value(test)

In [61]:
# Drop Cabin from both frames: it has the highest share of nulls, and the
# author's assumption is that Fare and Pclass already reflect the cabin.
train = train.drop(columns="Cabin")
test = test.drop(columns="Cabin")

In [62]:
# Fill missing Age with the mean age and missing Embarked with the most frequent port.
# Assign the result back to the column: an in-place replace() on a column
# selection works on a temporary and does not reliably update `train`.
train["Age"] = train["Age"].fillna(train["Age"].mean())
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].mode()[0])
# Both counts should now be 0.
train["Age"].isna().sum(), train["Embarked"].isna().sum()

In [63]:
# Fill missing Age with the mean age and missing Fare with the most frequent fare.
# Assign the result back to the column: an in-place replace() on a column
# selection works on a temporary and does not reliably update `test`.
test["Age"] = test["Age"].fillna(test["Age"].mean())
test["Fare"] = test["Fare"].fillna(test["Fare"].mode()[0])
# Both counts should now be 0.
test["Age"].isna().sum(), test["Fare"].isna().sum()

In [64]:
# Drop Name and Ticket from both frames; the author found no recurring pattern in them.
text_columns = ["Name", "Ticket"]
train = train.drop(columns=text_columns)
test = test.drop(columns=text_columns)

**Visualization of Sex and the Survival count**

In [65]:
# Count passengers for every (Sex, Survived) combination so the chart actually
# compares survival between the sexes. Grouping by Survived alone only gives
# the overall number of survivors and non-survivors.
dd = pd.crosstab(train["Sex"], train["Survived"])
ax = dd.plot(kind = "bar", figsize = (10,7))
ax.set_xlabel("Sex")
ax.set_ylabel("Number of passengers")
ax.legend(title = "Survived")
plt.show()

In [66]:
# Count passengers for every (Embarked, Survived) combination so the chart
# compares survival between embarkation ports. Grouping by Survived alone only
# gives the overall number of survivors and non-survivors.
dd = pd.crosstab(train["Embarked"], train["Survived"])
ax = dd.plot(kind = "bar", figsize = (10,7))
ax.set_xlabel("Embarked")
ax.set_ylabel("Number of passengers")
ax.legend(title = "Survived")
plt.show()

As we can see, both **Sex** and **Embarked** show a strong dependency/relationship with the target.

In [67]:
# One-hot encode Sex and Embarked. The columns are named explicitly so the
# prefixes always line up with the encoded columns, even if another
# non-numeric column is present in the frame.
categorical_columns = ["Sex", "Embarked"]
train = pd.get_dummies(train, columns = categorical_columns, prefix = categorical_columns)
test = pd.get_dummies(test, columns = categorical_columns, prefix = categorical_columns)

In [68]:
# Preview the encoded training frame (first rows only, not the whole table).
train.head()

In [69]:
#combining the SibSp and Parch
def merge(df):
    """Return a copy of ``df`` with a ``Relatives`` column (``SibSp + Parch``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the numeric columns ``SibSp`` and ``Parch``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``df`` plus the
        ``Relatives`` column; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``SibSp`` or ``Parch`` is missing from ``df``.
    """
    # The sum is computed on the frame's own index, so rows stay aligned even
    # when the index is not the default 0..n-1 range.
    return df.assign(Relatives=df["SibSp"] + df["Parch"])

In [70]:
# Add the Relatives column (SibSp + Parch) to the training frame.
train = merge(train)

In [71]:
# SibSp and Parch are now represented by Relatives, so remove the originals.
train = train.drop(columns=["SibSp", "Parch"])

In [72]:
# Show the row with index label 888 (presumably a spot check of the transformed frame).
train.loc[888]

In [73]:
# Same transformation as for train: add Relatives, then remove SibSp and Parch.
test = merge(test).drop(columns=["SibSp", "Parch"])

In [74]:
# Preview the first rows of the transformed test frame.
test.head()

# **Feature Importance using ExtraTreesClassifier**

In [75]:
# Separate the target vector from the feature matrix.
y = train["Survived"]
x = train.drop(columns=["Survived"])

In [76]:
# Create and fit the extra-trees model used for the feature-importance plot.
# The seed is fixed because the trees are randomised; without it the
# importances change on every run.
model=ExtraTreesClassifier(random_state = 10)
model.fit(x,y)

In [77]:
# Horizontal bar chart of the importance the fitted model assigns to each feature.
ft = pd.Series(data=model.feature_importances_, index=x.columns)
ft.plot.barh()
plt.show()

In [78]:
# Annotated heatmap of the pairwise correlations between the columns of train.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(train.corr(), annot=True, ax=ax)
plt.show()

In [79]:
# Drop PassengerId: its correlation with Survived is close to zero.
# The feature matrix `x` was built from `train` before this cell, so dropping
# the column from `train` alone would leave it in the data the models are
# trained on. Remove it from `x` and from `test` as well so both keep the
# same feature columns.
train = train.drop(columns="PassengerId", errors="ignore")
x = x.drop(columns="PassengerId", errors="ignore")
test = test.drop(columns="PassengerId", errors="ignore")

# **Data Preprocessing**

In [80]:
# Hold out 20% of the labelled data for validation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 10)

# Fit the scaler on the training split only and reuse it for the validation
# split and the competition test set. Fitting a separate scaler on each set
# would put them on different scales than the data the models were trained on.
feature_columns = X_train.columns
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Select the test columns by name so they match the training features in content and order.
test = scaler.transform(test[feature_columns])

**Logistic Regression**

In [81]:
# The default lbfgs solver only supports the l2 penalty, so every l1 and
# elasticnet candidate in the grid failed to fit. saga supports all three
# penalties; elasticnet additionally needs an l1_ratio.
lr = LogisticRegression(solver = "saga", max_iter = 5000, random_state = 10)
tolerances = (0.1, 0.01, 0.001, 0.0001)
strengths = (10.0, 1.0, 0.1, 0.01)
params = [
    {"penalty": ("l1", "l2"), "tol": tolerances, "C": strengths},
    {"penalty": ("elasticnet",), "l1_ratio": (0.5,), "tol": tolerances, "C": strengths},
]
# 10-fold cross-validated grid search over the regularisation settings.
modelLR = GridSearchCV(lr, params, cv=10)
modelLR.fit(X_train, y_train)

In [82]:
# Accuracy of the tuned logistic regression on the held-out split.
lr_predictions = modelLR.predict(X_test)
print(accuracy_score(y_test, lr_predictions))

**Support Vector Machine(SVM)**

In [83]:
# Support-vector classifier with an RBF kernel, fitted on the scaled training split.
modelSVM = svm.SVC(kernel = "rbf")
modelSVM.fit(X_train,y_train)

In [84]:
# Accuracy of the SVM on the held-out split.
svm_predictions = modelSVM.predict(X_test)
print(accuracy_score(y_test, svm_predictions))

**DecisionTreeClassifier**

In [85]:
# Decision tree using entropy as the split criterion. The seed is fixed
# because tie-breaking between equally good splits is random, which makes the
# reported accuracy vary between runs.
modelDTC = DecisionTreeClassifier(criterion="entropy", random_state=10)
modelDTC.fit(X_train,y_train)

In [86]:
# Accuracy of the decision tree on the held-out split.
dtc_predictions = modelDTC.predict(X_test)
print(accuracy_score(y_test, dtc_predictions))

**KNeighborsClassifier**

In [87]:
# k-nearest-neighbours classifier voting over the 4 closest training samples.
modelKNC = KNeighborsClassifier(n_neighbors=4)
modelKNC.fit(X_train, y_train)

In [88]:
# Accuracy of the k-nearest-neighbours model on the held-out split.
knc_predictions = modelKNC.predict(X_test)
print(accuracy_score(y_test, knc_predictions))

In [90]:
# Collect the held-out accuracy of every fitted model into one comparison table.
fitted_models = {
    "LogisticRegression": modelLR,
    "SVM": modelSVM,
    "DecisionTreeClassifier": modelDTC,
    "KNeighborsClassifier": modelKNC,
}
report = pd.DataFrame({
    "Model": list(fitted_models),
    "Accuracy": [
        accuracy_score(y_test, fitted.predict(X_test))
        for fitted in fitted_models.values()
    ],
})

In [91]:
# Show the comparison table sorted by accuracy (ascending, so the best model is the last row).
report.sort_values(by = "Accuracy")

# Test Data Prediction

**Using SVM as it has better accuracy than the other models**

In [93]:
# Predict survival for the competition test set (`test` was scaled in the preprocessing cell).
pred = modelSVM.predict(test)

In [95]:
# Re-read only the passenger ids, because `test` was replaced by a scaled
# array during preprocessing. A dedicated name is used so the feature matrix
# `x` is not overwritten, which would break a re-run of the earlier cells.
passenger_ids = pd.read_csv("../input/titanic/test.csv", usecols=["PassengerId"])["PassengerId"]
prediction = pd.DataFrame({"Passenger ID": passenger_ids, "Survived": pred})

In [97]:
# Show 10 random predictions; the seed keeps the displayed rows the same on every run.
prediction.sample(10, random_state=10)

### You can also use feature selection module from [scikit-learn](https://scikit-learn.org/stable/modules/feature_selection.html) to know the best feature for better accuracy.
### Such as: SelectKBest -> Univariate feature selection works by selecting the best features based on univariate statistical tests. It can be seen as a preprocessing step to an estimator

>**Inspired by one of the contributors of this competition**