In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
import matplotlib.pyplot as plt
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and the
# old names were later removed; use whichever name this installation provides.
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

In [6]:
# r is read from train.csv and e from test.csv of the Titanic dataset.
r, e = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv',
                     '/kaggle/input/titanic/test.csv')
)

In [7]:
r.columns

In [8]:
r.head()

In [9]:
r.describe()

In [10]:
r.info()

### Univariate Variable Analysis

#### Categories

In [11]:
def bar(va):
    """Show a bar chart of the value frequencies of one column of ``r``.

    Args:
        va: name of a column in the module-level DataFrame ``r``.

    The chart is displayed first, then the value counts are printed.
    """
    counts = r[va].value_counts()

    plt.figure(figsize=(9, 4))
    plt.bar(counts.index, counts)
    # Label every bar with the category value it represents.
    plt.xticks(counts.index, counts.index.values)
    plt.ylabel('frequency')
    plt.title(va)
    plt.show()

    print('{}: \n{}'.format(va, counts))

In [12]:
v = ['Survived','Sex','Pclass','Embarked','SibSp','Parch']

for c in v:
    bar(c)

In [13]:
c1 = ['Cabin','Name','Ticket']

for c in c1:
    print('{} \n'.format(r[c].value_counts()))

#### Numbers

In [14]:
def hist(v):
    """Draw a 50-bin histogram of column ``v`` of the module-level DataFrame ``r``.

    Args:
        v: name of a column in ``r``.
    """
    column = r[v]
    plt.figure(figsize=(9, 3))
    plt.hist(column, bins=50)
    plt.title('histogram for {}'.format(v))
    plt.xlabel(v)
    plt.ylabel('frequency')

In [16]:
# Columns to draw histograms for.
n = ['Fare','Age','PassengerId']

# The loop variable must not reuse the list's name: `for n in n` rebinds `n`
# to the last column name, so re-running the cell would iterate over the
# characters of that string and fail inside hist().
for col in n:
    hist(col)

### Data Analysis

In [18]:
def da(x):
    """Return the mean of ``Survived`` for each value of column ``x`` in ``r``.

    Args:
        x: name of the column of the module-level DataFrame ``r`` to group by.

    Returns:
        DataFrame with columns ``x`` and ``Survived``, sorted from the highest
        to the lowest mean.
    """
    pair = r[[x, "Survived"]]
    means = pair.groupby([x], as_index=False).mean()
    return means.sort_values(by="Survived", ascending=False)

In [19]:
da('Pclass')

In [20]:
da('SibSp')

In [21]:
da('Parch')

### Outlier Detection

In [22]:
def detect_outliers(df, features, n=2):
    """Return index labels of rows that are IQR outliers in more than ``n`` features.

    A value counts as an outlier for a feature when it lies more than
    1.5 * IQR below the first quartile or above the third quartile.

    Args:
        df: DataFrame holding the data.
        features: iterable of numeric column names to inspect.
        n: a row is reported only when it is an outlier in strictly more than
            ``n`` of the features. Defaults to 2.

    Returns:
        List of index labels of ``df``, in order of first detection.
    """
    outlier_counts = Counter()

    for c in features:
        # nanpercentile, not percentile: np.percentile returns NaN as soon as
        # the column contains a missing value, which makes both comparisons
        # below False and silently disables detection for that feature.
        Q1, Q3 = np.nanpercentile(df[c], [25, 75])
        outlier_step = (Q3 - Q1) * 1.5
        is_outlier = (df[c] < Q1 - outlier_step) | (df[c] > Q3 + outlier_step)
        outlier_counts.update(df.index[is_outlier])

    return [idx for idx, count in outlier_counts.items() if count > n]

In [23]:
r.loc[detect_outliers(r,["Age","SibSp","Parch","Fare"])]

In [24]:
r = r.drop(detect_outliers(r,["Age","SibSp","Parch","Fare"]),axis = 0).reset_index(drop = True)

### Missing value

In [25]:
# Remember how many rows r has before e is appended; rl is used further down
# to slice the combined frame apart again.
rl = r.shape[0]
r = pd.concat([r, e], axis = 0, ignore_index = True)

In [26]:
r.head()

In [27]:
r.columns[r.isnull().any()]

In [28]:
r.isnull().sum()

In [29]:
r[r["Embarked"].isnull()]

In [30]:
r.boxplot(column="Fare",by = "Embarked")
plt.show()

In [31]:
r["Embarked"] = r["Embarked"].fillna("C")
r[r["Embarked"].isnull()]

In [33]:
r[r["Fare"].isnull()]

In [34]:
r["Fare"] = r["Fare"].fillna(np.mean(r[r["Pclass"] == 3]["Fare"]))

In [35]:
r[r["Fare"].isnull()]

### Visualization

In [37]:
list1 = ["SibSp", "Parch", "Age", "Fare", "Survived"]
sns.heatmap(r[list1].corr(), annot = True, fmt = ".2f");

In [39]:
# seaborn 0.9 renamed factorplot -> catplot and size -> height; the old
# function name has since been removed, so use the current API.
g = sns.catplot(x = "SibSp", y = "Survived", data = r, kind = "bar", height = 6)
g.set_ylabels("Survived Probability");

In [41]:
# seaborn 0.9 renamed factorplot -> catplot and size -> height; the old
# function name has since been removed, so use the current API.
g = sns.catplot(x = "Parch", y = "Survived", kind = "bar", data = r, height = 6)
g.set_ylabels("Survived Probability");

In [42]:
# seaborn 0.9 renamed factorplot -> catplot and size -> height; the old
# function name has since been removed, so use the current API.
g = sns.catplot(x = "Pclass", y = "Survived", data = r, kind = "bar", height = 6)
g.set_ylabels("Survived Probability");

In [44]:
g = sns.FacetGrid(r, col = "Survived")
g.map(sns.distplot, "Age", bins = 25);

In [46]:
# One Age histogram per (Pclass, Survived) combination.
# `height` is the current name of FacetGrid's per-facet size argument
# (it was called `size` before seaborn 0.9).
g = sns.FacetGrid(r, col = "Survived", row = "Pclass", height = 4)
g.map(plt.hist, "Age", bins = 25)
g.add_legend();

In [48]:
# `height` is the current name of FacetGrid's per-facet size argument
# (it was called `size` before seaborn 0.9).
g = sns.FacetGrid(r, row = "Embarked", height = 4)
# Fix the category order explicitly: FacetGrid.map calls pointplot once per
# facet on that facet's subset only, so without order/hue_order the facets can
# disagree on which position or colour belongs to which category.
g.map(sns.pointplot, "Pclass", "Survived", "Sex",
      order = [1, 2, 3], hue_order = ["male", "female"])
g.add_legend();

In [50]:
# `height` is the current name of FacetGrid's per-facet size argument
# (it was called `size` before seaborn 0.9).
g = sns.FacetGrid(r, row = "Embarked", col = "Survived", height = 4)
# Fix the bar order explicitly: each facet is drawn from its own subset, so
# without `order` the facets are not guaranteed to place the sexes identically.
g.map(sns.barplot, "Sex", "Fare", order = ["male", "female"])
g.add_legend();

In [52]:
r[r["Age"].isnull()]

In [53]:
# catplot is the current name of the removed sns.factorplot.
sns.catplot(x = "Sex", y = "Age", data = r, kind = "box");

In [55]:
# catplot is the current name of the removed sns.factorplot.
sns.catplot(x = "Sex", y = "Age", hue = "Pclass", data = r, kind = "box");

In [57]:
# catplot is the current name of the removed sns.factorplot.
sns.catplot(x = "Parch", y = "Age", data = r, kind = "box");

In [58]:
# catplot is the current name of the removed sns.factorplot.
sns.catplot(x = "SibSp", y = "Age", data = r, kind = "box");

In [59]:
# "Sex" still holds the strings "male"/"female" at this point, so corr() would
# either leave it out or raise, depending on the pandas version. Encode it on a
# temporary frame so it takes part in the correlation; r itself is not
# modified because later cells still expect the string values.
age_corr_frame = r[["Age","Sex","SibSp","Parch","Pclass"]].assign(
    Sex = lambda frame: frame["Sex"].map({"male": 1, "female": 0})
)
sns.heatmap(age_corr_frame.corr(), annot = True);

In [60]:
# Fill each missing Age with the median Age of passengers sharing the same
# SibSp, Parch and Pclass; when that group has no known age at all, fall back
# to the overall median.
#
# This replaces a per-row loop that wrote through r["Age"].iloc[i] = ...:
# that chained assignment is not guaranteed to reach r (it does nothing under
# pandas copy-on-write), and because each fill fed into the medians of later
# rows the result depended on row order. Both medians below are computed from
# the observed ages only.
age_group_median = r.groupby(["SibSp", "Parch", "Pclass"])["Age"].transform("median")
age_overall_median = r["Age"].median()
r["Age"] = r["Age"].fillna(age_group_median).fillna(age_overall_median)

In [61]:
r[r["Age"].isnull()]

### Feature Engineering

In [62]:
r["Name"].head(10)

In [64]:
name = r["Name"]
# The title is whatever sits between the last comma and the first period of
# the part of the name before that period.
r["Title"] = name.map(lambda full_name: full_name.split(".")[0].split(",")[-1].strip())

In [65]:
r["Title"].head(10)

In [66]:
sns.countplot(x="Title", data = r)
plt.xticks(rotation = 60);

In [67]:
# Collapse the listed titles into a single "other" bucket.
r["Title"] = r["Title"].replace(["Lady","the Countess","Capt","Col","Don","Dr","Major","Rev","Sir","Jonkheer","Dona"],"other")
# Integer codes: 0 = Master, 1 = Miss/Ms/Mlle/Mrs, 2 = Mr, 3 = anything else.
title_codes = {"Master": 0, "Miss": 1, "Ms": 1, "Mlle": 1, "Mrs": 1, "Mr": 2}
r["Title"] = [title_codes.get(title, 3) for title in r["Title"]]
r["Title"].head(20)

In [68]:
sns.countplot(x="Title", data = r)
plt.xticks(rotation = 60);

In [69]:
# catplot is the current name of the removed sns.factorplot.
g = sns.catplot(x = "Title", y = "Survived", data = r, kind = "bar")
g.set_xticklabels(["Master","Mrs","Mr","Other"])
g.set_ylabels("Survival Probability");

In [70]:
r.drop(labels = ["Name"], axis = 1, inplace = True)

In [71]:
r.head()

In [72]:
r["Fsize"] = r["SibSp"] + r["Parch"] + 1

In [73]:
r.head()

In [74]:
# catplot is the current name of the removed sns.factorplot.
g = sns.catplot(x = "Fsize", y = "Survived", data = r, kind = "bar")
g.set_ylabels("Survival");

In [75]:
# 1 when Fsize is below 5, otherwise 0.
r["family_size"] = (r["Fsize"] < 5).astype("int64")

In [76]:
r.head(10)

In [78]:
sns.countplot(x = "family_size", data = r);

In [79]:
# catplot is the current name of the removed sns.factorplot.
g = sns.catplot(x = "family_size", y = "Survived", data = r, kind = "bar")
g.set_ylabels("Survival");

In [82]:
sns.countplot(x = "Embarked", data = r);

In [83]:
r = pd.get_dummies(r, columns=["Embarked"])
r.head()

In [84]:
r["Ticket"].head(20)

In [86]:
# Reduce each ticket to its prefix: purely numeric tickets become "x";
# otherwise "." and "/" are removed and the first space-separated token is kept.
tickets = [
    "x" if ticket.isdigit()
    else ticket.replace(".","").replace("/","").strip().split(" ")[0]
    for ticket in r.Ticket
]
r["Ticket"] = tickets

In [87]:
r['Ticket'].head()

In [88]:
r = pd.get_dummies(r, columns= ["Ticket"], prefix = "T")
r.head(10)

In [89]:
sns.countplot(x = "Pclass", data = r);

In [90]:
r["Pclass"] = r["Pclass"].astype("category")
r = pd.get_dummies(r, columns= ["Pclass"])

In [91]:
r["Sex"] = r["Sex"].astype("category")
r = pd.get_dummies(r, columns=["Sex"])
r.head()

In [92]:
r.drop(labels = ["PassengerId", "Cabin"], axis = 1, inplace = True)

In [93]:
r.columns

### Model

In [94]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [95]:
rl

In [96]:
# Rows from position rl onward are the ones appended from e earlier.
# Drop 'Survived' by building a new frame: an inplace drop on a slice of r
# triggers SettingWithCopyWarning and mutates an object whose relationship to
# r is ambiguous.
e = r[rl:].drop(labels = ['Survived'], axis = 1)

In [97]:
e.head()

In [99]:
# Keep only the first rl rows, then separate the target from the features.
r = r[:rl]
y = r['Survived']
X = r.drop(columns = 'Survived')

# train_test_split returns (X_train, X_test, y_train, y_test), so:
#   a = training features, d = held-out features,
#   s = training labels,   f = held-out labels.
a, d, s, f = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [101]:
print(a.shape)
print(s.shape)
print(d.shape)
print(f.shape)

#### Logistic Regression

In [110]:
# Fit on (a, s) and report accuracy on both (a, s) and (d, f).
lr = LogisticRegression().fit(a, s)

for caption, features, labels in (('training accuracy : ', a, s),
                                  ('testing accuracy : ', d, f)):
    print(caption, round(lr.score(features, labels)*100, 2), '%')

In [111]:
random_state = 42
# Candidate models; the order must match the parameter grids in classifier_param.
classifier = [DecisionTreeClassifier(random_state = random_state),
             SVC(random_state = random_state),
             RandomForestClassifier(random_state = random_state),
             # liblinear is set explicitly because the logistic-regression grid
             # searches penalty "l1" as well as "l2", and the default lbfgs
             # solver does not support "l1" (those fits would all fail).
             LogisticRegression(random_state = random_state, solver = "liblinear"),
             KNeighborsClassifier()]

dt_param_grid = {"min_samples_split" : range(10,500,20),
                "max_depth": range(1,20,2)}

svc_param_grid = {"kernel" : ["rbf"],
                 "gamma": [0.001, 0.01, 0.1, 1],
                 "C": [1,10,50,100,200,300,1000]}

rf_param_grid = {"max_features": [1,3,10],
                "min_samples_split":[2,3,10],
                "min_samples_leaf":[1,3,10],
                "bootstrap":[False],
                "n_estimators":[100,300],
                "criterion":["gini"]}

logreg_param_grid = {"C":np.logspace(-3,3,7),
                    "penalty": ["l1","l2"]}

knn_param_grid = {"n_neighbors": np.linspace(1,19,10, dtype = int).tolist(),
                 "weights": ["uniform","distance"],
                 "metric":["euclidean","manhattan"]}
classifier_param = [dt_param_grid,
                    svc_param_grid,
                   rf_param_grid,
                   logreg_param_grid,
                   knn_param_grid]

In [113]:
# Grid-search every classifier with its own parameter grid using 10-fold
# stratified CV on (a, s); keep the best score and best estimator of each.
cv_result = []
best_estimators = []
for model, grid in zip(classifier, classifier_param):
    clf = GridSearchCV(
        model,
        param_grid = grid,
        cv = StratifiedKFold(n_splits = 10),
        scoring = "accuracy",
        n_jobs = -1,
        verbose = 1,
    )
    clf.fit(a, s)
    cv_result.append(clf.best_score_)
    best_estimators.append(clf.best_estimator_)
    print(cv_result[-1])

In [116]:
# One row per model: its best cross-validation accuracy and a display name.
# The names are listed in the same order as the scores in cv_result.
cv_results = pd.DataFrame({"Cross Validation Means":cv_result, "ML Models":["DecisionTreeClassifier", "SVM","RandomForestClassifier",
             "LogisticRegression",
             "KNeighborsClassifier"]})

# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form fails there.
g = sns.barplot(x = "Cross Validation Means", y = "ML Models", data = cv_results)
g.set_xlabel("Mean Accuracy")
g.set_title("Cross Validation Scores");

#### Ensemble model

In [117]:
# Soft-voting ensemble of the tuned decision tree (index 0), random forest
# (index 2) and logistic regression (index 3) from best_estimators.
votingC = VotingClassifier(estimators = [("dt",best_estimators[0]),
                                        ("rfc",best_estimators[2]),
                                        ("lr",best_estimators[3])],
                                        voting = "soft", n_jobs = -1)
votingC = votingC.fit(a, s)
# accuracy_score takes (y_true, y_pred). Accuracy is symmetric so the number
# is unchanged, but the correct order keeps this safe if the metric is swapped.
print(accuracy_score(f, votingC.predict(d)))