In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the train and test CSV files from the local ./input directory.
train_data=pd.read_csv('./input/train.csv') 
test_data=pd.read_csv('./input/test.csv')

# Keep the label column in its own variable.
target = train_data['Survived']

In [None]:
# Bar plot of Survived against Embarked, with the bars split by Sex.
sns.barplot(x="Embarked", y="Survived", hue="Sex", data=train_data)

In [None]:
# Row/column counts of the raw train and test frames.
print('Shape of train dataset:-',train_data.shape)
print('Shape of test dataset:-' ,test_data.shape)

# Column dtypes and non-null counts, followed by summary statistics.

print('\n')
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
train_data.info()
train_data.describe()

In [None]:
# Stack the train features (label column removed) on top of the test rows so
# every preprocessing step below is applied to both sets at once.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the row labels of both inputs are kept and therefore repeat.
total=pd.concat([train_data.drop('Survived',axis=1),test_data], ignore_index=True)
target=train_data['Survived']

total.head()

In [None]:
def missing_value_percentage(df):
    """Print, for every column of ``df``, how many values are missing.

    The printed table has a ``Total`` column (number of nulls) and a
    ``Percent`` column (share of rows, rounded to two decimals), ordered from
    the most to the fewest missing values. Nothing is returned.
    """
    null_counts = df.isnull().sum().sort_values(ascending=False)
    null_share = (null_counts / len(df) * 100).round(2)
    summary = pd.concat([null_counts, null_share], axis=1, keys=['Total', 'Percent'])
    print(summary)

# Missing-value report for the combined train + test frame.
missing_value_percentage(total)

In [None]:
# Annotated correlation heatmap of the numeric columns (PassengerId excluded).
# numeric_only=True is needed because the frame still holds text columns at
# this point; pandas >= 2.0 raises on them instead of silently skipping them.
sns.heatmap(total.drop('PassengerId',axis=1).corr(numeric_only=True),annot=True)

In [None]:
def column_percent_value(df, feature):
    """Tabulate how often each value of ``df[feature]`` occurs.

    Missing values are counted as their own category. The returned frame is
    indexed by value and has two columns: ``Total`` (absolute count) and
    ``Percent`` (share of all rows in percent, rounded to two decimals).
    """
    column = df.loc[:, feature]

    counts = column.value_counts(dropna=False).to_frame()
    counts.columns = ["Total"]

    shares = (column.value_counts(dropna=False, normalize=True) * 100).round(2).to_frame()
    shares.columns = ['Percent']

    return pd.concat([counts, shares], axis=1)

In [None]:
# Value distribution of Embarked (missing values included) in the combined frame.
column_percent_value(total, 'Embarked')

In [None]:
# Fill missing Embarked values with "S" and missing Age / Fare values with the
# respective column median.
# The result is assigned back instead of calling fillna(inplace=True) on the
# attribute-accessed column: that form operates on an intermediate object, is
# flagged by pandas as unreliable, and does nothing under copy-on-write.
total['Embarked'] = total['Embarked'].fillna("S")
total['Age'] = total.Age.fillna(total.Age.median())
total['Fare'] = total.Fare.fillna(total.Fare.median())

In [None]:
# Per-column count of values that are still missing after imputation.
print(total.isnull().sum())

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the Sex and Embarked columns (the same encoder object is
# re-fitted for each, Sex first), then one-hot encode Pclass and the
# integer-coded Embarked column.
encoder = LabelEncoder()
total = pd.get_dummies(
    total.assign(
        Sex=encoder.fit_transform(total['Sex']),
        Embarked=encoder.fit_transform(total['Embarked']),
    ),
    columns=['Pclass', 'Embarked'],
)

In [None]:
# Interaction feature: element-wise product of the Embarked_2 dummy, the
# Pclass_1 dummy and the label-encoded Sex column.
# NOTE(review): Embarked_2 is presumably the dummy for "S" and Sex == 1 presumably
# means male (LabelEncoder sorts its classes) - confirm against the data.
total['Fare_1_S']=total['Embarked_2']*total['Pclass_1']*total['Sex']

In [None]:
def simplify_ages(df):
    """Add an ``Age_cat`` column holding the quantile bucket of each ``Age``.

    ``Age`` is cut at the 16 / 33 / 49 / 66 / 83 % quantiles of the frame that
    was passed in, yielding integer bucket codes 0-5. The frame is modified in
    place and also returned.
    """
    # Bin the caller's own frame; the previous version read the notebook-level
    # `total` frame and therefore ignored the `df` argument.
    df['Age_cat'] = pd.qcut(df['Age'], q=[0, .16, .33, .49, .66, .83, 1], labels=False, precision=1)
    return df

# def simplify_cabins(df):
#     df.Cabin = df.Cabin.fillna('N')
#     df.Cabin = df.Cabin.apply(lambda x: x[0])
#     return df

def format_name(df):
    """Add a numeric ``Title`` column derived from the ``Name`` column.

    The title is the first run of letters that follows a space and ends with a
    dot (e.g. "Mr" in "Braund, Mr. Owen"). Uncommon titles are grouped as
    "Rare", the variants Mlle / Ms / Mme are folded into Miss / Miss / Mrs, and
    the result is mapped to an integer code. Titles without a code become 0.
    The frame is modified in place and also returned.
    """
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                   'Rev', 'Sir', 'Jonkheer', 'Dona']
    title_variants = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
    # NOTE(review): "Miss" and "Mrs" share code 2 - confirm this is intentional.
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 2, "Master": 3, "Rare": 4}

    # Raw string so that `\.` reaches the regex engine as written and is not
    # treated as an (invalid) Python string escape.
    titles = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    titles = titles.replace(rare_titles, 'Rare').replace(title_variants)

    df['Title'] = titles.map(title_mapping).fillna(0)
    return df

def simplify_family(df):
    """Add ``FamilySize`` and its bucketed form ``FamilySize_cat`` to ``df``.

    ``FamilySize`` is SibSp + Parch + 1. The bucket is 1 for a size of exactly
    1, 2 for sizes 2-4, 3 for sizes 5-7 and 4 for everything else. The frame
    is modified in place and also returned.
    """
    def _size_bucket(size):
        if size == 1:
            return 1
        if 2 <= size < 5:
            return 2
        if 5 <= size < 8:
            return 3
        return 4

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['FamilySize_cat'] = df['FamilySize'].map(_size_bucket)
    return df

def drop_features(df):
    """Return a new frame without the Cabin, Ticket and Name columns."""
    unused_columns = ['Cabin', 'Ticket', 'Name']
    return df.drop(columns=unused_columns)

In [None]:
def fare_category(fr):
    """Map a fare to a bucket number from 1 to 4.

    The buckets have inclusive upper bounds 7.91, 14.454 and 31; any value
    that does not compare as <= 31 (including NaN) falls into bucket 4.
    """
    for category, upper_bound in enumerate((7.91, 14.454, 31), start=1):
        if fr <= upper_bound:
            return category
    return 4
# Bucket every fare with fare_category and store the result as Fare_cat.
total['Fare_cat'] =total['Fare'].apply(fare_category)

In [None]:
# Add the Age_cat column.
total = simplify_ages(total)

In [None]:
# Add the FamilySize and FamilySize_cat columns.
total = simplify_family(total)

In [None]:
# Add the numeric Title column extracted from Name.
total = format_name(total)

In [None]:
# Remove the Cabin, Ticket and Name columns.
total = drop_features(total)

In [None]:
# Summary statistics for every column, regardless of dtype.
total.describe(include='all')

In [None]:
# Column names of the engineered frame.
total.columns

In [None]:
# One-hot encode the remaining discrete features (each listed column is
# replaced by one indicator column per distinct value).
total=pd.get_dummies(total,columns=['SibSp','Parch','Title','FamilySize','Fare_cat','FamilySize_cat'])

In [None]:
# Cast Age to integer (the fractional part is discarded).
total['Age']=total['Age'].astype(int)

In [None]:
# Split the processed frame back into its train and test parts. The train rows
# were stacked first, so the boundary is the number of labelled rows - slicing
# at len(test_data) would put the cut in the wrong place.
n_train = len(target)
train_data = total[:n_train]
test_data = total[n_train:]
test_data.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(train_data, target, test_size = 0.2, random_state=42) 

In [None]:
#MODELS IMPORT

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import RandomizedSearchCV

In [None]:

# Candidate classifiers keyed by display name. Apart from max_iter on the
# logistic regression and probability=True on the SVC, all use default settings.
models = {"KNN": KNeighborsClassifier(),
          "Logistic Regression": LogisticRegression(max_iter=10000), 
          "Random Forest": RandomForestClassifier(),
          "SVC" : SVC(probability=True),
          "DecisionTreeClassifier" : DecisionTreeClassifier(),
          "AdaBoostClassifier" : AdaBoostClassifier(),
          "GradientBoostingClassifier" : GradientBoostingClassifier(),
          "GaussianNB" : GaussianNB(),
          "LinearDiscriminantAnalysis" : LinearDiscriminantAnalysis(),
          "QuadraticDiscriminantAnalysis" : QuadraticDiscriminantAnalysis()}
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training split and score it by ROC AUC on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator that exposes ``fit`` and
        at least one of ``predict_proba``, ``decision_function`` or ``predict``.
    X_train, y_train
        Features and labels used for fitting.
    X_test, y_test
        Features and true labels used for scoring.

    Returns
    -------
    dict
        ``{name: roc_auc}`` in the iteration order of ``models``.
    """
    # Random seed for reproducible results
    np.random.seed(42)
    model_scores = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        # ROC AUC ranks continuous scores, so prefer positive-class
        # probabilities or decision values; hard labels are only a fallback.
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, "decision_function"):
            y_score = model.decision_function(X_test)
        else:
            y_score = model.predict(X_test)
        # roc_auc_score takes the true labels first, then the scores.
        model_scores[name] = roc_auc_score(y_test, y_score)
    return model_scores
# Fit and score all candidate models on the hold-out split; display the scores.
model_scores = fit_and_score(models=models,
                             X_train=X_train,
                             X_test=X_test,
                             y_train=y_train,
                             y_test=y_test)
model_scores

In [None]:
# Summary statistics of the processed test frame.
test_data.describe()

In [None]:
# Fit logistic regression on all labelled rows and predict the test rows.
# NOTE: despite its name, `clf_random_forest` holds a LogisticRegression model.
clf_random_forest = LogisticRegression(max_iter=10000).fit(train_data, target)
prediction = clf_random_forest.predict(test_data)

# Two-column result (PassengerId, Survived), written without the index.
output = test_data[['PassengerId']].assign(Survived=prediction)
output.to_csv('submission_v6.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees fitted on the hold-out training split; the last line
# shows the score on the held-out rows.
# `silent` and `nthread` are legacy parameter names: `verbosity=0` is the
# current way to suppress log output and `n_jobs` the current way to set the
# number of threads.
classifier = XGBClassifier(colsample_bylevel=0.9,
                           colsample_bytree=0.8,
                           gamma=0.99,
                           max_depth=5,
                           min_child_weight=1,
                           n_estimators=10,
                           n_jobs=4,
                           random_state=2,
                           verbosity=0)
classifier.fit(X_train,y_train)
classifier.score(X_test,y_test)

In [None]:
# Row/column count of the processed test frame.
test_data.shape

In [None]:
# Predict the test rows with the fitted XGBoost model and write the
# two-column result (PassengerId, Survived) without the index.
prediction = classifier.predict(test_data)
output = test_data[['PassengerId']].assign(Survived=prediction)
output.to_csv('submission_v7.csv', index=False)