In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Read both CSV files from the current working directory (train first, then test).
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv"))

# Print the column summary (dtypes, non-null counts) of the training frame.
train_data.info()

In [None]:
# Stack train and test rows into one frame with a fresh 0..n-1 index so the
# engineered columns below are computed the same way for both.
all_data = pd.concat([train_data, test_data], ignore_index=True)


def _extract_title(name):
    """Return the stripped text between the first comma and the next period of `name`."""
    after_comma = name.split(',')[1]
    return after_comma.split('.')[0].strip()


all_data['Title'] = all_data['Name'].apply(_extract_title)

# Raw title -> coarse group. Titles that are not listed here map to NaN.
_TITLE_GROUPS = (
    ('Officer', ['Capt', 'Col', 'Major', 'Dr', 'Rev']),
    ('Royalty', ['Don', 'Sir', 'the Countess', 'Dona', 'Lady']),
    ('Mrs', ['Mme', 'Ms', 'Mrs']),
    ('Miss', ['Mlle', 'Miss']),
    ('Mr', ['Mr']),
    ('Master', ['Master', 'Jonkheer']),
)
Title_Dict = {raw_title: group for group, raw_titles in _TITLE_GROUPS for raw_title in raw_titles}

all_data['Title'] = all_data['Title'].map(Title_Dict)

# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger themself).
all_data['FamilySize'] = all_data['SibSp'] + all_data['Parch'] + 1
#sns.barplot(x='FamilySize',y='Survived',data=all_data)


def Fam_label(x):
    """Bucket a family size into an ordinal label.

    Returns 2 for sizes 2 through 4, 1 for any other size up to 7
    (which includes size 1), and 0 for everything larger.
    """
    if 2 <= x <= 4:
        return 2
    if x <= 7:
        return 1
    return 0

# Bucket every passenger's FamilySize with Fam_label.
all_data['FamilyLabel'] = all_data['FamilySize'].map(Fam_label)

# Missing cabins become 'Unknown', so Deck (first character of Cabin) is 'U' for them.
all_data['Cabin'] = all_data['Cabin'].fillna('Unknown')
all_data['Deck'] = all_data['Cabin'].str[0]

# TicketGroup = number of rows in all_data that share the passenger's Ticket value.
ticket_counts = all_data['Ticket'].value_counts()
print(ticket_counts)
Ticket_Count = dict(ticket_counts)
all_data['TicketGroup'] = [Ticket_Count[ticket] for ticket in all_data['Ticket']]

def Ticket_Label(x):
    """Bucket a ticket-group size into an ordinal label.

    Returns 2 for sizes 2 through 4, 1 for any other size up to 8
    (which includes size 1), and 0 for everything larger.

    The range test is written as a chained comparison. The earlier form
    `x>=2&x<=4` was parsed as `x >= (2 & x) <= 4` because `&` binds tighter
    than the comparison operators, which made every size >= 1 return 2.
    """
    if 2 <= x <= 4:
        return 2
    elif x <= 8:
        return 1
    else:
        return 0

# Bucket every passenger's TicketGroup with Ticket_Label.
all_data['TicketLabel'] = all_data['TicketGroup'].map(Ticket_Label)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Impute missing Age values with a random forest trained on the rows where Age is known.
# Predictors are Pclass plus the one-hot encoded Sex and Title columns.
# dtype=float keeps the dummy columns numeric: pandas >= 2.0 emits bool dummies by
# default, and a frame mixing bool and float columns turns into an object array.
age_df = pd.get_dummies(all_data[['Age', 'Pclass', 'Sex', 'Title']], dtype=float)
age_missing = age_df['Age'].isnull()
known_age = age_df.loc[~age_missing].values
unknown_age = age_df.loc[age_missing].values

# Column 0 is Age (the target); the remaining columns are the predictors.
y = known_age[:, 0]
X = known_age[:, 1:]

rfr = RandomForestRegressor(random_state=0, n_estimators=100, n_jobs=-1)
rfr.fit(X, y)

# Only predict when something is missing: after the first run every Age is filled,
# and predicting on a zero-row array would raise, breaking a re-run of this cell.
if len(unknown_age) > 0:
    predictedAges = rfr.predict(unknown_age[:, 1:])
    print(predictedAges)
    all_data.loc[age_missing, 'Age'] = predictedAges



In [None]:

# Missing Embarked values are filled with the constant 'C'.
all_data['Embarked'] = all_data['Embarked'].fillna('C')

# Missing Fare values take the median Fare of rows with Embarked == 'S' and Pclass == 3.
embarked_s_pclass_3 = (all_data['Embarked'] == 'S') & (all_data['Pclass'] == 3)
fare = all_data[embarked_s_pclass_3]['Fare'].median()
all_data['Fare'] = all_data['Fare'].fillna(fare)

# Surname = stripped text before the first comma of Name;
# FamilyGroup = number of rows in all_data sharing that surname.
all_data['Surname'] = [full_name.split(',')[0].strip() for full_name in all_data['Name']]
Surname_Count = dict(all_data['Surname'].value_counts())
all_data['FamilyGroup'] = [Surname_Count[surname] for surname in all_data['Surname']]

# Within surname groups of two or more rows, separate (age <= 12 or female)
# from (age > 12 and male).
in_family = all_data['FamilyGroup'] >= 2
is_female = all_data['Sex'] == 'female'
is_male = all_data['Sex'] == 'male'
Female_Child_Group = all_data.loc[in_family & ((all_data['Age'] <= 12) | is_female)]
Male_Adult_Group = all_data.loc[in_family & (all_data['Age'] > 12) & is_male]

In [None]:
# Mean of Survived per surname among the rows of Female_Child_Group.
# The result gets its own name so Female_Child_Group stays a DataFrame and this
# cell can be re-run without failing.
Female_Child_List = Female_Child_Group.groupby('Surname')['Survived'].mean()
# Surnames whose mean is exactly 0 (presumably: none of the labelled members survived).
Dead_List = set(Female_Child_List[Female_Child_List == 0].index)
print(Dead_List)

# Same aggregation for Male_Adult_Group; keep surnames whose mean is exactly 1.
Male_Adult_List = Male_Adult_Group.groupby('Surname')['Survived'].mean()
Survived_List = set(Male_Adult_List[Male_Adult_List == 1].index)
print(Survived_List)

In [None]:
# Split the combined frame back apart: rows with a Survived value are training rows,
# rows without one are test rows. Explicit copies make the assignments below write
# to independent frames instead of triggering SettingWithCopyWarning.
survived_known = all_data['Survived'].notnull()
train_data = all_data.loc[survived_known].copy()
test_data = all_data.loc[~survived_known].copy()

# The masks depend only on Surname, so each is computed once.
in_dead_family = test_data['Surname'].isin(Dead_List)
in_survived_family = test_data['Surname'].isin(Survived_List)

# Test rows whose surname is in Dead_List are overwritten with a male / 60 / 'Mr' profile.
test_data.loc[in_dead_family, 'Sex'] = 'male'
test_data.loc[in_dead_family, 'Age'] = 60
test_data.loc[in_dead_family, 'Title'] = 'Mr'

# Test rows whose surname is in Survived_List are overwritten with a female / 5 / 'Miss'
# profile. This runs second, so it wins for a surname that appears in both lists.
test_data.loc[in_survived_family, 'Sex'] = 'female'
test_data.loc[in_survived_family, 'Age'] = 5
test_data.loc[in_survived_family, 'Title'] = 'Miss'

In [None]:
# Recombine the (possibly overridden) train and test rows and keep only the model columns.
all_data = pd.concat([train_data, test_data])
model_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked',
                 'Title', 'FamilyLabel', 'Deck', 'TicketGroup']
# dtype=float keeps the one-hot columns numeric. pandas >= 2.0 emits bool dummies by
# default; mixed with float columns that makes `.values` an object array, which
# scikit-learn rejects as an unknown label type for y.
all_data = pd.get_dummies(all_data[model_columns], dtype=float)

# Rows with a Survived value form the training set; the rest are to be predicted.
has_label = all_data['Survived'].notnull()
train = all_data[has_label]
test = all_data[~has_label].drop('Survived', axis=1)

# Select target and features by column name instead of by position.
X = train.drop('Survived', axis=1).values
y = train['Survived'].values

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest


# Two-step pipeline: keep the 20 top-scoring features, then fit a random forest.
feature_selector = SelectKBest(k=20)
forest = RandomForestClassifier(random_state=10, max_features='sqrt')
pipe = Pipeline(steps=[('select', feature_selector), ('classify', forest)])

# Exhaustive search over the number of trees and the tree depth,
# scored by ROC AUC with 10-fold cross-validation.
param_test = {
    'classify__n_estimators': list(range(20, 50, 2)),
    'classify__max_depth': list(range(3, 60, 3)),
}
gsearch = GridSearchCV(estimator=pipe, param_grid=param_test, scoring='roc_auc', cv=10)
gsearch.fit(X, y)

print(gsearch.best_params_, gsearch.best_score_)

In [None]:
from sklearn.pipeline import make_pipeline
# Final model with fixed hyper-parameters
# (n_estimators=32 and max_depth=6 presumably come from the grid search - confirm).
select = SelectKBest(k=20)
forest_settings = dict(
    random_state=10,
    warm_start=True,
    n_estimators=32,
    max_depth=6,
    max_features='sqrt',
)
clf = RandomForestClassifier(**forest_settings)
pipeline = make_pipeline(select, clf)
pipeline.fit(X, y)


In [None]:
from sklearn import model_selection, metrics
# 10-fold cross-validation of the pipeline with the estimator's default scorer.
cv_score = model_selection.cross_val_score(pipeline, X, y, cv=10)
cv_mean, cv_std = np.mean(cv_score), np.std(cv_score)
print("CV Score : Mean - %.7g | Std - %.7g " % (cv_mean, cv_std))

# Predict the unlabelled rows with the already-fitted pipeline and write the result,
# pairing each prediction with the PassengerId of the corresponding test row.
predictions = pipeline.predict(test)
submission = pd.DataFrame({
    "PassengerId": test_data['PassengerId'],
    "Survived": predictions.astype(np.int32),
})
submission.to_csv("submission.csv", index=False)