In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [3]:
train_df=pd.read_csv("train.csv")
test_df=pd.read_csv("test.csv")
gender_df=pd.read_csv("gender_submission.csv")

In [4]:
y_train=train_df.pop('Survived')

In [5]:
train_df.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,38.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
train_df['Age'].fillna(train_df.Age.mean(),inplace=True)

In [7]:
numerical_cols=list(train_df.dtypes[train_df.dtypes != 'object'].index)
categorical_cols=list(train_df.dtypes[train_df.dtypes == 'object'].index)

In [8]:
print(numerical_cols,categorical_cols,sep='\n')

['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [9]:
model1=RandomForestRegressor(n_estimators=100,oob_score=True,random_state=42)

In [10]:
model1.fit(train_df[numerical_cols],y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=True, random_state=42, verbose=0,
                      warm_start=False)

In [11]:
model1.oob_score_

0.1361695005913669

In [12]:
y_train_oob=model1.oob_prediction_

In [13]:
print("C-Stat",roc_auc_score(y_train,y_train_oob))

C-Stat 0.7399551550399983


In [14]:
def binarize_gender(val):
    if val.lower()=='male':
        return 1
    elif val.lower()=='female':
        return 0
    else:
        return -1

In [15]:
train_df["Sex"]=train_df["Sex"].apply(binarize_gender)

In [16]:
train_df['Embarked'].fillna('Null',inplace=True)

In [17]:
def num_emb(val):
    if val=='S':
        return 1
    elif val=='C':
        return 2
    elif val=='Q':
        return 3
    else:
        return 0

In [40]:
train_df['Embarked']=train_df['Embarked'].apply(num_emb)
numerical_cols=list(train_df.dtypes[train_df.dtypes != 'object'].index)
categorical_cols=list(train_df.dtypes[train_df.dtypes == 'object'].index)

In [41]:
train_df.describe()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,2.308642,0.647587,29.699118,0.523008,0.381594,32.204208,0.0
std,257.353842,0.836071,0.47799,13.002015,1.102743,0.806057,49.693429,0.0
min,1.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,223.5,2.0,0.0,22.0,0.0,0.0,7.9104,0.0
50%,446.0,3.0,1.0,29.699118,0.0,0.0,14.4542,0.0
75%,668.5,3.0,1.0,35.0,1.0,0.0,31.0,0.0
max,891.0,3.0,1.0,80.0,8.0,6.0,512.3292,0.0


In [42]:
cdf=train_df[numerical_cols[1:]]

In [43]:
cdf.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,0.647587,29.699118,0.523008,0.381594,32.204208,0.0
std,0.836071,0.47799,13.002015,1.102743,0.806057,49.693429,0.0
min,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,2.0,0.0,22.0,0.0,0.0,7.9104,0.0
50%,3.0,1.0,29.699118,0.0,0.0,14.4542,0.0
75%,3.0,1.0,35.0,1.0,0.0,31.0,0.0
max,3.0,1.0,80.0,8.0,6.0,512.3292,0.0


In [44]:
model2=RandomForestRegressor(n_estimators=100,oob_score=True,random_state=42)
model2.fit(cdf,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=True, random_state=42, verbose=0,
                      warm_start=False)

In [45]:
model2.oob_score_

0.4008242027629747

In [46]:
y_train_oob2=model2.oob_prediction_

In [47]:
print("C-Stat",roc_auc_score(y_train,y_train_oob2))

C-Stat 0.8560700476144825


In [48]:
test_df.describe()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,418.0,418.0,418.0,418.0,417.0,418.0
mean,1100.5,2.26555,0.636364,30.27259,0.447368,0.392344,35.627188,1.464115
std,120.810458,0.841838,0.481622,12.634534,0.89676,0.981429,55.907576,0.685516
min,892.0,1.0,0.0,0.17,0.0,0.0,0.0,1.0
25%,996.25,1.0,0.0,23.0,0.0,0.0,7.8958,1.0
50%,1100.5,3.0,1.0,30.27259,0.0,0.0,14.4542,1.0
75%,1204.75,3.0,1.0,35.75,1.0,0.0,31.5,2.0
max,1309.0,3.0,1.0,76.0,8.0,9.0,512.3292,3.0


In [49]:
test_df['Age'].fillna(test_df.Age.mean(),inplace=True)
test_df["Sex"]=test_df["Sex"].apply(binarize_gender)
test_df['Embarked'].fillna('Null',inplace=True)
test_df['Embarked']=test_df['Embarked'].apply(num_emb)


AttributeError: 'int' object has no attribute 'lower'

In [50]:
test_df.describe()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,418.0,418.0,418.0,418.0,417.0,418.0
mean,1100.5,2.26555,0.636364,30.27259,0.447368,0.392344,35.627188,1.464115
std,120.810458,0.841838,0.481622,12.634534,0.89676,0.981429,55.907576,0.685516
min,892.0,1.0,0.0,0.17,0.0,0.0,0.0,1.0
25%,996.25,1.0,0.0,23.0,0.0,0.0,7.8958,1.0
50%,1100.5,3.0,1.0,30.27259,0.0,0.0,14.4542,1.0
75%,1204.75,3.0,1.0,35.75,1.0,0.0,31.5,2.0
max,1309.0,3.0,1.0,76.0,8.0,9.0,512.3292,3.0


In [51]:
numerical_cols=list(train_df.dtypes[train_df.dtypes != 'object'].index)
categorical_cols=list(train_df.dtypes[train_df.dtypes == 'object'].index)

In [52]:
numerical_cols

['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [53]:
input_df=test_df[numerical_cols[1:]]

In [54]:
input_df.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,418.0,418.0,418.0,417.0,418.0
mean,2.26555,0.636364,30.27259,0.447368,0.392344,35.627188,1.464115
std,0.841838,0.481622,12.634534,0.89676,0.981429,55.907576,0.685516
min,1.0,0.0,0.17,0.0,0.0,0.0,1.0
25%,1.0,0.0,23.0,0.0,0.0,7.8958,1.0
50%,3.0,1.0,30.27259,0.0,0.0,14.4542,1.0
75%,3.0,1.0,35.75,1.0,0.0,31.5,2.0
max,3.0,1.0,76.0,8.0,9.0,512.3292,3.0


In [55]:
input_df["Fare"].fillna(input_df.Fare.mean(),inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [56]:
input_df.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,2.26555,0.636364,30.27259,0.447368,0.392344,35.627188,1.464115
std,0.841838,0.481622,12.634534,0.89676,0.981429,55.8405,0.685516
min,1.0,0.0,0.17,0.0,0.0,0.0,1.0
25%,1.0,0.0,23.0,0.0,0.0,7.8958,1.0
50%,3.0,1.0,30.27259,0.0,0.0,14.4542,1.0
75%,3.0,1.0,35.75,1.0,0.0,31.5,2.0
max,3.0,1.0,76.0,8.0,9.0,512.3292,3.0


In [57]:
tprediction=model2.predict(input_df)

In [58]:
input_df["Probability"]=tprediction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [59]:
input_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Probability
0,3,1,34.5,0,0,7.8292,3,0.14
1,3,0,47.0,1,0,7.0,1,0.15
2,2,1,62.0,0,0,9.6875,3,0.66
3,3,1,27.0,0,0,8.6625,1,0.71
4,3,0,22.0,1,1,12.2875,1,0.45


In [60]:
input_df.to_csv("predicted_probs.csv",index=False)

In [61]:
test_df["Fare"].fillna(test_df.Fare.mean(),inplace=True)

In [62]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,3
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,1
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,3
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,1


In [63]:
test_df.describe()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,1100.5,2.26555,0.636364,30.27259,0.447368,0.392344,35.627188,1.464115
std,120.810458,0.841838,0.481622,12.634534,0.89676,0.981429,55.8405,0.685516
min,892.0,1.0,0.0,0.17,0.0,0.0,0.0,1.0
25%,996.25,1.0,0.0,23.0,0.0,0.0,7.8958,1.0
50%,1100.5,3.0,1.0,30.27259,0.0,0.0,14.4542,1.0
75%,1204.75,3.0,1.0,35.75,1.0,0.0,31.5,2.0
max,1309.0,3.0,1.0,76.0,8.0,9.0,512.3292,3.0


In [64]:
test_df["Probability"]=tprediction

In [65]:
test_df["Survived"]=0

In [66]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Probability,Survived
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,3,0.14,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,1,0.15,0
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,3,0.66,0
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,1,0.71,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,1,0.45,0


In [67]:
def surv(val):
    if val<0.45:
        return 0
    else:
        return 1

In [68]:
test_df["Survived"]=test_df["Probability"].apply(surv)

In [69]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Probability,Survived
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,3,0.14,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,1,0.15,0
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,3,0.66,1
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,1,0.71,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,1,0.45,1


In [70]:
test_df.to_csv("prediction001.csv",index=False)

In [71]:
test_df[["PassengerId","Survived"]].to_csv("submission001.csv",index=False)

In [73]:
rf_model=RandomForestClassifier(n_estimators=1000,max_depth=7)
rf_model.fit(cdf,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [79]:
pred3=rf_model.predict(test_df[numerical_cols[1:]])

In [81]:
pred3

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [82]:
test_df["Survived"]=pred3

In [84]:
test_df.tail()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Probability,Survived
413,1305,3,"Spector, Mr. Woolf",1,30.27259,0,0,A.5. 3236,8.05,,1,0.01,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",0,39.0,0,0,PC 17758,108.9,C105,2,1.0,1
415,1307,3,"Saether, Mr. Simon Sivertsen",1,38.5,0,0,SOTON/O.Q. 3101262,7.25,,1,0.0,0
416,1308,3,"Ware, Mr. Frederick",1,30.27259,0,0,359309,8.05,,1,0.01,0
417,1309,3,"Peter, Master. Michael J",1,30.27259,1,1,2668,22.3583,,2,0.14,0


In [85]:
test_df[["PassengerId","Survived"]].to_csv("submission002.csv",index=False)

In [86]:
test_df.to_csv("prediction002.csv",index=False)