In [2]:
import pandas as pd
from skrules import SkopeRules
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
import warnings
warnings.filterwarnings("ignore")
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score



In [6]:
train_df = pd.read_csv("data/train_titanic.csv")
test_df = pd.read_csv("data/test_titanic.csv")

In [7]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [9]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [11]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [12]:
train_df.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [13]:
train_col=[ 'Pclass', 'Sex', 'SibSp','Parch', 'Embarked','Survived']
test_col=['Pclass', 'Sex', 'SibSp','Parch', 'Embarked']
feature=['Pclass', 'Sex', 'SibSp','Parch', 'Embarked']


In [14]:
train_df[train_col].dropna(axis = 0).info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 6 columns):
Pclass      889 non-null int64
Sex         889 non-null object
SibSp       889 non-null int64
Parch       889 non-null int64
Embarked    889 non-null object
Survived    889 non-null int64
dtypes: int64(4), object(2)
memory usage: 48.6+ KB


In [16]:
test_df[test_col].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 5 columns):
Pclass      418 non-null int64
Sex         418 non-null object
SibSp       418 non-null int64
Parch       418 non-null int64
Embarked    418 non-null object
dtypes: int64(3), object(2)
memory usage: 16.5+ KB


In [19]:
from sklearn import preprocessing

In [21]:
le_sex = preprocessing.LabelEncoder()
le_Embarked = preprocessing.LabelEncoder()

train_df["Sex"]=le_sex.fit_transform(train_df["Sex"].astype('str') )
test_df["Sex"]=le_sex.transform(test_df["Sex"].astype('str') )

train_df["Embarked"]=le_Embarked.fit_transform(train_df["Embarked"].astype('str') )
test_df["Embarked"]=le_Embarked.transform(test_df["Embarked"].astype('str') )

In [23]:
test_processed=test_df[test_col]
train_processed=train_df[train_col].dropna(axis = 0)

In [24]:
test_processed.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 5 columns):
Pclass      418 non-null int64
Sex         418 non-null int64
SibSp       418 non-null int64
Parch       418 non-null int64
Embarked    418 non-null int64
dtypes: int64(5)
memory usage: 16.5 KB


In [25]:
train_processed.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 6 columns):
Pclass      891 non-null int64
Sex         891 non-null int64
SibSp       891 non-null int64
Parch       891 non-null int64
Embarked    891 non-null int64
Survived    891 non-null int64
dtypes: int64(6)
memory usage: 48.7 KB


In [26]:
train_processed.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,Survived
0,3,1,1,0,2,0
1,1,0,1,0,0,1
2,3,0,0,0,2,1
3,1,0,1,0,2,1
4,3,1,0,0,2,0


In [27]:
train_processed.corr()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,Survived
Pclass,1.0,0.1319,0.083081,0.018443,0.157112,-0.338481
Sex,0.1319,1.0,-0.114631,-0.245489,0.104057,-0.543351
SibSp,0.083081,-0.114631,1.0,0.414838,0.066654,-0.035322
Parch,0.018443,-0.245489,0.414838,1.0,0.038322,0.081629
Embarked,0.157112,0.104057,0.066654,0.038322,1.0,-0.163517
Survived,-0.338481,-0.543351,-0.035322,0.081629,-0.163517,1.0


# Prediction

## Naive Bayes

In [34]:
gnb = GaussianNB()
scoring = ['precision_macro', 'recall_macro','f1_macro']
scores = cross_validate(gnb, train_processed.iloc[:,:-1],train_processed.iloc[:,-1], scoring=scoring, cv=10)

In [35]:
scores=pd.DataFrame(scores)
scores

Unnamed: 0,fit_time,score_time,test_precision_macro,test_recall_macro,test_f1_macro
0,0.004123,0.007148,0.767145,0.761039,0.763655
1,0.003342,0.004666,0.707692,0.716578,0.70972
2,0.003681,0.006152,0.753736,0.736096,0.742029
3,0.003019,0.004502,0.756352,0.769786,0.758058
4,0.002611,0.004731,0.786636,0.797059,0.790094
5,0.005507,0.005265,0.762884,0.756417,0.759199
6,0.00251,0.004619,0.797354,0.800535,0.79883
7,0.00254,0.004781,0.774351,0.771123,0.772623
8,0.002504,0.004436,0.77614,0.765508,0.769775
9,0.002382,0.004313,0.813582,0.806614,0.809626


In [36]:
scores.mean()

fit_time                0.003222
score_time              0.005061
test_precision_macro    0.769587
test_recall_macro       0.768075
test_f1_macro           0.767361
dtype: float64

## Random Forest

In [31]:
rclf=RandomForestClassifier()
scoring = ['precision_macro', 'recall_macro','f1_macro']
scores = cross_validate(rclf, train_processed.iloc[:,:-1],train_processed.iloc[:,-1], scoring=scoring, cv=10)

In [32]:
scores=pd.DataFrame(scores)
scores

Unnamed: 0,fit_time,score_time,test_precision_macro,test_recall_macro,test_f1_macro
0,0.181026,0.012303,0.794471,0.754545,0.76443
1,0.135115,0.012359,0.78405,0.754278,0.763066
2,0.135325,0.01266,0.751923,0.71016,0.718678
3,0.137818,0.012297,0.832285,0.839037,0.835185
4,0.145037,0.013581,0.808962,0.815241,0.81164
5,0.146854,0.013056,0.794496,0.768984,0.777253
6,0.187696,0.017932,0.78405,0.754278,0.763066
7,0.157405,0.013849,0.740777,0.695455,0.703204
8,0.149996,0.013786,0.827839,0.787166,0.79887
9,0.149214,0.013586,0.802569,0.767989,0.777253


In [33]:
scores.mean()

fit_time                0.152549
score_time              0.013541
test_precision_macro    0.792142
test_recall_macro       0.764713
test_f1_macro           0.771264
dtype: float64

## Skope Rule Classifier


In [37]:
clf = SkopeRules(max_depth_duplication=None,
                 n_estimators=30,
                 precision_min=0.2,
                 recall_min=0.01,
                 feature_names=feature)

In [38]:
scoring = ['precision_macro', 'recall_macro','f1_macro']
scores = cross_validate(clf, train_processed.iloc[:,:-1],train_processed.iloc[:,-1], scoring=scoring, cv=10)

In [39]:
scores=pd.DataFrame(scores)
scores

Unnamed: 0,fit_time,score_time,test_precision_macro,test_recall_macro,test_f1_macro
0,1.967947,0.082286,0.714978,0.700603,0.675817
1,1.898602,0.080799,0.617886,0.668116,0.571658
2,1.850104,0.123293,0.727463,0.732086,0.696476
3,1.731672,0.105028,0.712529,0.683333,0.67348
4,1.857456,0.127597,0.732442,0.730509,0.707828
5,1.746851,0.13167,0.74484,0.760143,0.717819
6,1.912437,0.090499,0.744582,0.743077,0.730303
7,1.866643,0.073285,0.738718,0.751894,0.717819
8,1.780247,0.086948,0.789888,0.822316,0.771795
9,1.822203,0.099183,0.718029,0.725108,0.685035


In [40]:
scores.mean()

fit_time                1.843416
score_time              0.100059
test_precision_macro    0.724136
test_recall_macro       0.731719
test_f1_macro           0.694803
dtype: float64

# Submission

In [41]:
rclf=RandomForestClassifier()
rclf.fit(train_processed.iloc[:,:-1],train_processed.iloc[:,-1])
prediction=rclf.predict(test_processed)


In [43]:
radnomforest_prediction=pd.DataFrame()
radnomforest_prediction["PassengerId"]=test_df["PassengerId"]
radnomforest_prediction["Survived"]=prediction

In [44]:
radnomforest_prediction.head()


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [45]:
radnomforest_prediction.to_csv("data/rf_prediction.csv",index=False)

In [46]:
gnb = GaussianNB()
gnb.fit(train_processed.iloc[:,:-1],train_processed.iloc[:,-1])
nb_prediction=gnb.predict(test_processed)

naive_bayes_prediction=pd.DataFrame()
naive_bayes_prediction["PassengerId"]=test_df["PassengerId"]
naive_bayes_prediction["Survived"]=nb_prediction

In [47]:
naive_bayes_prediction.head()


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [48]:
naive_bayes_prediction.to_csv("data/naive_bayes_prediction.csv",index=False)

In [50]:
clf = SkopeRules(max_depth_duplication=None,
                 n_estimators=30,
                 precision_min=0.2,
                 recall_min=0.01,
                 feature_names=feature)
clf.fit(train_processed.iloc[:,:-1],train_processed.iloc[:,-1])
sr_prediction=clf.predict(test_processed)


skope_rule_prediction=pd.DataFrame()
skope_rule_prediction["PassengerId"]=test_df["PassengerId"]
skope_rule_prediction["Survived"]=sr_prediction

In [51]:
skope_rule_prediction.to_csv("data/skope_rule_prediction.csv",index=False)
