In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.metrics import accuracy_score


In [2]:
titanic_data = pd.read_csv('train.csv')
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
class Features_dropper:

    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        X.drop(['Name','PassengerId','Ticket','Cabin'],axis=1,inplace=True)
        return X

In [4]:
class AgeImputer:
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        X.replace(np.nan,int(X['Age'].mean()),inplace=True)
        X.dropna(inplace=True)
        return X

In [5]:
class LabelEncoding:
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        X['Sex'] = X['Sex'].astype(str)
        X['Embarked'] = X['Embarked'].astype(str)
        le = LabelEncoder()
        X['Sex'] = le.fit_transform(X['Sex'])
        X['Embarked'] = le.fit_transform(X['Embarked'])

        return X

In [6]:
class OneHotEncoding:
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        ohe = OneHotEncoder()
        categorical_columns = ['Sex','Pclass','Embarked']
        X_categorical_encoded = ohe.fit_transform(X[categorical_columns])
        encoded_feature_names = ohe.get_feature_names_out(categorical_columns)
        
        X_categorical_encoded_df = pd.DataFrame(X_categorical_encoded.toarray(),columns=encoded_feature_names)
        X = pd.concat([X[['Age','SibSp','Parch','Fare']],X_categorical_encoded_df],axis=1)
        return X


In [7]:
pipe = Pipeline([('Features dropper',Features_dropper()),('AgeImputer',AgeImputer()),('LabelEncoder',LabelEncoding()),('OneHotEncoder',OneHotEncoding())])

In [10]:
data = pd.read_csv('train.csv')

X = data.drop('Survived',axis=1)
Y = data[['Survived']]

X = pipe.fit_transform(X)

print(X.head())
Y.head()


Unnamed: 0,Age,SibSp,Parch,Fare,Sex_0,Sex_1,Pclass_1,Pclass_2,Pclass_3,Embarked_0,Embarked_1,Embarked_2,Embarked_3
0,22.0,1,0,7.25,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,26.0,0,0,7.925,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,35.0,1,0,53.1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,35.0,0,0,8.05,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [11]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=31)

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_0,Sex_1,Pclass_1,Pclass_2,Pclass_3,Embarked_0,Embarked_1,Embarked_2
0,34.5,0,0,7.8292,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,47.0,1,0,7.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,62.0,0,0,9.6875,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,27.0,0,0,8.6625,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,22.0,1,1,12.2875,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [None]:
model = DecisionTreeClassifier(max_depth=6)
model.fit(x_train,y_train)

yhat = model.predict(x_test)

In [None]:
accuracy_score(y_test,yhat)

In [None]:
test_data = pd.read_csv('test.csv')
prep_test_data = test_data.copy()
prep_test_data = pipe.fit_transform(prep_test_data)

print(prep_test_data.shape)
prep_test_data.head()

In [None]:
Embarked_3 = np.zeros((418,1),float)
prep_test_data['Embarked_3'] = Embarked_3
prep_test_data.head()

In [None]:
predictions = model.predict(prep_test_data)
prediction_DF = pd.concat((test_data['PassengerId'],pd.DataFrame(predictions)),axis=1)
prediction_DF.columns = ['PassengerId','Survived']
prediction_DF.head()

In [None]:
prediction_DF.to_csv('predictions.csv',index=False)