In [2]:
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder


# Load the Kaggle Titanic train/test files and stack them so that the feature
# engineering below is applied identically to both. Rows coming from the test
# file have no 'Survived' value, so that column is NaN for them.
df_train = pd.read_csv('/kaggle/input/titanic/train.csv')
df_test = pd.read_csv('/kaggle/input/titanic/test.csv')

# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it
# each file keeps its own 0-based row labels, so labels repeat in df and any
# label-based lookup or alignment on the combined frame becomes ambiguous.
df = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Test-set ids, captured in file order for the submission written at the end.
PassengerId = df_test['PassengerId']

df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [3]:
def _ticket_prefix(ticket):
    """Return the cleaned leading token of a ticket, or 'X' if it starts with a digit.

    Dots and slashes are removed before the first space-separated token is
    taken, e.g. 'A/5 21171' -> 'A5' and 'STON/O2. 3101282' -> 'STONO2'.
    """
    if ticket.split(' ')[0][0].isdigit():
        return 'X'
    return ticket.replace('.', '').replace('/', '').split(' ')[0]


# Family size: parents/children plus siblings/spouses columns added together.
df['Family'] = df['Parch'] + df['SibSp']

# Keep only the first character of the cabin string; missing cabins become 'NoCabin'.
df['Cabin'] = df['Cabin'].str[0].fillna('NoCabin')

# Integer-encode the remaining string columns (a fresh fit per column).
label_encoder = LabelEncoder()
non_numeric_columns = ['Cabin', 'Sex']  # Add other non-numeric columns if needed
for column in non_numeric_columns:
    df[column] = label_encoder.fit_transform(df[column])

# Collapse each ticket to its prefix, then map the prefixes to category codes.
df['Ticket'] = df['Ticket'].apply(_ticket_prefix)
df['Ticket'] = df['Ticket'].astype('category').cat.codes

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# df['Embarked']: that form mutates a temporary Series, triggers a chained
# assignment warning on recent pandas, and leaves df unchanged when
# copy-on-write is active.
df['Embarked'] = df['Embarked'].fillna('S')

# One-hot encode the port; drop_first removes the first level, and the 'Q' and
# 'S' indicator columns are stored as 0/1 integers.
emb = pd.get_dummies(df['Embarked'], drop_first=True)
df = pd.concat([df, emb], axis=1)
df['Q'] = np.where(df['Q'], 1, 0)
df['S'] = np.where(df['S'], 1, 0)

# The raw columns are now represented by the engineered features above.
df = df.drop(columns=['SibSp', 'Parch', 'Name', 'Embarked'])
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Ticket,Fare,Cabin,Family,Q,S
0,1,0.0,3,1,22.0,2,7.2500,7,1,0,1
1,2,1.0,1,0,38.0,14,71.2833,2,1,0,0
2,3,1.0,3,0,26.0,31,7.9250,7,0,0,1
3,4,1.0,1,0,35.0,36,53.1000,2,1,0,1
4,5,0.0,3,1,35.0,36,8.0500,7,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,1,,2,8.0500,7,0,0,1
414,1306,,1,0,39.0,14,108.9000,2,0,0,0
415,1307,,3,1,38.5,28,7.2500,7,0,0,1
416,1308,,3,1,,36,8.0500,7,0,0,1


In [4]:
# Count missing values per column to see what still needs imputing.
df.isna().sum()

PassengerId      0
Survived       418
Pclass           0
Sex              0
Age            263
Ticket           0
Fare             1
Cabin            0
Family           0
Q                0
S                0
dtype: int64

In [5]:
# Fill the missing fare with the mean fare of the combined frame.
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

# Split on Age availability: rows with a known age are used to fit the age
# model, rows without one receive its predictions. Explicit copies make both
# frames independent of df, so writing the predicted ages into dataAgeNull
# later is unambiguous and does not raise SettingWithCopyWarning.
dataAgeNull = df[df["Age"].isnull()].copy()
dataAgeNotNull = df[df["Age"].notnull()].copy()


In [6]:
# Re-check the per-column missing counts after the Fare fill.
df.isna().sum()

PassengerId      0
Survived       418
Pclass           0
Sex              0
Age            263
Ticket           0
Fare             0
Cabin            0
Family           0
Q                0
S                0
dtype: int64

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesRegressor
import pandas as pd

# Hyper-parameter grid for the age-imputation regressor.
# 4 * 4 * 3 * 3 * 4 = 576 combinations, each fitted once per CV fold below,
# so this cell is expensive to run.
param_grid = {
    'n_estimators': [100, 500, 1000, 2000],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2', 0.5]  # mixes None, named strategies and a float fraction
}

# Despite the "rf" prefix this is an extra-trees regressor; the name is kept
# as-is because it is a notebook-level variable.
rfModel_age = ExtraTreesRegressor(random_state=42)

# Predictors used to estimate Age.
ageColumns = ['Fare', 'Pclass', 'Sex', 'Family', 'Cabin', 'Ticket', 'Q', 'S']

# 5-fold grid search scored by negative MSE (values closer to 0 are better),
# using all available cores.
grid_search = GridSearchCV(estimator=rfModel_age, param_grid=param_grid,
                           cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit only on the rows whose Age is known.
grid_search.fit(dataAgeNotNull[ageColumns], dataAgeNotNull["Age"])

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Negative Mean Squared Error:", best_score)

best_rfModel_age = grid_search.best_estimator_

# Predict age for the rows where it is missing.
ageNullValues = best_rfModel_age.predict(X=dataAgeNull[ageColumns])

# Build a new frame carrying the predicted ages instead of writing through
# .loc into a frame that was filtered out of df: the in-place write raises
# SettingWithCopyWarning, while assign() has no such ambiguity.
dataAgeNull = dataAgeNull.assign(Age=ageNullValues)

# Recombine (known-age rows first, imputed rows after) with a fresh 0..n-1 index.
df = pd.concat([dataAgeNotNull, dataAgeNull], axis=0, ignore_index=True)


In [None]:
# Rows that carry a Survived label form the training set; rows without one
# form the test set. Both are ordered by PassengerId, and the all-NaN
# Survived column is removed from the test set.
df_train = df.loc[df['Survived'].notnull()].sort_values(by="PassengerId")
df_test = (
    df.loc[df['Survived'].isnull()]
    .sort_values(by='PassengerId')
    .drop(columns=['Survived'])
)

In [None]:
# Feature matrix and target. X keeps every non-target column, including
# PassengerId, so that frames with df_test's columns can be passed to the
# fitted model unchanged.
X = df_train.drop('Survived', axis=1)
y = df_train['Survived']

# Hold out 20% of the labelled rows for a quick accuracy estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# PassengerId is a row identifier, not a passenger attribute, so letting the
# trees split on it only adds noise. It is dropped inside the pipeline rather
# than from X, which means callers keep passing the same columns as before and
# the drop is applied consistently at fit and predict time.
# NOTE: extra_trees_clf is therefore a Pipeline; the fitted forest itself is
# its last step (extra_trees_clf[-1]).
extra_trees_clf = make_pipeline(
    make_column_transformer(('drop', ['PassengerId']), remainder='passthrough'),
    ExtraTreesClassifier(n_estimators=2000, random_state=42),
)
extra_trees_clf.fit(X_train, y_train)

# Score on the held-out rows.
y_pred = extra_trees_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


In [None]:
# Build the submission frame. The training labels are floats (0.0 / 1.0), so
# the predictions are cast to int for the submission file.
# The ids are taken from df_test itself rather than from the PassengerId
# series captured when the CSV was first read: df_test has been rebuilt and
# re-sorted since then, so only its own column is guaranteed to line up
# row-for-row with the predictions.
output = pd.DataFrame({
    'PassengerId': df_test['PassengerId'].to_numpy(),
    'Survived': extra_trees_clf.predict(df_test).astype(int),
})

output.to_csv('submission.csv', index=False)
output

In [None]:
from IPython.display import FileLink

# Render a clickable link to the submission file written above.
file_link = FileLink(path='submission.csv')
file_link