### Import Library and extract Data 

In [50]:
import pandas as pd

In [80]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test['Survived'] = 0

### Data Preparation Function

In [None]:
def Preprocess(df_train,df_test):

    df = pd.concat([df_train,df_test], axis=0)
    df = df.drop(columns=['PassengerId','Name','Ticket'], axis=1)

# Imputing Missing Values

    df['Age'] = df["Age"].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Cabin'] = df['Cabin'].fillna('X000')
    df['Embarked'] = df['Embarked'].fillna('X')

# Extract Cabin columns and create 2 columns

    df['cabin_letter'] = df['Cabin'].str.extract(r'([a-zA-z+])', expand=False)
    df['cabin_number'] = df['Cabin'].str.extract(r'(\d+)', expand=False)
    df['cabin_number'] = df['cabin_number'].fillna(0)
    df['cabin_number'] = pd.to_numeric(df['cabin_number'])
    df.drop(columns=['Cabin'], axis=1, inplace=True)

# One-hot encoding for Categorical columns

    df = pd.get_dummies(df,columns=['Sex','Embarked','cabin_letter'])
    df = df.drop(columns=['cabin_letter_X','Embarked_X'], axis=1)

# Create The New Columns 

    df['Fare_bin_Pclass'] = df['Fare']//df['Pclass']
    df['Pclass_bin_sex'] = df['Pclass'] - df['Sex_female']

# Spliting Data into Training and testing sets

    df_train,df_test = df[:len(df_train)],df[len(df_train):]
    df_test.drop(columns=['Survived'], axis=1, inplace=True)

    return df_train, df_test

In [106]:
train, test = Preprocess(df_train, df_test)
train.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.drop(columns=['Survived'], axis=1, inplace=True)


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,cabin_number,Sex_female,Sex_male,Embarked_C,...,cabin_letter_A,cabin_letter_B,cabin_letter_C,cabin_letter_D,cabin_letter_E,cabin_letter_F,cabin_letter_G,cabin_letter_T,Fare_bin_Pclass,Pclass_bin_sex
0,0,3,22.0,1,0,7.25,0,False,True,False,...,False,False,False,False,False,False,False,False,2.0,3
1,1,1,38.0,1,0,71.2833,85,True,False,True,...,False,False,True,False,False,False,False,False,71.0,0
2,1,3,26.0,0,0,7.925,0,True,False,False,...,False,False,False,False,False,False,False,False,2.0,2
3,1,1,35.0,1,0,53.1,123,True,False,False,...,False,False,True,False,False,False,False,False,53.0,0
4,0,3,35.0,0,0,8.05,0,False,True,False,...,False,False,False,False,False,False,False,False,2.0,3


In [100]:
train.corr()['Survived'].sort_values(ascending=False)

Survived           1.000000
Sex_female         0.543351
Fare_bin_Pclass    0.267823
Fare               0.257307
cabin_number       0.229756
cabin_letter_B     0.175095
Embarked_C         0.168240
cabin_letter_D     0.150716
cabin_letter_E     0.145321
cabin_letter_C     0.114652
Parch              0.081629
cabin_letter_F     0.057935
cabin_letter_A     0.022287
cabin_letter_G     0.016040
Embarked_Q         0.003650
cabin_letter_T    -0.026456
SibSp             -0.035322
Age               -0.070323
Embarked_S        -0.155660
Pclass            -0.338481
Pclass_bin_sex    -0.533994
Sex_male          -0.543351
Name: Survived, dtype: float64

### Model Training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [107]:
X = train.drop(columns='Survived')
y = train['Survived']
X_train,X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)
# Y_train = np.reshape(Y_train(-1,1))

In [108]:
model_1 = LogisticRegression()
model_1.fit(X_train, Y_train)
pred = model_1.predict(X_test)
accuracy_score(Y_test,pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.776536312849162

In [109]:
model_2 = RandomForestClassifier()
model_2.fit(X_train,Y_train)
pred = model_2.predict(X_test)
accuracy_score(Y_test, pred)

0.8379888268156425

In [110]:
model_3 = XGBClassifier()
model_3.fit(X_train,Y_train)
pred = model_3.predict(X_test)
accuracy_score(Y_test, pred)

0.88268156424581

### Implementing the Model_3 to Test data

Using Model_3 Due to High Accuracy

In [112]:
df_test = pd.read_csv('test.csv')

pred = model_3.predict(test)
final = pd.DataFrame()
final['PassengerId'] = df_test['PassengerId']
final['Survived'] = pred

In [113]:
final.to_csv('Submission.csv', index=False)