In [597]:
import pandas as pd  # Import pandas for data manipulation and analysis
import numpy as np  # Import numpy for numerical operations
from sklearn.model_selection import train_test_split  # Import train_test_split for splitting data into training and testing sets
from sklearn.preprocessing import StandardScaler, LabelEncoder  # Import StandardScaler for feature scaling and LabelEncoder for encoding target labels
from sklearn.linear_model import LogisticRegression  # Import LogisticRegression for logistic regression model
from sklearn.ensemble import RandomForestClassifier, StackingClassifier  # Import RandomForestClassifier for random forest model and StackingClassifier for model stacking
from sklearn.svm import SVC  # Import SVC for support vector classifier
from xgboost import XGBClassifier  # Import XGBClassifier for XGBoost model
from lightgbm import LGBMClassifier  # Import LGBMClassifier for LightGBM model
from catboost import CatBoostClassifier  # Import CatBoostClassifier for CatBoost model
from sklearn import metrics  # Import metrics from sklearn for model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score  # Import specific metrics for model evaluation
import warnings  # Import warnings to manage warnings
import category_encoders as ce  # Import category_encoders for encoding categorical features
import optuna
# Ignore warnings
warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
import pickle

In [598]:
path = r"C:\Users\User\Desktop\Rashad\DATA\train.csv"  # File path to the CSV file containing the dataset

# Load dataset from Excel file
data = pd.read_csv(path)  # Load the dataset into a pandas DataFrame

# Set option to display all columns
pd.set_option('display.max_columns', None)  # Set pandas option to display all columns of the DataFrame

# Display the loaded dataset
data  # Print or display the DataFrame

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [599]:
df = data.copy()

In [600]:
df['Ticket_Prefix'] = data['Ticket'].astype(str).str.split().str[0] # Extract prefix (first part before space)

In [601]:
ticket_group_sizes = df.groupby('Ticket')['Ticket'].transform('size') # Get group size for each ticket
df['Ticket_Group_Size'] = ticket_group_sizes

df['Ticket_Is_Numeric'] = df['Ticket'].astype(str).str.isnumeric().astype(int) # 1 if numeric, 0 otherwise

In [602]:
# Fare Feature Engineering (handling missing values)
df['Fare'] = df['Fare'].fillna(df['Fare'].median()) # Fill missing fares with median

df['Fare_Bin'] = pd.qcut(df['Fare'], 4, labels=['low', 'medium-low', 'medium-high', 'high']) # Create fare bins using quantiles

In [603]:
df['Log_Fare'] = df['Fare'].apply(lambda x: np.log(x) if x > 0 else 0) # Log transform fare

In [604]:
df['Fare_Pclass_Interaction'] = df['Fare'] * df['Pclass']

In [605]:
df.drop(['Ticket', 'Fare'], axis=1, inplace=True) 

In [606]:
df['Cabin_letter'] = df['Cabin'].astype(str).str[0]

In [607]:
df.drop(['Cabin'], axis=1, inplace=True) 

In [608]:
# Extract Title
df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False) # Extract title between a space and a period

In [609]:
# Replace less frequent titles with 'Rare'
rare_titles = ['Dona', 'Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer']
df['Title'] = df['Title'].replace(rare_titles, 'Rare')

# Extract Last Name
df['Last_Name'] = df['Name'].str.split(',').str[0]

# Calculate Name Length
df['Name_Length'] = df['Name'].str.len()

In [610]:
df.drop(['Name'], axis=1, inplace=True) 

In [611]:
df.drop(['PassengerId'], axis=1, inplace=True) 

In [612]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked,Ticket_Prefix,Ticket_Group_Size,Ticket_Is_Numeric,Fare_Bin,Log_Fare,Fare_Pclass_Interaction,Cabin_letter,Title,Last_Name,Name_Length
0,0,3,male,22.0,1,0,S,A/5,1,0,low,1.981001,21.7500,n,Mr,Braund,23
1,1,1,female,38.0,1,0,C,PC,1,0,high,4.266662,71.2833,C,Mrs,Cumings,51
2,1,3,female,26.0,0,0,S,STON/O2.,1,0,medium-low,2.070022,23.7750,n,Miss,Heikkinen,22
3,1,1,female,35.0,1,0,S,113803,2,1,high,3.972177,53.1000,C,Mrs,Futrelle,44
4,0,3,male,35.0,0,0,S,373450,1,1,medium-low,2.085672,24.1500,n,Mr,Allen,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,S,211536,1,1,medium-low,2.564949,26.0000,n,Rare,Montvila,21
887,1,1,female,19.0,0,0,S,112053,1,1,medium-high,3.401197,30.0000,B,Miss,Graham,28
888,0,3,female,,1,2,S,W./C.,2,0,medium-high,3.154870,70.3500,n,Miss,Johnston,40
889,1,1,male,26.0,0,0,C,111369,1,1,medium-high,3.401197,30.0000,C,Mr,Behr,21


In [613]:
# Columns to apply One-Hot Encoding
one_hot_cols = ['Sex', 'Embarked', 'Fare_Bin', 'Cabin_letter', 'Title']

# One-Hot Encoding
one_hot_encoder = OneHotEncoder(sparse=False, drop='first')  # drop='first' to avoid dummy variable trap
one_hot_encoded = pd.DataFrame(one_hot_encoder.fit_transform(df[one_hot_cols]), 
                               columns=one_hot_encoder.get_feature_names_out(one_hot_cols))

# Merge one-hot encoded columns and drop original columns
df = df.join(one_hot_encoded).drop(columns=one_hot_cols)


In [614]:
# Save the mean encoding mappings during training for Ticket_Prefix and Last_Name
encoding_mappings = {}
target_cols = ['Ticket_Prefix', 'Last_Name']
for col in target_cols:
    mean_encoded = df.groupby(col)['Survived'].mean()
    encoding_mappings[col] = df[col].map(mean_encoded)
    df[col] = df[col].map(mean_encoded)
    
    
def apply_encoding_mappings(data, encoding_mappings):
    for col, mapping in encoding_mappings.items():
        data[col] = data[col].map(mapping).fillna(mapping.mean())  # Replace unseen categories with overall mean
    return data

In [570]:
# Columns to apply Manual Target Encoding
target_cols = ['Ticket_Prefix', 'Last_Name']

# Manual Target Encoding
for col in target_cols:
    mean_encoded = df.groupby(col)['Survived'].mean()
    df[col] = df[col].map(mean_encoded)

In [615]:
df.drop(['Embarked_nan', 'Title_Mlle', 'Title_Mme', 'Cabin_letter_T'], axis=1, inplace=True) 

In [616]:
df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Ticket_Prefix,Ticket_Group_Size,Ticket_Is_Numeric,Log_Fare,Fare_Pclass_Interaction,Last_Name,Name_Length,Sex_male,Embarked_Q,Embarked_S,Fare_Bin_low,Fare_Bin_medium-high,Fare_Bin_medium-low,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_n,Title_Miss,Title_Mr,Title_Mrs,Title_Ms,Title_Rare
0,0,3,22.0,1,0,0.100000,1,0,1.981001,21.7500,0.000000,23,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,1,1,38.0,1,0,0.650000,1,0,4.266662,71.2833,1.000000,51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1,3,26.0,0,0,0.500000,1,0,2.070022,23.7750,1.000000,22,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3,1,1,35.0,1,0,0.500000,2,1,3.972177,53.1000,0.500000,44,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0,3,35.0,0,0,0.000000,1,1,2.085672,24.1500,0.500000,24,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,0.000000,1,1,2.564949,26.0000,0.000000,21,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
887,1,1,19.0,0,0,1.000000,1,1,3.401197,30.0000,0.666667,28,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
888,0,3,,1,2,0.111111,2,0,3.154870,70.3500,0.000000,40,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
889,1,1,26.0,0,0,1.000000,1,1,3.401197,30.0000,1.000000,21,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [617]:
# Assign the 'diagnosis' column as the target variable
targets = df['Survived']

# Create inputs by dropping the 'diagnosis' column from the DataFrame
inputs = df.drop('Survived', axis=1)

# Import necessary libraries
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(inputs, targets, test_size=0.2, random_state=42)

In [618]:
catboost_model_def = CatBoostClassifier()  # Initialize CatBoostClassifier model with default parameters

# Train the CatBoostClassifier model on the training data
catboost_model_def.fit(x_train, y_train)

Learning rate set to 0.008911
0:	learn: 0.6763175	total: 3.89ms	remaining: 3.89s
1:	learn: 0.6594462	total: 7.39ms	remaining: 3.69s
2:	learn: 0.6429667	total: 11ms	remaining: 3.65s
3:	learn: 0.6222512	total: 14.4ms	remaining: 3.58s
4:	learn: 0.6068123	total: 17.7ms	remaining: 3.53s
5:	learn: 0.5903887	total: 21.5ms	remaining: 3.57s
6:	learn: 0.5751464	total: 24.9ms	remaining: 3.53s
7:	learn: 0.5617287	total: 28.4ms	remaining: 3.52s
8:	learn: 0.5468117	total: 32.1ms	remaining: 3.54s
9:	learn: 0.5313439	total: 35.4ms	remaining: 3.5s
10:	learn: 0.5195068	total: 39.3ms	remaining: 3.53s
11:	learn: 0.5054471	total: 43.6ms	remaining: 3.59s
12:	learn: 0.4925322	total: 48ms	remaining: 3.64s
13:	learn: 0.4795020	total: 51ms	remaining: 3.59s
14:	learn: 0.4686072	total: 53.8ms	remaining: 3.53s
15:	learn: 0.4562004	total: 56.7ms	remaining: 3.48s
16:	learn: 0.4462150	total: 59.6ms	remaining: 3.45s
17:	learn: 0.4354432	total: 63.6ms	remaining: 3.47s
18:	learn: 0.4248236	total: 66.5ms	remaining: 3.43s

<catboost.core.CatBoostClassifier at 0x21d9b13b9d0>

In [619]:
def train_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test):
    """
    Train and evaluate the given model on the training and testing data.

    Parameters:
    model_name (str): Name of the model for display purposes.
    model : Machine learning model object.
    X_train : Features of the training data.
    y_train : Target labels of the training data.
    X_test : Features of the testing data.
    y_test : Target labels of the testing data.

    Returns:
    float, float: Gini coefficients calculated from the model's predictions on training and testing data.
    """

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Predict labels and probabilities on the testing data
    y_test_pred = model.predict(X_test)
    y_test_prob = model.predict_proba(X_test)[:, 1]

    # Predict labels and probabilities on the training data
    y_train_pred = model.predict(X_train)
    y_train_prob = model.predict_proba(X_train)[:, 1]

    # Calculate ROC AUC and Gini coefficient for testing data
    roc_test_prob = roc_auc_score(y_test, y_test_prob)
    gini_test_prob = roc_test_prob * 2 - 1

    # Calculate ROC AUC and Gini coefficient for training data
    roc_train_prob = roc_auc_score(y_train, y_train_prob)
    gini_train_prob = roc_train_prob * 2 - 1

    # Calculate confusion matrix and classification report for testing data
    confusion_matrix_test_result = confusion_matrix(y_test, y_test_pred)
    classification_report_test_result = classification_report(y_test, y_test_pred)

    # Calculate confusion matrix and classification report for training data
    confusion_matrix_train_result = confusion_matrix(y_train, y_train_pred)
    classification_report_train_result = classification_report(y_train, y_train_pred)

    # Print model performance metrics
    print(f'Model Performance for {model_name}')
    print('Gini prob for testing data is', gini_test_prob * 100)
    print('Gini prob for training data is', gini_train_prob * 100)
    print('Classification Report for Testing Data:')
    print(classification_report_test_result)
    print('Confusion Matrix for Testing Data:')
    print(confusion_matrix_test_result)
    print('Classification Report for Training Data:')
    print(classification_report_train_result)
    print('Confusion Matrix for Training Data:')
    print(confusion_matrix_train_result)

    return gini_train_prob, gini_test_prob

# Assuming svc_model_def, x_train, y_train, x_test, y_test are defined
gini_df = pd.DataFrame(columns=['Model', 'Gini_train_prob', 'Gini_test_prob'])
gini_train_prob, gini_test_prob = train_and_evaluate_model('xgb', catboost_model_def, x_train, y_train, x_test, y_test)

# Add the result to the DataFrame using concat and sort it
new_row = pd.DataFrame([{'Model': 'Catboost', 'Gini_train_prob': gini_train_prob, 'Gini_test_prob': gini_test_prob}])
gini_df = pd.concat([gini_df, new_row], ignore_index=True)
gini_df_sorted = gini_df.sort_values(by='Gini_test_prob', ascending=False)

gini_df_sorted

Learning rate set to 0.008911
0:	learn: 0.6763175	total: 4.1ms	remaining: 4.09s
1:	learn: 0.6594462	total: 8.2ms	remaining: 4.09s
2:	learn: 0.6429667	total: 11.6ms	remaining: 3.85s
3:	learn: 0.6222512	total: 15.3ms	remaining: 3.82s
4:	learn: 0.6068123	total: 19.7ms	remaining: 3.92s
5:	learn: 0.5903887	total: 23.7ms	remaining: 3.92s
6:	learn: 0.5751464	total: 27.1ms	remaining: 3.85s
7:	learn: 0.5617287	total: 30ms	remaining: 3.72s
8:	learn: 0.5468117	total: 33.5ms	remaining: 3.68s
9:	learn: 0.5313439	total: 36.4ms	remaining: 3.6s
10:	learn: 0.5195068	total: 39.3ms	remaining: 3.53s
11:	learn: 0.5054471	total: 42.2ms	remaining: 3.47s
12:	learn: 0.4925322	total: 45ms	remaining: 3.42s
13:	learn: 0.4795020	total: 48.3ms	remaining: 3.4s
14:	learn: 0.4686072	total: 51.3ms	remaining: 3.37s
15:	learn: 0.4562004	total: 54.3ms	remaining: 3.34s
16:	learn: 0.4462150	total: 57.2ms	remaining: 3.31s
17:	learn: 0.4354432	total: 60.2ms	remaining: 3.28s
18:	learn: 0.4248236	total: 63.1ms	remaining: 3.26s


Unnamed: 0,Model,Gini_train_prob,Gini_test_prob
0,Catboost,0.999992,0.997941


In [620]:
gini_df_sorted

Unnamed: 0,Model,Gini_train_prob,Gini_test_prob
0,Catboost,0.999992,0.997941


In [621]:
dep_path = r"C:\Users\User\Desktop\Rashad\DATA\tittest.csv"  # File path to the CSV file containing the dataset

# Load dataset from Excel file
dep_data = pd.read_csv(dep_path)  # Load the dataset into a pandas DataFrame

# Set option to display all columns
pd.set_option('display.max_columns', None)  # Set pandas option to display all columns of the DataFrame

# Display the loaded dataset
dep_data  # Print or display the DataFrame

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [622]:
dep = dep_data.copy()

In [623]:
dep.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [624]:
dep['Ticket_Prefix'] = dep['Ticket'].astype(str).str.split().str[0] # Extract prefix (first part before space)

ticket_group_sizes = dep.groupby('Ticket')['Ticket'].transform('size') # Get group size for each ticket

dep['Ticket_Group_Size'] = ticket_group_sizes

dep['Ticket_Is_Numeric'] = dep['Ticket'].astype(str).str.isnumeric().astype(int) # 1 if numeric, 0 otherwise

In [625]:
# Fare Feature Engineering (handling missing values)
dep['Fare'] = dep['Fare'].fillna(dep['Fare'].median())  # Fill missing fares with median

# Create fare bins using quantiles
dep['Fare_Bin'] = pd.qcut(dep['Fare'], 4, labels=['low', 'medium-low', 'medium-high', 'high'])

# Log transform fare
dep['Log_Fare'] = dep['Fare'].apply(lambda x: np.log(x) if x > 0 else 0)

# Interaction between fare and class
dep['Fare_Pclass_Interaction'] = dep['Fare'] * dep['Pclass']

In [626]:
dep['Cabin_letter'] = dep['Cabin'].astype(str).str[0]

# Extract Title
dep['Title'] = dep['Name'].str.extract(' ([A-Za-z]+)\.', expand=False) # Extract title between a space and a period

# Replace less frequent titles with 'Rare'
rare_titles = ['Dona', 'Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer']
dep['Title'] = dep['Title'].replace(rare_titles, 'Rare')

# Extract Last Name
dep['Last_Name'] = dep['Name'].str.split(',').str[0]

# Calculate Name Length
dep['Name_Length'] = dep['Name'].str.len()

In [627]:
dep.drop(['Ticket', 'Fare','Name','PassengerId','Cabin'], axis=1, inplace=True) 

In [628]:
# Columns to apply One-Hot Encoding
one_hot_cols = ['Sex', 'Embarked', 'Fare_Bin', 'Cabin_letter', 'Title']

# One-Hot Encoding
one_hot_encoder = OneHotEncoder(sparse=False, drop='first')  # drop='first' to avoid dummy variable trap
one_hot_encoded = pd.DataFrame(one_hot_encoder.fit_transform(dep[one_hot_cols]), 
                               columns=one_hot_encoder.get_feature_names_out(one_hot_cols))

# Merge one-hot encoded columns and drop original columns
dep = dep.join(one_hot_encoded).drop(columns=one_hot_cols)

In [629]:
dep = apply_encoding_mappings(dep, encoding_mappings)

In [630]:
dep

Unnamed: 0,Pclass,Age,SibSp,Parch,Ticket_Prefix,Ticket_Group_Size,Ticket_Is_Numeric,Log_Fare,Fare_Pclass_Interaction,Last_Name,Name_Length,Sex_male,Embarked_Q,Embarked_S,Fare_Bin_low,Fare_Bin_medium-high,Fare_Bin_medium-low,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_n,Title_Miss,Title_Mr,Title_Mrs,Title_Ms,Title_Rare
0,3,34.5,0,0,0.383838,1,1,2.057860,23.4876,0.383838,16,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,3,47.0,1,0,0.383838,1,1,1.945910,21.0000,0.383838,32,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,2,62.0,0,0,0.383838,1,1,2.270836,19.3750,0.383838,25,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,3,27.0,0,0,0.383838,1,1,2.159003,25.9875,0.383838,16,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,3,22.0,1,1,0.383838,1,1,2.508582,36.8625,0.383838,44,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,,0,0,0.383838,1,0,2.085672,24.1500,0.383838,18,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
414,1,39.0,0,0,0.383838,1,0,4.690430,108.9000,0.383838,28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
415,3,38.5,0,0,0.383838,1,0,1.981001,21.7500,0.383838,28,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
416,3,,0,0,0.383838,1,1,2.085672,24.1500,0.383838,19,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [631]:
dep_data['Embarked'].isnull().sum()

0

In [632]:
dep.columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Ticket_Prefix', 'Ticket_Group_Size',
       'Ticket_Is_Numeric', 'Log_Fare', 'Fare_Pclass_Interaction', 'Last_Name',
       'Name_Length', 'Sex_male', 'Embarked_Q', 'Embarked_S', 'Fare_Bin_low',
       'Fare_Bin_medium-high', 'Fare_Bin_medium-low', 'Cabin_letter_B',
       'Cabin_letter_C', 'Cabin_letter_D', 'Cabin_letter_E', 'Cabin_letter_F',
       'Cabin_letter_G', 'Cabin_letter_n', 'Title_Miss', 'Title_Mr',
       'Title_Mrs', 'Title_Ms', 'Title_Rare'],
      dtype='object')

In [633]:
df.columns

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Ticket_Prefix',
       'Ticket_Group_Size', 'Ticket_Is_Numeric', 'Log_Fare',
       'Fare_Pclass_Interaction', 'Last_Name', 'Name_Length', 'Sex_male',
       'Embarked_Q', 'Embarked_S', 'Fare_Bin_low', 'Fare_Bin_medium-high',
       'Fare_Bin_medium-low', 'Cabin_letter_B', 'Cabin_letter_C',
       'Cabin_letter_D', 'Cabin_letter_E', 'Cabin_letter_F', 'Cabin_letter_G',
       'Cabin_letter_n', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Ms',
       'Title_Rare'],
      dtype='object')

In [634]:
# Find the columns that are different between the two lists
train_not_in_deploy = set(df.columns) - set(dep.columns)
deploy_not_in_train = set(dep.columns) - set(df.columns)

print("Columns in training data but not in deployment data:", train_not_in_deploy)
print("Columns in deployment data but not in training data:", deploy_not_in_train)

Columns in training data but not in deployment data: {'Survived'}
Columns in deployment data but not in training data: set()


In [637]:
# Reorder the columns in the deployment data to match the order of columns in the training data
dep_reordered = dep.reindex(columns=df.columns)

In [638]:
dep_data['probability_Survived'] = catboost_model_def.predict_proba(dep_reordered.iloc[:,1:])[:,1]

In [640]:
dep_data['Survived'] = (dep_data['probability_Survived'] >= 0.5).astype(int)

In [641]:
dep_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,probability_Survived,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0.207112,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,0.583839,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0.202895,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0.302264,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0.699117,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0.222464,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,0.219831,0
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0.302943,0
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0.177837,0


In [642]:
submission_df = dep_data[['PassengerId', 'Survived']]

In [644]:
submission_df.to_csv(r'C:\Users\User\Desktop\Rashad\Github\submission4.csv', index=False)