In [198]:
import pandas as pd  # Import pandas for data manipulation and analysis
import numpy as np  # Import numpy for numerical operations
from sklearn.model_selection import train_test_split  # Import train_test_split for splitting data into training and testing sets
from sklearn.preprocessing import StandardScaler, LabelEncoder  # Import StandardScaler for feature scaling and LabelEncoder for encoding target labels
from catboost import CatBoostClassifier  # Import CatBoostClassifier for CatBoost model
from sklearn import metrics  # Import metrics from sklearn for model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score  # Import specific metrics for model evaluation
import warnings  # Import warnings to manage warnings
import category_encoders as ce  # Import category_encoders for encoding categorical features
import optuna  # Import Optuna for hyperparameter optimization
import pickle  # Import pickle for saving model
from sklearn.preprocessing import OneHotEncoder  # Import OneHotEncoder for one-hot encoding
from sklearn.pipeline import Pipeline  # Import Pipeline for constructing a pipeline of estimators
from sklearn.compose import ColumnTransformer  # Import ColumnTransformer for transforming columns separately

# Ignore warnings
warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

In [199]:
path = r"C:\Users\User\Desktop\Rashad\DATA\train.csv"  # File path to the CSV file containing the dataset

# Load dataset from Excel file
df = pd.read_csv(path)  # Load the dataset into a pandas DataFrame

# Set option to display all columns
pd.set_option('display.max_columns', None)  # Set pandas option to display all columns of the DataFrame

# Display the loaded dataset
df  # Print or display the DataFrame

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [200]:
df_copy = df.copy()  # Create a copy of the original DataFrame

In [201]:
all_columns = df_copy.columns.tolist()  # Get a list of all column names in the DataFrame
all_columns.remove('Survived')  # Remove the target column 'Survived' from the list of columns

# Initialize WOEEncoder with specified columns
encoder = ce.WOEEncoder(cols=all_columns)

# Transform specified columns using WOEEncoder, considering the target column 'Survived'
df_copy[all_columns] = encoder.fit_transform(df_copy[all_columns], df_copy['Survived'])

In [202]:
# Assign the 'diagnosis' column as the target variable
targets = df['Survived']

# Create inputs by dropping the 'diagnosis' column from the DataFrame
inputs = df_copy.drop('Survived', axis=1)

# Import necessary libraries
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(inputs, targets, test_size=0.2, random_state=42)

In [203]:
catboost_model_def = CatBoostClassifier()  # Initialize CatBoostClassifier model with default parameters

# Train the CatBoostClassifier model on the training data
catboost_model_def.fit(x_train, y_train)

Learning rate set to 0.008911
0:	learn: 0.6849880	total: 2.52ms	remaining: 2.52s
1:	learn: 0.6779839	total: 4.64ms	remaining: 2.31s
2:	learn: 0.6699262	total: 6.66ms	remaining: 2.21s
3:	learn: 0.6623651	total: 9.04ms	remaining: 2.25s
4:	learn: 0.6549088	total: 11.2ms	remaining: 2.23s
5:	learn: 0.6467765	total: 13.4ms	remaining: 2.22s
6:	learn: 0.6393075	total: 15.6ms	remaining: 2.21s
7:	learn: 0.6323373	total: 17.6ms	remaining: 2.19s
8:	learn: 0.6264490	total: 19.7ms	remaining: 2.17s
9:	learn: 0.6195127	total: 22ms	remaining: 2.18s
10:	learn: 0.6130068	total: 24.3ms	remaining: 2.19s
11:	learn: 0.6062471	total: 26.5ms	remaining: 2.18s
12:	learn: 0.6005285	total: 28.7ms	remaining: 2.18s
13:	learn: 0.5953201	total: 31.6ms	remaining: 2.23s
14:	learn: 0.5890126	total: 34.3ms	remaining: 2.25s
15:	learn: 0.5834267	total: 36.5ms	remaining: 2.25s
16:	learn: 0.5773387	total: 38.4ms	remaining: 2.22s
17:	learn: 0.5721460	total: 40.2ms	remaining: 2.19s
18:	learn: 0.5673225	total: 42ms	remaining: 2.

<catboost.core.CatBoostClassifier at 0x15dcefff310>

In [204]:
def train_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test):
    """
    Train and evaluate the given model on the training and testing data.

    Parameters:
    model_name (str): Name of the model for display purposes.
    model : Machine learning model object.
    X_train : Features of the training data.
    y_train : Target labels of the training data.
    X_test : Features of the testing data.
    y_test : Target labels of the testing data.

    Returns:
    float, float: Gini coefficients calculated from the model's predictions on training and testing data.
    """

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Predict labels and probabilities on the testing data
    y_test_pred = model.predict(X_test)
    y_test_prob = model.predict_proba(X_test)[:, 1]

    # Predict labels and probabilities on the training data
    y_train_pred = model.predict(X_train)
    y_train_prob = model.predict_proba(X_train)[:, 1]

    # Calculate ROC AUC and Gini coefficient for testing data
    roc_test_prob = roc_auc_score(y_test, y_test_prob)
    gini_test_prob = roc_test_prob * 2 - 1

    # Calculate ROC AUC and Gini coefficient for training data
    roc_train_prob = roc_auc_score(y_train, y_train_prob)
    gini_train_prob = roc_train_prob * 2 - 1

    # Calculate confusion matrix and classification report for testing data
    confusion_matrix_test_result = confusion_matrix(y_test, y_test_pred)
    classification_report_test_result = classification_report(y_test, y_test_pred)

    # Calculate confusion matrix and classification report for training data
    confusion_matrix_train_result = confusion_matrix(y_train, y_train_pred)
    classification_report_train_result = classification_report(y_train, y_train_pred)

    # Print model performance metrics
    print(f'Model Performance for {model_name}')
    print('Gini prob for testing data is', gini_test_prob * 100)
    print('Gini prob for training data is', gini_train_prob * 100)
    print('Classification Report for Testing Data:')
    print(classification_report_test_result)
    print('Confusion Matrix for Testing Data:')
    print(confusion_matrix_test_result)
    print('Classification Report for Training Data:')
    print(classification_report_train_result)
    print('Confusion Matrix for Training Data:')
    print(confusion_matrix_train_result)

    return gini_train_prob, gini_test_prob

# Assuming svc_model_def, x_train, y_train, x_test, y_test are defined
gini_df = pd.DataFrame(columns=['Model', 'Gini_train_prob', 'Gini_test_prob'])
gini_train_prob, gini_test_prob = train_and_evaluate_model('xgb', catboost_model_def, x_train, y_train, x_test, y_test)

# Add the result to the DataFrame using concat and sort it
new_row = pd.DataFrame([{'Model': 'svc', 'Gini_train_prob': gini_train_prob, 'Gini_test_prob': gini_test_prob}])
gini_df = pd.concat([gini_df, new_row], ignore_index=True)
gini_df_sorted = gini_df.sort_values(by='Gini_test_prob', ascending=False)

print(gini_df_sorted)

Learning rate set to 0.008911
0:	learn: 0.6849880	total: 2.81ms	remaining: 2.81s
1:	learn: 0.6779839	total: 5.7ms	remaining: 2.84s
2:	learn: 0.6699262	total: 8.01ms	remaining: 2.66s
3:	learn: 0.6623651	total: 10.1ms	remaining: 2.52s
4:	learn: 0.6549088	total: 12.4ms	remaining: 2.47s
5:	learn: 0.6467765	total: 14.6ms	remaining: 2.42s
6:	learn: 0.6393075	total: 16.5ms	remaining: 2.35s
7:	learn: 0.6323373	total: 18.4ms	remaining: 2.28s
8:	learn: 0.6264490	total: 20.2ms	remaining: 2.22s
9:	learn: 0.6195127	total: 22.1ms	remaining: 2.19s
10:	learn: 0.6130068	total: 24.1ms	remaining: 2.16s
11:	learn: 0.6062471	total: 26.3ms	remaining: 2.16s
12:	learn: 0.6005285	total: 28.6ms	remaining: 2.17s
13:	learn: 0.5953201	total: 30.7ms	remaining: 2.17s
14:	learn: 0.5890126	total: 32.8ms	remaining: 2.15s
15:	learn: 0.5834267	total: 34.8ms	remaining: 2.14s
16:	learn: 0.5773387	total: 36.9ms	remaining: 2.13s
17:	learn: 0.5721460	total: 38.9ms	remaining: 2.12s
18:	learn: 0.5673225	total: 40.9ms	remaining:

In [205]:
test_path = r"C:\Users\User\Desktop\Rashad\DATA\test.csv"

# Load dataset from Excel file
test_data = pd.read_csv(test_path)

# Set option to display all columns
pd.set_option('display.max_columns', None)

#Display the loaded dataset
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [206]:
trs_data = test_data.copy()  # Create a copy of the original test_data DataFrame

In [207]:
all_columns = trs_data.columns.tolist()  # Get a list of all column names in the trs_data DataFrame

# Use the previously initialized encoder to transform the specified columns in trs_data
trs_data[all_columns] = encoder.transform(trs_data[all_columns])

In [208]:
# Use the trained CatBoostClassifier model to predict the probability of survival for the test data
test_data['probability_Survived'] = catboost_model_def.predict_proba(trs_data.iloc[:,1:])[:,1]

# Display the updated test_data DataFrame with the predicted probabilities
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,probability_Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0.228666
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,0.175734
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0.131012
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0.202729
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0.658286
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0.044254
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,0.969376
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0.067692
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0.044254


In [209]:
data['Survived'] = (data['probability_Survived'] >= 0.5).astype(int)

In [210]:
data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,probability_Survived,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0.228666,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,0.175734,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0.131012,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0.202729,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0.658286,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0.044254,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,0.969376,1
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0.067692,0
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0.044254,0


In [215]:
submission_df = data[['PassengerId', 'Survived']]

In [216]:
submission_df.to_csv(r'C:\Users\User\Desktop\Rashad\Github\submission3.csv', index=False)

In [217]:
submission_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
