In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# Any results you write to the current directory are saved as output.

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import Levenshtein
import re


def intersection_of_lists(list1, list2):
    """Return the items that occur in both inputs, each listed once.

    The result follows the order of first appearance in ``list1`` so that it
    is reproducible between runs (iterating a ``set`` of strings is not,
    because string hashing is randomised per interpreter session).

    Args:
        list1: Iterable of hashable items; determines the output order.
        list2: Iterable of hashable items to intersect with.

    Returns:
        list: Items present in both inputs, without duplicates.
    """
    wanted = set(list2)
    # dict.fromkeys de-duplicates while keeping insertion order.
    return [item for item in dict.fromkeys(list1) if item in wanted]


def difference_of_lists(list1, list2):
    """Return the items of ``list1`` that do not occur in ``list2``.

    Order and duplicates of ``list1`` are preserved.
    """
    kept = []
    for candidate in list1:
        if candidate in list2:
            continue
        kept.append(candidate)
    return kept


def get_numeric_and_non_numeric_columns(df):
    """Split the column names of ``df`` by dtype and print both groups.

    Returns:
        tuple: ``(numeric_cols, non_numeric_cols)``, two lists of column names.
        A column is numeric when its dtype is a ``np.number`` subtype.
    """
    numeric_part = df.select_dtypes(include=[np.number])
    other_part = df.select_dtypes(exclude=[np.number])
    numeric_cols = numeric_part.columns.tolist()
    non_numeric_cols = other_part.columns.tolist()
    print(f"Numeric columns: {numeric_cols}")
    print(f"Non-numeric columns: {non_numeric_cols}")
    return numeric_cols, non_numeric_cols


def remove_single_unique_or_all_nans(df):
    """Drop columns that carry no information.

    A column is removed when it has at most one distinct non-null value or
    when every entry is NaN.

    Returns:
        tuple: ``(df, removed_columns)``. ``df`` is the same object when
        nothing was removed, otherwise a new frame without the dropped columns.
    """
    removed_columns = [
        name
        for name in df.columns
        if df[name].nunique() <= 1 or df[name].isna().all()
    ]
    if removed_columns:
        df = df.drop(columns=removed_columns)
    print(f"Removed columns due to all NaN or only 1 unique value: {removed_columns}")
    return df, removed_columns


def columns_with_missing_values(df):
    """Return (and print) the names of the columns holding at least one NaN."""
    missing_cols = []
    for name in df.columns:
        if df[name].isna().any():
            missing_cols.append(name)
    print(f"Missing data columns: {missing_cols}")
    return missing_cols


def fill_missingNumeric_with_median(df, missing_cols, numeric_cols):
    """Impute NaNs of numeric columns with the column median.

    Only columns listed in both ``missing_cols`` and ``numeric_cols`` are
    touched. ``df`` is modified in place and also returned.

    Args:
        df: DataFrame to impute.
        missing_cols: Names of columns known to contain NaNs.
        numeric_cols: Names of the numeric columns of ``df``.

    Returns:
        The same ``df`` object, with the selected columns filled.
    """
    for col in intersection_of_lists(missing_cols, numeric_cols):
        median_value = df[col].median()
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on the df[col] selection: that chained in-place
        # form warns in pandas 2.x and does not update df under Copy-on-Write.
        df[col] = df[col].fillna(median_value)
    print("Done inputing missing numeric values with median!")
    return df


def columnsCategory_with_more_than_X_percent_unique(df, categoric_cols, perc):
    """Return (and print) the categoric columns with many distinct values.

    A column qualifies when its number of distinct non-null values is strictly
    greater than ``perc`` percent of the number of rows in ``df``.
    """
    # Cut-off expressed as a row count: `perc` percent of the rows.
    threshold = len(df) * 0.01 * perc
    cols_with_high_uniques = []
    for col in categoric_cols:
        if df[col].nunique() > threshold:
            cols_with_high_uniques.append(col)
    print(f"Columns with high uniques: {cols_with_high_uniques}")
    return cols_with_high_uniques
    


def convert_and_create_integer_columns(df, new_columns, mappings, colName):
    """Replace ``df[colName]`` by integer codes and record the encoding.

    Codes come from ``pd.factorize`` (order of first appearance; missing
    values are coded as -1). ``df``, ``new_columns`` and ``mappings`` are
    updated in place and returned.

    Returns:
        tuple: ``(df, new_columns, mappings)`` where ``mappings[colName]``
        maps each original value to its integer code.
    """
    as_objects = df[colName].astype('object')
    codes, unique_values = pd.factorize(as_objects)
    df[colName] = codes
    # Remember which column was encoded ...
    new_columns.append(colName)
    # ... and how: original value -> integer code.
    mappings[colName] = dict(zip(unique_values, range(len(unique_values))))
    return df, new_columns, mappings


def fill_missing_and_predict(df, new_columns, mappings, usable_cols, column_name):
    """Integer-encode a categoric column and impute its missing entries.

    The column is first factorized via ``convert_and_create_integer_columns``.
    If any value was missing, a logistic regression is fitted on the rows where
    the column is known, using ``usable_cols`` as predictors, and its
    predictions replace the missing codes.

    Args:
        df: DataFrame being transformed (modified in place).
        new_columns: Running list of encoded column names.
        mappings: Running dict of value -> code mappings.
        usable_cols: Predictor columns; they must already be numeric and free
            of NaNs for the model to fit.
        column_name: Column to encode and impute.

    Returns:
        tuple: ``(df, new_columns, mappings)``.
    """
    df, new_columns, mappings = convert_and_create_integer_columns(df, new_columns, mappings, column_name)
    # pd.factorize codes missing values as -1, so -1 marks the rows to impute.
    missing_idx = df[column_name] == -1
    if missing_idx.any():
        non_missing_idx = ~missing_idx
        X_train = df.loc[non_missing_idx, usable_cols]
        y_train = df.loc[non_missing_idx, column_name]
        X_test = df.loc[missing_idx, usable_cols]
        # `multi_class='auto'` is the default and has been deprecated since
        # scikit-learn 1.5, so it is no longer passed explicitly.
        model = LogisticRegression(max_iter=1000, solver='lbfgs')
        model.fit(X_train, y_train)
        # Overwrite the -1 placeholders with the predicted codes.
        df.loc[missing_idx, column_name] = model.predict(X_test)
    return df, new_columns, mappings
    

def get_bigrams(string):
    """Return every pair of adjacent characters of ``string``, in order."""
    pairs = []
    for start in range(len(string) - 1):
        pairs.append(string[start:start + 2])
    return pairs

def sorensen_dice(a, b):
    """Sørensen-Dice coefficient of two strings, computed on bigram sets.

    Returns ``2 * |A ∩ B| / (|A| + |B|)`` where A and B are the sets of
    bigrams of ``a`` and ``b``. When neither string has a bigram (both shorter
    than two characters) the result is 1.0 for equal strings and 0.0 otherwise.
    """
    grams_a = set(get_bigrams(a))
    grams_b = set(get_bigrams(b))
    denominator = len(grams_a) + len(grams_b)
    if denominator == 0:
        # No bigrams on either side: fall back to plain equality.
        if a == b:
            return 1.0
        return 0.0
    shared = len(grams_a.intersection(grams_b))
    return 2 * shared / denominator


def calculate_meanDistanceFromAList(input_string, string_list):
    """Average string distance between ``input_string`` and a list of strings.

    Returns:
        tuple: ``(mean Levenshtein distance, mean Sørensen-Dice coefficient)``
        over all entries of ``string_list``, both as floats.

    Raises:
        ZeroDivisionError: if ``string_list`` is empty.
    """
    levenshtein_total = 0
    dice_total = 0
    for candidate in string_list:
        levenshtein_total += Levenshtein.distance(input_string, candidate)
        dice_total += sorensen_dice(input_string, candidate)
    mean_levenshtein = float(levenshtein_total / len(string_list))
    mean_dice = float(dice_total / len(string_list))
    return mean_levenshtein, mean_dice
    

def takeOut_stringList(df, target, variableCol):
    """Collect the distinct values of a column separately per target class.

    Returns:
        tuple: ``(values where target == 1, values where target == 0)``, each a
        list of the unique entries of ``df[variableCol]`` in order of first
        appearance.
    """
    target_key = f"{target}"
    value_key = f"{variableCol}"
    positives = df.loc[df[target_key] == 1, value_key].unique()
    negatives = df.loc[df[target_key] == 0, value_key].unique()
    return list(positives), list(negatives)


def apply_meanDistance(df, column_name, string_list):
    """Add mean string distances to ``string_list`` as two new columns.

    For every value of ``df[column_name]`` the mean Levenshtein distance and
    mean Sørensen-Dice coefficient against ``string_list`` are stored in the
    columns ``mean_Levenshtein`` and ``mean_sorensen_dice``. ``df`` is
    modified in place and returned.
    """
    def _distances_as_series(value):
        return pd.Series(calculate_meanDistanceFromAList(value, string_list))

    df[['mean_Levenshtein', 'mean_sorensen_dice']] = df[column_name].apply(_distances_as_series)
    return df


def create_DistanceMetric(df, new_columns, usable_cols, colName, target, orig_data):
    """Turn a high-cardinality text column into four numeric distance features.

    For every value of ``df[colName]`` the mean Levenshtein distance and mean
    Sørensen-Dice coefficient are computed against the distinct values that
    ``orig_data[colName]`` takes for ``target == 1`` ("true") and for
    ``target == 0`` ("false"). The results are stored in
    ``<colName>_true_lev``, ``<colName>_true_reg``, ``<colName>_false_lev``
    and ``<colName>_false_reg``.

    Args:
        df: DataFrame being transformed (modified in place).
        new_columns: Running list of processed column names; ``colName`` is
            appended.
        usable_cols: List of feature columns available so far (not mutated).
        colName: Name of the text column to convert.
        target: Name of the binary target column in ``orig_data``.
        orig_data: Labelled frame supplying the reference values.

    Returns:
        tuple: ``(df, new_columns, usable_cols)`` where ``usable_cols`` is a
        new list extended with the four created column names.
    """
    df[colName] = df[colName].astype('str')
    true_NameList, false_NameList = takeOut_stringList(orig_data, target, colName)
    new_columns.append(colName)

    created_cols = []
    for label, reference in (("true", true_NameList), ("false", false_NameList)):
        # The reference values come straight from orig_data and may contain
        # NaN or other non-string objects; cast them like df[colName] so the
        # string-distance functions always receive str arguments.
        reference = [str(value) for value in reference]
        lev_col = f"{colName}_{label}_lev"
        reg_col = f"{colName}_{label}_reg"
        df[[lev_col, reg_col]] = df[colName].apply(
            lambda x, reference=reference: pd.Series(calculate_meanDistanceFromAList(x, reference))
        )
        created_cols += [lev_col, reg_col]

    usable_cols = usable_cols + created_cols
    return df, new_columns, usable_cols


def convert_All_integer_columns(df, numeric_cols, missing_cols, categoric_cols, cols_with_high_uniques, target, orig_data):
    """Convert every categoric column of ``df`` to numeric features.

    Categoric columns are handled in four groups:

    * no NaNs, few distinct values   -> integer codes;
    * NaNs, few distinct values      -> integer codes, NaNs predicted from the
      columns already usable;
    * no NaNs, many distinct values  -> replaced by string-distance features;
    * NaNs, many distinct values     -> dropped (no conversion is implemented).

    Args:
        df: DataFrame to transform.
        numeric_cols: Names of the numeric columns.
        missing_cols: Names of the columns still containing NaNs.
        categoric_cols: Names of the non-numeric columns.
        cols_with_high_uniques: Categoric columns with many distinct values.
        target: Name of the binary target column in ``orig_data``.
        orig_data: Labelled frame used as reference for the distance features.

    Returns:
        tuple: ``(df, new_columns, mappings)``: the transformed frame, the
        list of processed categoric columns and the value -> code mappings.
    """
    new_columns = []
    mappings = {}
    usable_cols = list(numeric_cols)

    categoric_nonNA_cols = difference_of_lists(categoric_cols, missing_cols)
    categoric_nonNA_Few_cols = difference_of_lists(categoric_nonNA_cols, cols_with_high_uniques)
    categoric_nonNA_Multiple_cols = difference_of_lists(categoric_nonNA_cols, categoric_nonNA_Few_cols)

    # Only categoric columns belong in the NA groups; a numeric column that
    # still has NaNs must not be factorized or dropped here.
    categoric_NA_cols = [col for col in missing_cols if col in categoric_cols]
    categoric_NA_Few_cols = difference_of_lists(categoric_NA_cols, cols_with_high_uniques)
    categoric_NA_Multiple_cols = difference_of_lists(categoric_NA_cols, categoric_NA_Few_cols)

    for col in categoric_nonNA_Few_cols:
        df, new_columns, mappings = convert_and_create_integer_columns(df, new_columns, mappings, col)
        print(f"[No NA values][Less Unique Values] Categoric columns Converted to Integer: {col}")
    usable_cols = usable_cols + categoric_nonNA_Few_cols

    for col in categoric_NA_Few_cols:
        df, new_columns, mappings = fill_missing_and_predict(df, new_columns, mappings, usable_cols, col)
        print(f"[NA values][Less Unique Values] Categoric columns Converted to Integer and Missing Are Predicted: {col}")
        # Each imputed column becomes a predictor for the next one.
        usable_cols = usable_cols + [col]

    for col in categoric_nonNA_Multiple_cols:
        df, new_columns, usable_cols = create_DistanceMetric(df, new_columns, usable_cols, col, target, orig_data)
        df = df.drop(columns=[col])
        print(f"[No NA values][Multiple Unique Values] Categoric columns Converted to Distance Based On Cluster: {col}")

    for col in categoric_NA_Multiple_cols:
        # These columns are only dropped, so the log message says so.
        df = df.drop(columns=[col])
        print(f"[NA values][Multiple Unique Values] Categoric columns Dropped (no conversion implemented): {col}")

    print(f"Mappings: {mappings}")
    return df, new_columns, mappings

In [39]:
class tangiD_BinaryClassification:
    """Preprocessing pipeline for a binary-classification table.

    Args:
        data: DataFrame to preprocess.
        train: Labelled reference frame; only used when ``type`` is
            ``"TestData!"``.
        target: Name of the binary target column.
        type: ``"TestData!"`` for unlabelled data; any other value is treated
            as training data.
        features: List of feature column names to process.
    """

    def __init__(self, data, train, target, type, features):
        self.data = data
        self.target = target
        self.type = type
        # Training data is its own reference (a copy is kept); test data uses
        # the supplied training frame as reference.
        self.origData = train if self.type == "TestData!" else self.data.copy()
        self.allFeatures = features

    def medianIntifying(self, highUniq = 10):
        """Run the whole pipeline and return the all-numeric feature frame.

        Steps: drop uninformative columns, median-fill numeric NaNs, then
        convert the categoric columns. ``highUniq`` is the percentage of rows
        above which a categoric column counts as high-cardinality. For
        training data the target column is appended to the result.
        """
        feature_frame = self.data[self.allFeatures].copy()
        self.cleanDF, self.removed_columns = remove_single_unique_or_all_nans(feature_frame)
        self.numeric_cols, self.non_numeric_cols = get_numeric_and_non_numeric_columns(self.cleanDF)
        self.missing_cols = columns_with_missing_values(self.cleanDF)
        self.filledNumeric_df = fill_missingNumeric_with_median(
            self.cleanDF, self.missing_cols, self.numeric_cols
        )
        # Recompute after the numeric fill: only categoric gaps should remain.
        self.missing_cols = columns_with_missing_values(self.filledNumeric_df)
        self.high_uniques = columnsCategory_with_more_than_X_percent_unique(
            self.filledNumeric_df, self.non_numeric_cols, highUniq
        )
        self.updated_df, self.new_columns, self.mappings = convert_All_integer_columns(
            self.filledNumeric_df,
            self.numeric_cols,
            self.missing_cols,
            self.non_numeric_cols,
            self.high_uniques,
            self.target,
            self.origData,
        )
        if self.type != "TestData!":
            # Training mode: put the label back next to the features.
            self.updated_df = pd.concat([self.updated_df, self.data[self.target]], axis=1)
        return self.updated_df


In [40]:
# Load the Titanic training set and run the preprocessing pipeline in training mode.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
trainClass = tangiD_BinaryClassification(
    data=train_data,
    train=None,  # no separate reference frame is passed in training mode
    target="Survived",
    type="TrainData!",
    features=['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch','Ticket', 'Fare', 'Cabin', 'Embarked'],
)
trainData = trainClass.medianIntifying(highUniq=10)
trainData

Removed columns due to all NaN or only 1 unique value: []
Numeric columns: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Non-numeric columns: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
Missing data columns: ['Age', 'Cabin', 'Embarked']
Done inputing missing numeric values with median!
Missing data columns: ['Cabin', 'Embarked']
Columns with high uniques: ['Name', 'Ticket', 'Cabin']
[No NA values][Less Unique Values] Categoric columns Converted to Integer: Sex
[NA values][Less Unique Values] Categoric columns Converted to Integer and Missing Are Predicted: Embarked
[No NA values][Multiple Unique Values] Categoric columns Converted to Distance Based On Cluster: Name
[No NA values][Multiple Unique Values] Categoric columns Converted to Distance Based On Cluster: Ticket
[NA values][Multiple Unique Values] Categoric columns Converted to Distance Based On Cluster and Missing Are Predicted: Cabin
Mappings: {'Sex': {'male': 0, 'female': 1}, 'Embarked': {'S': 0, 'C': 1, 'Q': 2}}


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Name_true_lev,Name_true_reg,Name_false_lev,Name_false_reg,Ticket_true_lev,Ticket_true_reg,Ticket_false_lev,Ticket_false_reg,Survived
0,3,0,22.0,1,0,7.2500,0,23.330409,0.240875,18.391621,0.280404,8.076923,0.061181,8.010638,0.059223,0
1,1,1,38.0,1,0,71.2833,1,39.064327,0.222573,39.794171,0.199117,6.919231,0.101571,7.493617,0.049166,1
2,3,1,26.0,0,0,7.9250,0,23.897661,0.268535,20.091075,0.222354,13.261538,0.064463,13.236170,0.071896,1
3,1,1,35.0,1,0,53.1000,0,33.649123,0.225927,33.755920,0.199984,5.938462,0.078559,6.265957,0.055481,1
4,3,0,35.0,0,0,8.0500,0,23.242690,0.256739,18.566485,0.311537,6.100000,0.059899,6.093617,0.097651,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,0,27.0,0,0,13.0000,0,25.824561,0.129386,20.513661,0.148259,5.811538,0.061989,6.168085,0.067192,0
887,1,1,19.0,0,0,30.0000,0,24.210526,0.275001,22.284153,0.211829,5.846154,0.060876,6.104255,0.057290,1
888,3,1,28.0,1,2,23.4500,0,30.067251,0.288539,30.038251,0.231516,9.200000,0.032052,9.157447,0.038314,0
889,1,0,26.0,0,0,30.0000,1,23.023392,0.228507,18.191257,0.271455,5.961538,0.093097,6.304255,0.077616,1


In [42]:
# Load the Titanic test set and preprocess it, using the labelled training
# frame as reference for the distance features.
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
testClass = tangiD_BinaryClassification(
    data=test_data,
    train=train_data,
    target="Survived",
    type="TestData!",
    features=['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch','Ticket', 'Fare', 'Cabin', 'Embarked'],
)
testData = testClass.medianIntifying(highUniq=10)
testData

Removed columns due to all NaN or only 1 unique value: []
Numeric columns: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Non-numeric columns: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
Missing data columns: ['Age', 'Fare', 'Cabin']
Done inputing missing numeric values with median!
Missing data columns: ['Cabin']
Columns with high uniques: ['Name', 'Ticket', 'Cabin']
[No NA values][Less Unique Values] Categoric columns Converted to Integer: Sex
[No NA values][Less Unique Values] Categoric columns Converted to Integer: Embarked
[No NA values][Multiple Unique Values] Categoric columns Converted to Distance Based On Cluster: Name
[No NA values][Multiple Unique Values] Categoric columns Converted to Distance Based On Cluster: Ticket
[NA values][Multiple Unique Values] Categoric columns Converted to Distance Based On Cluster and Missing Are Predicted: Cabin
Mappings: {'Sex': {'male': 0, 'female': 1}, 'Embarked': {'Q': 0, 'S': 1, 'C': 2}}


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Name_true_lev,Name_true_reg,Name_false_lev,Name_false_reg,Ticket_true_lev,Ticket_true_reg,Ticket_false_lev,Ticket_false_reg
0,3,0,34.5,0,0,7.8292,0,22.912281,0.238211,16.633880,0.306957,6.153846,0.068473,6.159574,0.047670
1,3,1,47.0,1,0,7.0000,1,25.918129,0.239974,24.313297,0.229048,5.996154,0.039048,5.887234,0.045391
2,2,0,62.0,0,0,9.6875,0,24.078947,0.230024,19.597450,0.270204,5.926923,0.035986,5.951064,0.039179
3,3,0,27.0,0,0,8.6625,1,22.967836,0.236747,17.063752,0.297216,5.869231,0.034021,5.912766,0.040689
4,3,1,22.0,1,1,12.2875,1,33.301170,0.255400,33.191257,0.222822,6.250000,0.065359,6.374468,0.064780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,0,27.0,0,0,8.0500,1,24.210526,0.207600,17.950820,0.269737,8.253846,0.049999,8.093617,0.064082
414,1,1,39.0,0,0,108.9000,2,27.643275,0.157274,24.136612,0.154155,6.823077,0.107360,7.487234,0.051602
415,3,0,38.5,0,0,7.2500,1,24.786550,0.229698,20.810565,0.269645,15.423077,0.062341,15.153191,0.076489
416,3,0,27.0,0,0,8.0500,1,22.692982,0.253124,17.306011,0.310065,6.180769,0.044993,5.985106,0.038481


In [2]:
# Read the labelled Titanic training set and preview its first rows.
train_csv_path = "/kaggle/input/titanic/train.csv"
train_data = pd.read_csv(train_csv_path)
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Read the unlabelled Titanic test set.
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
# A notebook cell renders only its last expression, so a bare `.head()` call
# before this line would be evaluated and thrown away; show the columns.
test_data.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
import pandas as pd



def intersection_of_lists(list1, list2):
    """Return the items that occur in both inputs, each listed once.

    The result follows the order of first appearance in ``list1`` so that it
    is reproducible between runs (iterating a ``set`` of strings is not,
    because string hashing is randomised per interpreter session).

    Args:
        list1: Iterable of hashable items; determines the output order.
        list2: Iterable of hashable items to intersect with.

    Returns:
        list: Items present in both inputs, without duplicates.
    """
    wanted = set(list2)
    # dict.fromkeys de-duplicates while keeping insertion order.
    return [item for item in dict.fromkeys(list1) if item in wanted]


def difference_of_lists(list1, list2):
    """Return the items of ``list1`` that do not occur in ``list2``.

    Order and duplicates of ``list1`` are preserved.
    """
    kept = []
    for candidate in list1:
        if candidate in list2:
            continue
        kept.append(candidate)
    return kept


def get_numeric_and_non_numeric_columns(df):
    """Split the column names of ``df`` by dtype and print both groups.

    Returns:
        tuple: ``(numeric_cols, non_numeric_cols)``, two lists of column names.
        A column is numeric when its dtype is a ``np.number`` subtype.
    """
    numeric_part = df.select_dtypes(include=[np.number])
    other_part = df.select_dtypes(exclude=[np.number])
    numeric_cols = numeric_part.columns.tolist()
    non_numeric_cols = other_part.columns.tolist()
    print(f"Numeric columns: {numeric_cols}")
    print(f"Non-numeric columns: {non_numeric_cols}")
    return numeric_cols, non_numeric_cols


def remove_single_unique_or_all_nans(df):
    """Drop columns that carry no information.

    A column is removed when it has at most one distinct non-null value or
    when every entry is NaN.

    Returns:
        tuple: ``(df, removed_columns)``. ``df`` is the same object when
        nothing was removed, otherwise a new frame without the dropped columns.
    """
    removed_columns = [
        name
        for name in df.columns
        if df[name].nunique() <= 1 or df[name].isna().all()
    ]
    if removed_columns:
        df = df.drop(columns=removed_columns)
    print(f"Removed columns due to all NaN or only 1 unique value: {removed_columns}")
    return df, removed_columns


def columns_with_missing_values(df):
    """Return (and print) the names of the columns holding at least one NaN."""
    missing_cols = []
    for name in df.columns:
        if df[name].isna().any():
            missing_cols.append(name)
    print(f"Missing data columns: {missing_cols}")
    return missing_cols


def fill_missingNumeric_with_median(df, missing_cols, numeric_cols):
    """Impute NaNs of numeric columns with the column median.

    Only columns listed in both ``missing_cols`` and ``numeric_cols`` are
    touched. ``df`` is modified in place and also returned.

    Args:
        df: DataFrame to impute.
        missing_cols: Names of columns known to contain NaNs.
        numeric_cols: Names of the numeric columns of ``df``.

    Returns:
        The same ``df`` object, with the selected columns filled.
    """
    for col in intersection_of_lists(missing_cols, numeric_cols):
        median_value = df[col].median()
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on the df[col] selection: that chained in-place
        # form warns in pandas 2.x and does not update df under Copy-on-Write.
        df[col] = df[col].fillna(median_value)
    print("Done inputing missing numeric values with median!")
    return df


def columnsCategory_with_more_than_X_percent_unique(df, categoric_cols, perc):
    """Return (and print) the categoric columns with many distinct values.

    A column qualifies when its number of distinct non-null values is strictly
    greater than ``perc`` percent of the number of rows in ``df``.
    """
    # Cut-off expressed as a row count: `perc` percent of the rows.
    threshold = len(df) * 0.01 * perc
    cols_with_high_uniques = []
    for col in categoric_cols:
        if df[col].nunique() > threshold:
            cols_with_high_uniques.append(col)
    print(f"Columns with high uniques: {cols_with_high_uniques}")
    return cols_with_high_uniques
    


def convert_and_create_integer_columns(df, new_columns, mappings, colName):
    """Replace ``df[colName]`` by integer codes and record the encoding.

    Codes come from ``pd.factorize`` (order of first appearance; missing
    values are coded as -1). ``df``, ``new_columns`` and ``mappings`` are
    updated in place and returned.

    Returns:
        tuple: ``(df, new_columns, mappings)`` where ``mappings[colName]``
        maps each original value to its integer code.
    """
    as_objects = df[colName].astype('object')
    codes, unique_values = pd.factorize(as_objects)
    df[colName] = codes
    # Remember which column was encoded ...
    new_columns.append(colName)
    # ... and how: original value -> integer code.
    mappings[colName] = dict(zip(unique_values, range(len(unique_values))))
    return df, new_columns, mappings


def fill_missing_and_predict(df, new_columns, mappings, usable_cols, column_name):
    """Integer-encode a categoric column and impute its missing entries.

    The column is first factorized via ``convert_and_create_integer_columns``.
    If any value was missing, a logistic regression is fitted on the rows where
    the column is known, using ``usable_cols`` as predictors, and its
    predictions replace the missing codes.

    Args:
        df: DataFrame being transformed (modified in place).
        new_columns: Running list of encoded column names.
        mappings: Running dict of value -> code mappings.
        usable_cols: Predictor columns; they must already be numeric and free
            of NaNs for the model to fit.
        column_name: Column to encode and impute.

    Returns:
        tuple: ``(df, new_columns, mappings)``.
    """
    df, new_columns, mappings = convert_and_create_integer_columns(df, new_columns, mappings, column_name)
    # pd.factorize codes missing values as -1, so -1 marks the rows to impute.
    missing_idx = df[column_name] == -1
    if missing_idx.any():
        non_missing_idx = ~missing_idx
        X_train = df.loc[non_missing_idx, usable_cols]
        y_train = df.loc[non_missing_idx, column_name]
        X_test = df.loc[missing_idx, usable_cols]
        # `multi_class='auto'` is the default and has been deprecated since
        # scikit-learn 1.5, so it is no longer passed explicitly.
        model = LogisticRegression(max_iter=1000, solver='lbfgs')
        model.fit(X_train, y_train)
        # Overwrite the -1 placeholders with the predicted codes.
        df.loc[missing_idx, column_name] = model.predict(X_test)
    return df, new_columns, mappings
    

def get_bigrams(string):
    """Return every pair of adjacent characters of ``string``, in order."""
    pairs = []
    for start in range(len(string) - 1):
        pairs.append(string[start:start + 2])
    return pairs

def sorensen_dice(a, b):
    """Sørensen-Dice coefficient of two strings, computed on bigram sets.

    Returns ``2 * |A ∩ B| / (|A| + |B|)`` where A and B are the sets of
    bigrams of ``a`` and ``b``. When neither string has a bigram (both shorter
    than two characters) the result is 1.0 for equal strings and 0.0 otherwise.
    """
    grams_a = set(get_bigrams(a))
    grams_b = set(get_bigrams(b))
    denominator = len(grams_a) + len(grams_b)
    if denominator == 0:
        # No bigrams on either side: fall back to plain equality.
        if a == b:
            return 1.0
        return 0.0
    shared = len(grams_a.intersection(grams_b))
    return 2 * shared / denominator


def calculate_meanDistanceFromAList(input_string, string_list):
    """Average string distance between ``input_string`` and a list of strings.

    Returns:
        tuple: ``(mean Levenshtein distance, mean Sørensen-Dice coefficient)``
        over all entries of ``string_list``, both as floats.

    Raises:
        ZeroDivisionError: if ``string_list`` is empty.
    """
    levenshtein_total = 0
    dice_total = 0
    for candidate in string_list:
        levenshtein_total += Levenshtein.distance(input_string, candidate)
        dice_total += sorensen_dice(input_string, candidate)
    mean_levenshtein = float(levenshtein_total / len(string_list))
    mean_dice = float(dice_total / len(string_list))
    return mean_levenshtein, mean_dice
    

def takeOut_stringList(df, target, variableCol):
    """Collect the distinct values of a column separately per target class.

    Returns:
        tuple: ``(values where target == 1, values where target == 0)``, each a
        list of the unique entries of ``df[variableCol]`` in order of first
        appearance.
    """
    target_key = f"{target}"
    value_key = f"{variableCol}"
    positives = df.loc[df[target_key] == 1, value_key].unique()
    negatives = df.loc[df[target_key] == 0, value_key].unique()
    return list(positives), list(negatives)


def apply_meanDistance(df, column_name, string_list):
    """Add mean string distances to ``string_list`` as two new columns.

    For every value of ``df[column_name]`` the mean Levenshtein distance and
    mean Sørensen-Dice coefficient against ``string_list`` are stored in the
    columns ``mean_Levenshtein`` and ``mean_sorensen_dice``. ``df`` is
    modified in place and returned.
    """
    def _distances_as_series(value):
        return pd.Series(calculate_meanDistanceFromAList(value, string_list))

    df[['mean_Levenshtein', 'mean_sorensen_dice']] = df[column_name].apply(_distances_as_series)
    return df


def create_DistanceMetric(df, new_columns, usable_cols, colName, target, orig_data):
    """Turn a high-cardinality text column into four numeric distance features.

    For every value of ``df[colName]`` the mean Levenshtein distance and mean
    Sørensen-Dice coefficient are computed against the distinct values that
    ``orig_data[colName]`` takes for ``target == 1`` ("true") and for
    ``target == 0`` ("false"). The results are stored in
    ``<colName>_true_lev``, ``<colName>_true_reg``, ``<colName>_false_lev``
    and ``<colName>_false_reg``.

    Args:
        df: DataFrame being transformed (modified in place).
        new_columns: Running list of processed column names; ``colName`` is
            appended.
        usable_cols: List of feature columns available so far (not mutated).
        colName: Name of the text column to convert.
        target: Name of the binary target column in ``orig_data``.
        orig_data: Labelled frame supplying the reference values.

    Returns:
        tuple: ``(df, new_columns, usable_cols)`` where ``usable_cols`` is a
        new list extended with the four created column names.
    """
    df[colName] = df[colName].astype('str')
    true_NameList, false_NameList = takeOut_stringList(orig_data, target, colName)
    new_columns.append(colName)

    created_cols = []
    for label, reference in (("true", true_NameList), ("false", false_NameList)):
        # The reference values come straight from orig_data and may contain
        # NaN or other non-string objects; cast them like df[colName] so the
        # string-distance functions always receive str arguments.
        reference = [str(value) for value in reference]
        lev_col = f"{colName}_{label}_lev"
        reg_col = f"{colName}_{label}_reg"
        df[[lev_col, reg_col]] = df[colName].apply(
            lambda x, reference=reference: pd.Series(calculate_meanDistanceFromAList(x, reference))
        )
        created_cols += [lev_col, reg_col]

    usable_cols = usable_cols + created_cols
    return df, new_columns, usable_cols


def convert_All_integer_columns(df, numeric_cols, missing_cols, categoric_cols, cols_with_high_uniques, target, orig_data):
    """Convert every categoric column of ``df`` to numeric features.

    Categoric columns are handled in four groups:

    * no NaNs, few distinct values   -> integer codes;
    * NaNs, few distinct values      -> integer codes, NaNs predicted from the
      columns already usable;
    * no NaNs, many distinct values  -> replaced by string-distance features;
    * NaNs, many distinct values     -> dropped (no conversion is implemented).

    Args:
        df: DataFrame to transform.
        numeric_cols: Names of the numeric columns.
        missing_cols: Names of the columns still containing NaNs.
        categoric_cols: Names of the non-numeric columns.
        cols_with_high_uniques: Categoric columns with many distinct values.
        target: Name of the binary target column in ``orig_data``.
        orig_data: Labelled frame used as reference for the distance features.

    Returns:
        tuple: ``(df, new_columns, mappings)``: the transformed frame, the
        list of processed categoric columns and the value -> code mappings.
    """
    new_columns = []
    mappings = {}
    usable_cols = list(numeric_cols)

    categoric_nonNA_cols = difference_of_lists(categoric_cols, missing_cols)
    categoric_nonNA_Few_cols = difference_of_lists(categoric_nonNA_cols, cols_with_high_uniques)
    categoric_nonNA_Multiple_cols = difference_of_lists(categoric_nonNA_cols, categoric_nonNA_Few_cols)

    # Only categoric columns belong in the NA groups; a numeric column that
    # still has NaNs must not be factorized or dropped here.
    categoric_NA_cols = [col for col in missing_cols if col in categoric_cols]
    categoric_NA_Few_cols = difference_of_lists(categoric_NA_cols, cols_with_high_uniques)
    categoric_NA_Multiple_cols = difference_of_lists(categoric_NA_cols, categoric_NA_Few_cols)

    for col in categoric_nonNA_Few_cols:
        df, new_columns, mappings = convert_and_create_integer_columns(df, new_columns, mappings, col)
        print(f"[No NA values][Less Unique Values] Categoric columns Converted to Integer: {col}")
    usable_cols = usable_cols + categoric_nonNA_Few_cols

    for col in categoric_NA_Few_cols:
        df, new_columns, mappings = fill_missing_and_predict(df, new_columns, mappings, usable_cols, col)
        print(f"[NA values][Less Unique Values] Categoric columns Converted to Integer and Missing Are Predicted: {col}")
        # Each imputed column becomes a predictor for the next one.
        usable_cols = usable_cols + [col]

    for col in categoric_nonNA_Multiple_cols:
        df, new_columns, usable_cols = create_DistanceMetric(df, new_columns, usable_cols, col, target, orig_data)
        df = df.drop(columns=[col])
        print(f"[No NA values][Multiple Unique Values] Categoric columns Converted to Distance Based On Cluster: {col}")

    for col in categoric_NA_Multiple_cols:
        # These columns are only dropped, so the log message says so.
        df = df.drop(columns=[col])
        print(f"[NA values][Multiple Unique Values] Categoric columns Dropped (no conversion implemented): {col}")

    print(f"Mappings: {mappings}")
    return df, new_columns, mappings

In [5]:
# Step-by-step preprocessing of the training set.
feature_columns = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
df_cleaned, removed_columns = remove_single_unique_or_all_nans(train_data[feature_columns].copy())
numeric_cols, non_numeric_cols = get_numeric_and_non_numeric_columns(df_cleaned)
missing_cols = columns_with_missing_values(df_cleaned)
filledNumeric_df = fill_missingNumeric_with_median(df_cleaned, missing_cols, numeric_cols)
# Recompute after the numeric fill so only categoric gaps remain listed.
missing_cols = columns_with_missing_values(filledNumeric_df)
high_uniques = columnsCategory_with_more_than_X_percent_unique(filledNumeric_df, non_numeric_cols, 10)
updated_df_train, new_columns_train, mappings_train = convert_All_integer_columns(
    filledNumeric_df,
    numeric_cols,
    missing_cols,
    non_numeric_cols,
    high_uniques,
    'Survived',
    train_data,
)
# Put the label back next to the engineered features.
train_df = pd.concat([updated_df_train, train_data['Survived']], axis=1)
train_df

Removed columns due to all NaN or only 1 unique value: []
Numeric columns: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Non-numeric columns: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
Missing data columns: ['Age', 'Cabin', 'Embarked']
Done inputing missing numeric values with median!
Missing data columns: ['Cabin', 'Embarked']
Columns with high uniques: ['Name', 'Ticket', 'Cabin']
[No NA values][Less Unique Values] Categoric columns Converted to Integer: Sex
[NA values][Less Unique Values] Categoric columns Converted to Integer and Missing Are Predicted: Embarked
[No NA values][Multiple Unique Values] Categoric columns Converted to Distance Based On Cluster: Name
[No NA values][Multiple Unique Values] Categoric columns Converted to Distance Based On Cluster: Ticket
[NA values][Multiple Unique Values] Categoric columns Converted to Distance Based On Cluster and Missing Are Predicted: Cabin
Mappings: {'Sex': {'male': 0, 'female': 1}, 'Embarked': {'S': 0, 'C': 1, 'Q': 2}}


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Name_true_lev,Name_true_reg,Name_false_lev,Name_false_reg,Ticket_true_lev,Ticket_true_reg,Ticket_false_lev,Ticket_false_reg,Survived
0,3,0,22.0,1,0,7.2500,0,23.330409,0.240875,18.391621,0.280404,8.076923,0.061181,8.010638,0.059223,0
1,1,1,38.0,1,0,71.2833,1,39.064327,0.222573,39.794171,0.199117,6.919231,0.101571,7.493617,0.049166,1
2,3,1,26.0,0,0,7.9250,0,23.897661,0.268535,20.091075,0.222354,13.261538,0.064463,13.236170,0.071896,1
3,1,1,35.0,1,0,53.1000,0,33.649123,0.225927,33.755920,0.199984,5.938462,0.078559,6.265957,0.055481,1
4,3,0,35.0,0,0,8.0500,0,23.242690,0.256739,18.566485,0.311537,6.100000,0.059899,6.093617,0.097651,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,0,27.0,0,0,13.0000,0,25.824561,0.129386,20.513661,0.148259,5.811538,0.061989,6.168085,0.067192,0
887,1,1,19.0,0,0,30.0000,0,24.210526,0.275001,22.284153,0.211829,5.846154,0.060876,6.104255,0.057290,1
888,3,1,28.0,1,2,23.4500,0,30.067251,0.288539,30.038251,0.231516,9.200000,0.032052,9.157447,0.038314,0
889,1,0,26.0,0,0,30.0000,1,23.023392,0.228507,18.191257,0.271455,5.961538,0.093097,6.304255,0.077616,1


In [38]:
# Step-by-step preprocessing of the test set; the labelled training frame
# is passed as reference data for the distance features.
feature_columns = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
df_cleaned, removed_columns = remove_single_unique_or_all_nans(test_data[feature_columns].copy())
numeric_cols, non_numeric_cols = get_numeric_and_non_numeric_columns(df_cleaned)
missing_cols = columns_with_missing_values(df_cleaned)
filledNumeric_df = fill_missingNumeric_with_median(df_cleaned, missing_cols, numeric_cols)
# Recompute after the numeric fill so only categoric gaps remain listed.
missing_cols = columns_with_missing_values(filledNumeric_df)
high_uniques = columnsCategory_with_more_than_X_percent_unique(filledNumeric_df, non_numeric_cols, 10)
updated_df_test, new_columns_test, mappings_test = convert_All_integer_columns(
    filledNumeric_df,
    numeric_cols,
    missing_cols,
    non_numeric_cols,
    high_uniques,
    'Survived',
    train_data,
)
updated_df_test

Removed columns due to all NaN or only 1 unique value: []
Numeric columns: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Non-numeric columns: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
Missing data columns: ['Age', 'Fare', 'Cabin']
Done inputing missing numeric values with median!
Missing data columns: ['Cabin']
Columns with high uniques: ['Name', 'Ticket', 'Cabin']
[No NA values][Less Unique Values] Categoric columns Converted to Integer: Sex
[No NA values][Less Unique Values] Categoric columns Converted to Integer: Embarked
[No NA values][Multiple Unique Values] Categoric columns Converted to Distance Based On Cluster: Name
[No NA values][Multiple Unique Values] Categoric columns Converted to Distance Based On Cluster: Ticket
[NA values][Multiple Unique Values] Categoric columns Converted to Distance Based On Cluster and Missing Are Predicted: Cabin
Mappings: {'Sex': {'male': 0, 'female': 1}, 'Embarked': {'Q': 0, 'S': 1, 'C': 2}}


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Name_true_lev,Name_true_reg,Name_false_lev,Name_false_reg,Ticket_true_lev,Ticket_true_reg,Ticket_false_lev,Ticket_false_reg
0,3,0,34.5,0,0,7.8292,0,22.912281,0.238211,16.633880,0.306957,6.153846,0.068473,6.159574,0.047670
1,3,1,47.0,1,0,7.0000,1,25.918129,0.239974,24.313297,0.229048,5.996154,0.039048,5.887234,0.045391
2,2,0,62.0,0,0,9.6875,0,24.078947,0.230024,19.597450,0.270204,5.926923,0.035986,5.951064,0.039179
3,3,0,27.0,0,0,8.6625,1,22.967836,0.236747,17.063752,0.297216,5.869231,0.034021,5.912766,0.040689
4,3,1,22.0,1,1,12.2875,1,33.301170,0.255400,33.191257,0.222822,6.250000,0.065359,6.374468,0.064780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,0,27.0,0,0,8.0500,1,24.210526,0.207600,17.950820,0.269737,8.253846,0.049999,8.093617,0.064082
414,1,1,39.0,0,0,108.9000,2,27.643275,0.157274,24.136612,0.154155,6.823077,0.107360,7.487234,0.051602
415,3,0,38.5,0,0,7.2500,1,24.786550,0.229698,20.810565,0.269645,15.423077,0.062341,15.153191,0.076489
416,3,0,27.0,0,0,8.0500,1,22.692982,0.253124,17.306011,0.310065,6.180769,0.044993,5.985106,0.038481


In [7]:
import pandas as pd
import numpy as np
import joblib
import statsmodels.api as sm
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.model_selection import cross_val_score
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

def very_fast_backward_feature_selection(data, target, n_features_to_select, models_dict, n_jobs):
    """Run backward sequential feature selection once per model.

    For every entry of ``models_dict`` a backward, non-floating
    ``SequentialFeatureSelector`` (accuracy scoring, 2-fold CV) picks
    ``n_features_to_select`` features. An OLS regression of the target on the
    selected features (plus a constant) is then fitted with statsmodels and
    its summary is printed.

    Args:
        data: DataFrame containing the features and the target column.
        target: Name of the target column.
        n_features_to_select: Passed to the selector as ``k_features``.
        models_dict: Mapping of model name -> estimator.
        n_jobs: Number of parallel jobs for the selector.

    Returns:
        dict: ``{model_name: {'selected_features': [...], 'model_summary':
        coefficient table}}``.
    """
    features = data.drop(columns=[target])
    labels = data[target]

    def _select_and_summarise(model_name, model):
        # Backward elimination driven by cross-validated accuracy.
        selector = SFS(
            model,
            k_features=n_features_to_select,
            forward=False,
            floating=False,
            scoring='accuracy',
            cv=2,
            n_jobs=n_jobs,
        ).fit(features, labels)
        selected_features = list(selector.k_feature_names_)

        # statsmodels fit, used only for the coefficient / p-value table.
        design = sm.add_constant(features[selected_features])
        sm_model = sm.OLS(labels, design).fit()
        summary = sm_model.summary2().tables[1]

        print(f"Model: {model_name}")
        print(sm_model.summary())

        return {
            'selected_features': selected_features,
            'model_summary': summary,
        }

    return {
        model_name: _select_and_summarise(model_name, model)
        for model_name, model in models_dict.items()
    }


def optimize_models(models_dict, X, y, n_trials=20, n_jobs=1):
    """Search hyperparameters for every model in ``models_dict`` with Optuna.

    One study (``direction='maximize'``) is created per model and run for
    ``n_trials`` trials.  A trial's value is the mean accuracy returned by
    3-fold ``cross_val_score`` on ``(X, y)``.  Model names without an entry in
    the search-space table below get no suggested parameters and are simply
    scored with their current configuration.

    Every trial works on a ``clone`` of the registered estimator, so the
    objects in ``models_dict`` are never mutated here.  That keeps concurrent
    trials (``n_jobs > 1``) from overwriting each other's parameters on a
    shared instance, and stops the caller's models from being left with
    whatever the last trial happened to try.

    Args:
        models_dict: Mapping of model name -> estimator supporting
            ``set_params`` and ``sklearn.base.clone``.
        X: Feature matrix passed to ``cross_val_score``.
        y: Target vector passed to ``cross_val_score``.
        n_trials: Number of Optuna trials per model.
        n_jobs: Forwarded to ``study.optimize``.

    Returns:
        tuple: ``(best_models, best_scores)`` where ``best_models`` maps model
        name -> ``study.best_params`` and ``best_scores`` maps model name ->
        ``study.best_value``.
    """

    def _forest_space(trial):
        return {
            'n_estimators': trial.suggest_int('n_estimators', 1, 10),
            'max_depth': trial.suggest_int('max_depth', 1, 4),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 4),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
        }

    def _boosting_space(trial):
        # learning_rate is not searched; the estimator keeps its configured value.
        return {
            'n_estimators': trial.suggest_int('n_estimators', 1, 10),
            'max_depth': trial.suggest_int('max_depth', 1, 4),
        }

    def _knn_space(trial):
        return {
            'n_neighbors': trial.suggest_int('n_neighbors', 1, 10),
            'leaf_size': trial.suggest_int('leaf_size', 10, 30),
            'p': trial.suggest_int('p', 1, 2),
        }

    def _svc_space(trial):
        # C is not searched; the estimator keeps its configured value.
        return {
            'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf']),
            'gamma': trial.suggest_categorical('gamma', ['scale', 'auto']),
        }

    def _tree_space(trial):
        return {
            'max_depth': trial.suggest_int('max_depth', 1, 3),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 4),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
        }

    # Model name -> function that draws that model's hyperparameters from a trial.
    search_spaces = {
        'RandomForestClassifier': _forest_space,
        'GradientBoostingClassifier': _boosting_space,
        'XGBClassifier': _boosting_space,
        'LGBMClassifier': _boosting_space,
        'KNeighborsClassifier': _knn_space,
        'SupportVectorClassifier': _svc_space,
        'DecisionTreeClassifier': _tree_space,
    }

    def objective(trial, model_name):
        # Work on a private copy so trials never share mutable estimator state.
        candidate = clone(models_dict[model_name])
        draw_params = search_spaces.get(model_name)
        if draw_params is not None:
            candidate.set_params(**draw_params(trial))
        scores = cross_val_score(candidate, X, y, cv=3, scoring='accuracy')
        return scores.mean()

    best_models = {}
    best_scores = {}

    for model_name in models_dict:
        print(f"Optimizing {model_name}...")
        study = optuna.create_study(direction='maximize')
        study.optimize(
            # Bind the current name eagerly so the callback cannot see a later one.
            lambda trial, name=model_name: objective(trial, name),
            n_trials=n_trials,
            n_jobs=n_jobs,
        )

        best_models[model_name] = study.best_params
        best_scores[model_name] = study.best_value

        print(f"Best hyperparameters for {model_name}: {study.best_params}")
        print(f"Best score for {model_name}: {study.best_value}\n")

    return best_models, best_scores


def update_model_params(models_dict, best_params):
    """Apply tuned hyperparameters to the matching estimators in place.

    Args:
        models_dict: Mapping of model name -> estimator exposing ``set_params``.
        best_params: Mapping of model name -> dict of keyword parameters.
            Every name must be a key of ``models_dict``.

    Returns:
        The same ``models_dict`` object, with the listed estimators updated.
    """
    for name in best_params:
        models_dict[name].set_params(**best_params[name])
    return models_dict


def fit_models(models_dict, X, y, save_path='/kaggle/working/'):
    """Fit every model on ``(X, y)`` and persist each one with joblib.

    Each estimator is fitted in place and dumped to
    ``<save_path>/<model_name>.joblib``.  The path is built with
    ``os.path.join`` so ``save_path`` works with or without a trailing
    separator, and the directory is created when it does not exist yet.

    Args:
        models_dict: Mapping of model name -> estimator exposing ``fit``.
        X: Training features.
        y: Training target.
        save_path: Directory that receives the ``.joblib`` files.  An empty
            string writes to the current working directory.

    Returns:
        dict: model name -> the fitted estimator (the same objects that are
        held in ``models_dict``).
    """
    if save_path:
        # joblib.dump does not create parent directories on its own.
        os.makedirs(save_path, exist_ok=True)

    fitted_models = {}
    for model_name, model in models_dict.items():
        print(f"Fitting {model_name}...")
        model.fit(X, y)

        model_filename = os.path.join(save_path, f"{model_name}.joblib")
        joblib.dump(model, model_filename)

        fitted_models[model_name] = model

    return fitted_models

In [8]:
# Estimators handed to the backward feature-selection step.
FS_models_dict = {
    'LogisticRegression': LogisticRegression(
        penalty='l2',
        solver='liblinear',
        max_iter=1000,
    ),
    'SupportVectorClassifier': SVC(gamma='auto', probability=True),
}

# Estimators that are tuned, fitted, saved and later used for prediction.
# The keys double as file names for the saved models and as the names the
# hyperparameter search matches on, so they must stay exactly as written.
final_models_dict = {
    'LogisticRegression': LogisticRegression(
        penalty='l2',
        solver='liblinear',
        max_iter=1000,
    ),
    'RandomForestClassifier': RandomForestClassifier(),
    'SupportVectorClassifier': SVC(C=1, probability=True),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'NaiveBayes': GaussianNB(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.01),
    'XGBClassifier': XGBClassifier(
        learning_rate=0.01,
        eval_metric='logloss',
        use_label_encoder=False,
    ),
    'LGBMClassifier': LGBMClassifier(),
}

In [9]:
# Backward feature selection for each model in FS_models_dict, down to a single feature.
# NOTE(review): `results` is not read again in the cells visible here; the models
# below are tuned and fitted on every column except the target.
results = very_fast_backward_feature_selection(train_df, 'Survived', n_features_to_select=1, models_dict=FS_models_dict, n_jobs=1)
# Feature matrix (all columns but 'Survived') and target used for tuning and fitting.
X = train_df.drop(columns=['Survived'])
y = train_df['Survived']
best_models, best_scores = optimize_models(final_models_dict, X, y, n_trials=1, n_jobs=1)  # one trial per model, run sequentially (n_jobs=1)
print("Best models:", best_models)
print("Best scores:", best_scores)
# update_model_params mutates final_models_dict in place and returns that same dict.
to_fitModels = update_model_params(final_models_dict, best_models)
# Fits each estimator and writes <name>.joblib files under save_path.
fitted_models = fit_models(to_fitModels, X, y, save_path='/kaggle/working/')
print("Models fitted and saved.")

In [10]:
import joblib
import pandas as pd

def predict_with_models(models_dict, X_new, save_path='/kaggle/working/'):
    """Predict with every saved model named in ``models_dict``.

    Only the keys of ``models_dict`` are used: for each one the file
    ``<save_path>/<key>.joblib`` is loaded and its ``predict(X_new)`` output
    becomes a column of the result.  The path is built with ``os.path.join``
    so ``save_path`` works with or without a trailing separator.

    This stays best-effort: a model that cannot be loaded or fails while
    predicting is skipped, but the failure is printed instead of being
    swallowed silently, so a missing column in the result can be explained.

    Args:
        models_dict: Mapping whose keys are the saved model names.
        X_new: Feature matrix passed to each model's ``predict``.
        save_path: Directory containing the ``.joblib`` files.

    Returns:
        pandas.DataFrame: one column per model that predicted successfully.
    """
    predictions = {}

    for model_name in models_dict:
        model_filename = os.path.join(save_path, f"{model_name}.joblib")
        try:
            # joblib.load unpickles arbitrary objects; only load files from a trusted save_path.
            model = joblib.load(model_filename)
            predictions[model_name] = model.predict(X_new)
        except Exception as exc:  # best-effort: report and continue with the other models
            print(f"Skipping {model_name}: {type(exc).__name__}: {exc}")

    return pd.DataFrame(predictions)
# Load the saved model for each key of final_models_dict and predict on updated_df_test.
predictions_df = predict_with_models(final_models_dict, updated_df_test, save_path='/kaggle/working/')
# Show up to the first 418 rows.
# NOTE(review): 418 is presumably the number of rows in the test set — confirm.
predictions_df.head(418)

In [12]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# (name, estimator) pairs for the voting ensemble, with fixed hyperparameters.
estimators1 = [
    (
        'LogisticRegression',
        LogisticRegression(penalty='l2', solver='liblinear', max_iter=1000),
    ),
    (
        'RandomForestClassifier',
        RandomForestClassifier(
            n_estimators=2,
            max_depth=3,
            min_samples_split=4,
            min_samples_leaf=3,
        ),
    ),
    (
        'SupportVectorClassifier',
        SVC(C=1, kernel='linear', gamma='scale', probability=True),
    ),
    (
        'KNeighborsClassifier',
        KNeighborsClassifier(n_neighbors=7, leaf_size=23, p=2),
    ),
    ('NaiveBayes', GaussianNB()),
    (
        'DecisionTreeClassifier',
        DecisionTreeClassifier(max_depth=1, min_samples_split=2, min_samples_leaf=3),
    ),
    (
        'GradientBoostingClassifier',
        GradientBoostingClassifier(n_estimators=5, max_depth=2, learning_rate=0.01),
    ),
    (
        'XGBClassifier',
        XGBClassifier(
            n_estimators=7,
            max_depth=3,
            learning_rate=0.01,
            eval_metric='logloss',
            use_label_encoder=False,
        ),
    ),
    ('LGBMClassifier', LGBMClassifier(n_estimators=2, max_depth=4)),
]

# Training features / target: every column except 'Survived' vs. 'Survived' itself.
X = train_df.drop(columns=['Survived'])
y = train_df['Survived']

# Majority-vote ensemble over the estimators above.
VotingClass = VotingClassifier(estimators=estimators1, voting='hard')
VotingClass.fit(X, y)

VotingClassifier(estimators=[('LogisticRegression',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('RandomForestClassifier',
                              RandomForestClassifier(bootstra...
                                             importance_type='split',
                                             learning_rate=0.1, max_depth=4,
 

In [27]:
# Hard-voting ensemble predictions for updated_df_test.
predictions = VotingClass.predict(updated_df_test)
# Two-column frame: PassengerId taken from test_data, Survived from the ensemble.
# NOTE(review): assumes test_data rows line up one-to-one with updated_df_test — confirm.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
# Written to the current working directory, without the index column.
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
