In [None]:
pip install Levenshtein

In [None]:
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

def get_csv_column_names(folder_path, num_workers=4):
    """Collect the header row of every ``.csv`` file found under ``folder_path``.

    The folder is walked recursively and each file's header is read on a
    thread pool (only the header: ``nrows=0``).

    Args:
        folder_path: Root directory to search.
        num_workers: Maximum number of worker threads.

    Returns:
        dict mapping each file's base name to its list of column names. A file
        that cannot be read is reported on stdout and mapped to ``[]``.
        Because keys are base names, files sharing a name in different
        sub-folders occupy the same key.
    """
    def _header_of(csv_path):
        # Never raises: a failed read degrades to an empty column list.
        try:
            header = pd.read_csv(csv_path, nrows=0).columns.tolist()
        except Exception as e:
            print(f"Error reading {csv_path}: {e}")
            header = []
        return csv_path, header

    csv_paths = [
        os.path.join(dirpath, fname)
        for dirpath, _, fnames in os.walk(folder_path)
        for fname in fnames
        if fname.endswith('.csv')
    ]

    headers_by_name = {}
    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        pending = [pool.submit(_header_of, path) for path in csv_paths]
        for finished in as_completed(pending):
            csv_path, header = finished.result()
            headers_by_name[os.path.basename(csv_path)] = header

    return headers_by_name

# Example usage
folder_path = 'path/to/your/folder'
num_workers = 4
column_names_dict = get_csv_column_names(folder_path, num_workers)
print(column_names_dict)




In [None]:
column_names_dict

In [None]:
import pandas as pd
import numpy as np

from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score

import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score

In [None]:
data = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
# Stack the train rows on top of the test rows (fresh 0..n-1 index) so the same
# imputation and feature engineering is applied to both in one pass.
data = pd.concat([data, test], ignore_index=True)
data

In [None]:
# Keep only the first four characters of PassengerId (the original cell's
# comments call this the passenger id, e.g. "0013"); the trailing two-character
# passenger number is not needed once rows are counted per prefix below.
data['PassengerId'] = data['PassengerId'].apply(lambda x: x[:4])          # 0013 passenger id

# Missing spend is treated as "spent nothing".
data[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] = data[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].fillna(0)
data['ExpenseInShip'] = data['RoomService'] + data['FoodCourt'] + data['ShoppingMall'] + data['Spa'] + data['VRDeck']

# Number of rows sharing the same PassengerId prefix. `transform` broadcasts the
# per-group count back onto every row, which replaces the previous
# count -> merge -> suffix-column -> drop round-trip with the same result.
data['SibSp'] = data.groupby('PassengerId')['PassengerId'].transform('count')


def _second_name_token(name):
    """Return the second whitespace-separated token of ``name``.

    Non-string values (missing names) are passed through unchanged; a name made
    of a single token yields NaN instead of raising IndexError.
    """
    if not isinstance(name, str):
        return name
    tokens = name.split()
    return tokens[1] if len(tokens) > 1 else np.nan


data['surname'] = data['Name'].apply(_second_name_token)

# Split Cabin on "/" into a list of parts; missing cabins stay missing.
data['Cabin'] = data['Cabin'].apply(lambda x: str(x).split('/') if pd.notna(x) and x != -1 else x)
for _position, _column in enumerate(['deck', 'num', 'side']):
    # Bind the position as a default argument so each lambda keeps its own value.
    data[_column] = data['Cabin'].apply(
        lambda parts, position=_position: parts[position] if isinstance(parts, list) and len(parts) > position else np.nan
    )

In [None]:
homeplanet_counts = data.HomePlanet.value_counts()
# Labels come from the counts' own index so every wedge is named after the
# planet it actually measures, whatever order value_counts() returns.
plt.pie(homeplanet_counts, labels=homeplanet_counts.index)
plt.title("HomePlanet")
plt.show()

In [None]:
data[data.VIP == True].HomePlanet.value_counts()

In [None]:
data[(data.VIP == True) & (data.HomePlanet == "Mars")].CryoSleep.value_counts()

In [None]:
# function for fillng missing values
def fill_homeplanet(row):
    """Choose a HomePlanet for one row, guessing only when it is missing.

    Rules, in order:
      * a recorded HomePlanet is returned unchanged;
      * if VIP is unknown the (missing) HomePlanet is returned as-is;
      * a VIP whose CryoSleep is recorded as False -> 'Mars';
      * any other VIP -> 'Europa', a non-VIP -> 'Earth'.
    """
    planet = row['HomePlanet']
    if not pd.isna(planet):
        return planet

    vip = row['VIP']
    if pd.isna(vip):
        # Nothing to base a guess on; leave the value missing.
        return planet

    cryo = row['CryoSleep']
    if vip and pd.notna(cryo) and cryo == False:
        return 'Mars'
    return 'Europa' if vip else 'Earth'

data['HomePlanet'] = data.apply(fill_homeplanet, axis=1)

In [None]:
data.HomePlanet.isnull().sum()

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# attribute-accessed Series: under pandas Copy-on-Write the chained in-place
# form operates on a temporary and leaves `data` unchanged.
data['HomePlanet'] = data['HomePlanet'].fillna("Earth")

In [None]:
data[(data.ExpenseInShip == 0.0) & (data.CryoSleep == False) & (data.Age <= 18)].VIP.value_counts()

In [None]:
data[(data.ExpenseInShip == 0.0) & (data.Age > 18)].CryoSleep.value_counts()

In [None]:
def fill_cryosleep(row):
    """Choose a CryoSleep value for one row, guessing only when it is missing.

    A recorded value is returned unchanged. Otherwise the guess is True only
    for a passenger with zero on-board expense, except that a passenger aged
    18 or under who is recorded as non-VIP is guessed False.
    """
    recorded = row['CryoSleep']
    if not pd.isna(recorded):
        return recorded

    spent_nothing = row['ExpenseInShip'] == 0
    if not spent_nothing:
        return False
    if row['Age'] <= 18 and row['VIP'] == False:
        return False
    return True
data['CryoSleep'] = data.apply(fill_cryosleep, axis=1)

In [None]:
data.Cabin.isnull().sum()

In [None]:
data[data.PassengerId == '0992']
# see same passenger_id travelling in the same Cabin.

In [None]:
# taking mode of Cabin, where same passenger id 
def get_mode(data, passId):
    """Return the most frequent Cabin among rows whose PassengerId equals ``passId``.

    Returns NaN when those rows have no mode (e.g. every Cabin is missing).
    When several values tie, the first entry of ``Series.mode()`` is used.
    """
    cabin_modes = data[data.PassengerId == passId].Cabin.mode()
    return cabin_modes[0] if len(cabin_modes) > 0 else np.nan

# Rows whose Cabin is missing, and the PassengerId prefix of each of them.
subset = data[data.Cabin.isnull()]

msID = subset.PassengerId
# Candidate cabin for every such row: the mode over rows with the same prefix.
filled = [get_mode(data, passenger_group) for passenger_group in msID]

my_d = pd.DataFrame()
my_d['PassengerId'] = msID
my_d['filled_cabin'] = filled

# NOTE: `my_d` holds one row per missing-cabin passenger, so a prefix with
# several missing cabins appears several times and the left merge repeats every
# row of that prefix. The repeats are removed by drop_duplicates() below.
merged_df = data.merge(my_d, on='PassengerId', how='left')
# Assign back rather than fillna(inplace=True) on a selected column: the
# chained in-place form does not update the frame under pandas Copy-on-Write.
merged_df['filled_cabin'] = merged_df['filled_cabin'].fillna(merged_df['Cabin'])
merged_df = merged_df.rename(columns={'filled_cabin': 'Cabin1'})

# Cabin values are lists at this point; stringify them so drop_duplicates()
# can hash the rows.
merged_df['Cabin'] = merged_df['Cabin'].astype(str)
merged_df['Cabin1'] = merged_df['Cabin1'].astype(str)

# Renumber after dropping the repeated rows: a later cell splits train/test by
# index label, which only works on a gap-free 0..n-1 index.
# NOTE(review): drop_duplicates() would also collapse two genuinely distinct
# passengers whose rows are identical in every column - confirm that cannot occur.
merged_df = merged_df.drop_duplicates().reset_index(drop=True)

# The back-filled column replaces the original Cabin column.
# (The earlier in-place drop/rename on the old `data` object was removed: that
# object was discarded by this reassignment, so those statements had no effect.)
data = merged_df.drop(columns='Cabin').rename(columns={'Cabin1': 'Cabin'})

In [None]:
import ast
def convert_to_list(s):
    """Parse a stringified Python literal back into an object.

    The exact text ``'nan'`` (what ``astype(str)`` produces for a missing value)
    maps to ``None``; any other text is evaluated with ``ast.literal_eval``,
    which accepts literals only and raises on anything else.
    """
    return None if s == 'nan' else ast.literal_eval(s)


# Apply the conversion function to the Series
data.Cabin = data.Cabin.apply(lambda x: convert_to_list(x))

In [None]:
data['deck'] = data.Cabin.apply(lambda x: x[0] if isinstance(x, list) else np.nan)
data['num'] = data.Cabin.apply(lambda x: x[1] if isinstance(x, list) else np.nan)
data['side'] = data.Cabin.apply(lambda x: x[2] if isinstance(x, list) else np.nan)

In [None]:
data

In [None]:
# Every fill below assigns the result back to the column. The previous
# `data.col.fillna(..., inplace=True)` form acts on a temporary Series under
# pandas Copy-on-Write and would leave `data` unchanged.
data['deck'] = data['deck'].fillna(data['deck'].mode()[0])
subset_data = data[data['deck'] == 'F']
# knn-imputation of the cabin number, restricted to deck 'F'.
# NOTE(review): the imputer only sees the single `num` column, so there are no
# other features to measure neighbour distance with - confirm this yields more
# than a constant fill. Missing `num` on any other deck is left untouched.
X = subset_data['num'].values.reshape(-1, 1)
knn_imputer = KNNImputer(n_neighbors=4)
imputed_values = knn_imputer.fit_transform(X)
data.loc[data['deck'] == 'F', 'num'] = imputed_values
data['side'] = data['side'].fillna(data['side'].mode()[0])
data.drop(columns = ['Cabin'], inplace = True)
# NOTE(review): only the FIRST row with a missing Destination is inspected; the
# most common Destination among rows with that row's surname is then written to
# EVERY missing Destination. This raises IndexError when no Destination is
# missing, or when that surname has no known Destination. Confirm intent.
missing_destination_surname = data[data.Destination.isnull()]['surname'].values[0]
subset_data = data[data['surname'] == missing_destination_surname]
mode_destination = subset_data['Destination'].mode().values[0]
data.loc[data['Destination'].isnull(), 'Destination'] = mode_destination
data.drop(columns = ['Name', 'surname','PassengerId'], inplace=True)
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['VIP'] = data['VIP'].fillna(data['VIP'].mode()[0])

In [None]:
data['CryoSleep'] = data['CryoSleep'].astype(object)
data['VIP'] = data['VIP'].astype(object)
data['SibSp'] = data['SibSp'].astype(object)
# data['Transported'] = data['Transported'].astype(int)

In [None]:
data['num'] = data['num'].astype(int)

In [None]:
# Split the combined frame back into train and test by whether the target is
# present, instead of slicing index labels at a hard-coded row count. The label
# slice silently misassigns rows as soon as the index has gaps (for example
# after a drop_duplicates()).
# Assumes every training row has a non-null `Transported` and every test row a
# null one (the test CSV was concatenated without that column) - TODO confirm.
data = data.reset_index(drop=True)
is_train_row = data['Transported'].notna()
# .copy() so the dtype conversion below writes to an independent frame rather
# than to a slice of `data`.
train_data = data.loc[is_train_row].copy()
train_data['Transported'] = train_data['Transported'].astype(int)
test_data = data.loc[~is_train_row].drop(columns = 'Transported')

In [None]:
train_data

In [None]:
import numpy as np 
import pandas as pd 
import os
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import re
from itertools import combinations
import Levenshtein
from collections import defaultdict
from scipy.interpolate import LSQUnivariateSpline
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score


class generic_Utilities:
    '''Generic Utilities on Lists and Dataframes'''
    def __init__(self):
        self.allFunctions = {}
        self.allFunctions['List_Operartion_1'] = str("Function: intersection_of_lists(list1, list2), Returns : list3")
        self.allFunctions['List_Operartion_2'] = str("Function: difference_of_lists(list1, list2), Returns : list4")
        
        self.allFunctions['Folder_Operartion_1'] = str("Function: get_csv_column_names(folder_path, num_workers=4), Returns : column_names_dict")
        self.allFunctions['Folder_Operartion_2'] = str("Function: walk_through_folder(folder_path), Returns : None") 
        
        self.allFunctions['Plot_Operartion_1'] = str("Function: plot_NumericVscumSum(x, y), Returns : None")
        self.allFunctions['Plot_Operartion_2'] = str("Function: logistic_regression_with_roc(X, y), Returns: test_roc_auc")
        self.allFunctions['Plot_Operartion_3'] = str("Function: cart_with_roc(X, y), Returns: test_roc_auc")
        
        self.allFunctions['Dictionary_Operartion_1'] = str("Function: filter_and_sort_subsets(subset_counts, threshold), Returns: sorted_subsets")
        
        self.allFunctions['Dataframe_Operartion_1'] = str("Function: get_numeric_and_non_numeric_columns(df), Returns: list4, list5")
        self.allFunctions['Dataframe_Operartion_2'] = str("Function: remove_single_unique_or_all_nans(df), Returns: df")
        self.allFunctions['Dataframe_Operartion_3'] = str("Function: columns_with_missing_values(df), Returns: list6")
        self.allFunctions['Dataframe_Operartion_4'] = str("Function: fill_col_with_median(df, colNames), Returns: df")
        self.allFunctions['Dataframe_Operartion_5'] = str("Function: columns_with_more_than_X_percent_unique(df, colNames, perc), Returns: list7")
        self.allFunctions['Dataframe_Operartion_6'] = str("Function: convert_and_create_factorizedColumns(df, colNames), Returns: df, mapDict")
        self.allFunctions['Dataframe_Operartion_7'] = str("Function: fillMissing_predictFactorizedColumns(df, usable_cols, colName), Returns: df, mapDict")
        self.allFunctions['Dataframe_Operartion_9'] = str("Function: oneHotEncoded(df, columns_to_oneHot), Returns: new_df")
        
        self.allFunctions['FeatureEngineering_Operartion_1'] = str("Function: train_subset_counts_oneHot(df, target, value, options= COUNT, WIG, empericalProb, n_jobs=1), Returns: subset_counts")
        self.allFunctions['FeatureEngineering_Operartion_2'] = str("Function: train_LeastSquareSpline_fit(df, target, variable, degree), Returns: breakpoints_original")
        self.allFunctions['FeatureEngineering_Operartion_3'] = str("Function: train_UnivariateSpline_fit(df, target, variable, threshold), Returns: breakpoints_original")
        self.allFunctions['FeatureEngineering_Operartion_4'] = str("Function: train_cart_bins_with_plot(df, variableCol, targetCol, max_n_bins, n_jobs=1), Returns: breakpoints_original")
        
        self.allFunctions['FeatureCreation_Operartion_1'] = str("Function: create_combinedFeatures_df(df, required_cols, wanted_subsets,  options = INT, FRACTION), Returns: new_df")
        self.allFunctions['FeatureCreation_Operartion_2'] = str("Function: create_one_hot_encode_ranges(df, colName, required_columns, breakpoints), Returns: new_df")
       
        self.loadedFunctions = {}
        return None
        

    def walk_through_folder(self, folder_path):
        self.loadedFunctions['Folder_Operartion_2'] = str("Function: walk_through_folder(folder_path), Returns : None") 
        for root, dirs, files in os.walk(folder_path):
            print(f"Current directory: {root}")
            print("Subdirectories:", dirs)
            print("Files:", files)
            print()

        
    def get_csv_column_names(self, folder_path, num_workers=4):
        self.loadedFunctions['Folder_Operartion_1'] = str("Function: get_csv_column_names(folder_path, num_workers=4), Returns : column_names_dict")
        column_names_dict = {}

        # Function to read column names from a CSV file
        def read_columns(file_path):
            try:
                df = pd.read_csv(file_path, nrows=0)
                return file_path, df.columns.tolist()
            except Exception as e:
                print(f"Error reading {file_path}: {e}")
                return file_path, []

        # Traverse the directory and get all CSV file paths
        csv_files = []
        for root, _, files in os.walk(folder_path):
            for file in files:
                if file.endswith('.csv'):
                    csv_files.append(os.path.join(root, file))

        # Use ThreadPoolExecutor for parallel processing
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            future_to_file = {executor.submit(read_columns, file): file for file in csv_files}

            for future in as_completed(future_to_file):
                file_path, columns = future.result()
                file_name = os.path.basename(file_path)
                column_names_dict[file_name] = columns

        return column_names_dict    
        
        
    def intersection_of_lists(self, list1, list2):
        self.loadedFunctions['List_Operartion_1'] = str("Function: intersection_of_lists(list1, list2), Returns : list3")
        return list(set(list1) & set(list2))


    def difference_of_lists(self, list1, list2):
        self.loadedFunctions['List_Operartion_2'] = str("Function: difference_of_lists(list1, list2), Returns : list4")
        return [item for item in list1 if item not in list2]
    
    
    def plot_NumericVscumSum(self, x, y):
        """Plot the running share of ``y`` (cumulative sum / total) against ``x``.

        A tiny constant is added to the total so an all-zero ``y`` does not
        divide by zero. Shows the figure and returns None.
        """
        self.loadedFunctions['Plot_Operartion_1'] = str("Function: plot_NumericVscumSum(x, y), Returns : None")
        total = np.sum(y) + 0.0000000000001
        cumulative_share = np.cumsum(y) / total
        plt.plot(x, cumulative_share, marker='o', linestyle='-', color='b')
        plt.xlabel("X-axis")
        plt.ylabel("Y-axis")
        plt.grid(True)
        plt.show()
    
    
    def get_numeric_and_non_numeric_columns(self, df):
        self.loadedFunctions['Dataframe_Operartion_1'] = str("Function: get_numeric_and_non_numeric_columns(df), Returns: list4, list5")
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
        print(f"Numeric columns: {numeric_cols}")
        print(f"Non-numeric columns: {non_numeric_cols}")
        return numeric_cols, non_numeric_cols


    def remove_single_unique_or_all_nans(self, df):
        self.loadedFunctions['Dataframe_Operartion_2'] = str("Function: remove_single_unique_or_all_nans(df), Returns: df")
        removed_columns = []
        for column in df.columns:
            if df[column].nunique() <= 1 or df[column].isna().all():
                removed_columns.append(column)
                df = df.drop(columns=[column])
        print(f"Removed columns due to all NaN or only 1 unique value: {removed_columns}")
        return df


    def columns_with_missing_values(self, df):
        self.loadedFunctions['Dataframe_Operartion_3'] = str("Function: columns_with_missing_values(df), Returns: list6")
        missing_cols = [col for col in df.columns if df[col].isna().any()]
        print(f"Missing data columns: {missing_cols}")
        return missing_cols


    def fill_col_with_median(self, df, colNames):
        self.loadedFunctions['Dataframe_Operartion_4'] = str("Function: fill_col_with_median(df, colNames), Returns: df")
        try:
            for col in colNames:
                median_value = df[col].median()
                df.fillna({col: median_value}, inplace=True)
                print("Done inputing missing numeric values with median for column :" + str(col))
            return df
        except:
            print("Columns that are not numeric might be included! Please Check. Returning original dataframe." )
            return df
            
        

    def columns_with_more_than_X_percent_unique(self, df, colNames, perc):
        self.loadedFunctions['Dataframe_Operartion_5'] = str("Function: columns_with_more_than_X_percent_unique(df, colNames, perc), Returns: list7")
        total_rows = len(df)
        threshold = total_rows * 0.01 * perc  
        cols_with_high_uniques = [col for col in colNames if df[col].nunique() > threshold]
        print(f"Columns with high uniques , >= {perc} %  of number of rows in the data: {cols_with_high_uniques}")
        return cols_with_high_uniques
    


    def convert_and_create_factorizedColumns(self, df, colNames):
        self.loadedFunctions['Dataframe_Operartion_6'] = str("Function: convert_and_create_factorizedColumns(df, colNames), Returns: df, mapDict")
        mapDict = {}
        try:
            for colName in colNames:
                df[colName] = df[colName].astype('object')
                df[colName], unique_values = pd.factorize(df[colName])
                # Create a mapping dictionary for the column
                mapDict[colName] = {value: i for i, value in enumerate(unique_values)}
            return df, mapDict
        except:
            print("Columns with missing values might be included! Please Check. Returning original dataframe and a empty dictionary" )
            return df, mapDict

        
    def fillMissing_predictFactorizedColumns(self, df, usable_cols, colName):
        self.loadedFunctions['Dataframe_Operartion_7'] = str("Function: fillMissing_predictFactorizedColumns(df, usable_cols, colName), Returns: df, mapDict")
        mapDict = {}
        try:
            df[colName] = df[colName].astype('object')
            df[colName], unique_values = pd.factorize(df[colName])
            mapDict[colName] = {value: i for i, value in enumerate(unique_values)}
            # Train the model to predict missing values
            non_missing_idx = df[colName] != -1  # Using -1 for factorized NaNs
            missing_idx = df[colName] == -1
            if missing_idx.sum() > 0:
                X_train = df.loc[non_missing_idx, usable_cols]
                y_train = df.loc[non_missing_idx, colName]
                print(y_train)
                X_test = df.loc[missing_idx,  usable_cols]
                model = LogisticRegression(max_iter=1000, solver ='lbfgs',  multi_class='auto')
                model.fit(X_train, y_train)
                # Predict the missing values
                predicted = model.predict(X_test)
                print(predicted)
                # Replace the missing values with the predicted values
                df.loc[missing_idx, colName] = predicted
            return df, mapDict
        except:
            print("Columns with non-numeric values might be included! Please Check. Returning original dataframe and a empty dictionary" )
            return df, mapDict
    
    
    def apply_meanDistance(self, df, colName, string_list):
        """Add two similarity features comparing ``df[colName]`` with a reference list.

        For every value in the column, computes against each string of
        ``string_list``:
          * the Levenshtein edit distance, and
          * the Sorensen-Dice coefficient over sets of character bigrams,
        and stores the two averages in the new columns ``mean_Levenshtein`` and
        ``mean_sorensen_dice``. ``df`` is modified in place and returned.
        """
        self.loadedFunctions['Dataframe_Operartion_8'] = str("Function: apply_meanDistance(df, colName, string_list), Returns: df")

        def bigram_set(text):
            # Distinct pairs of adjacent characters.
            return {text[pos:pos + 2] for pos in range(len(text) - 1)}

        def dice_coefficient(left, right):
            left_grams = bigram_set(left)
            right_grams = bigram_set(right)
            denominator = len(left_grams) + len(right_grams)
            if denominator == 0:
                # Neither string has a bigram: identical strings count as a full match.
                return 1.0 if left == right else 0.0
            return 2 * len(left_grams & right_grams) / denominator

        def mean_distances(candidate):
            edit_total = 0
            dice_total = 0
            for reference in string_list:
                edit_total = edit_total + Levenshtein.distance(candidate, reference)
                dice_total = dice_total + dice_coefficient(candidate, reference)
            return float(edit_total / len(string_list)), float(dice_total / len(string_list))

        # One (mean edit distance, mean Dice) pair per row, spread over two columns.
        df[['mean_Levenshtein', 'mean_sorensen_dice']] = df[colName].apply(
            lambda value: pd.Series(mean_distances(value))
        )
        return df
    
    
    def oneHotEncoded(self, df, columns_to_oneHot):
        self.loadedFunctions['Dataframe_Operartion_9'] = str("Function: oneHotEncoded(df, columns_to_oneHot), Returns: new_df")
        # Perform one-hot encoding on specified columns
        df_encoded = pd.get_dummies(df, columns=columns_to_oneHot, drop_first=True, dtype=int)
        return df_encoded

    
    def filter_and_sort_subsets(self, subset_counts, threshold):
        self.loadedFunctions['Dictionary_Operartion_1'] = str("Function: filter_and_sort_subsets(subset_counts, threshold), Returns: sorted_subsets")
        # Filter subsets based on the given threshold
        filtered_subsets = {subset: count for subset, count in subset_counts.items() if count > threshold}
        # Sort the filtered subsets based on their counts in descending order
        sorted_subsets = sorted(filtered_subsets.items(), key=lambda item: item[1], reverse=True)
        return sorted_subsets
    
    
    def create_combinedFeatures_df(self, df, required_cols, wanted_subsets, options):
        self.loadedFunctions['FeatureCreation_Operartion_1'] = str("Function: create_combinedFeatures_df(df, required_cols, wanted_subsets,  options = INT, FRACTION), Returns: new_df")
        orig_df = df.copy()
        for subset, _ in wanted_subsets:
            new_col_name = "_".join(subset) + "_combined"
            if(options=='FRACTION'):
                orig_df[new_col_name] = df[list(subset)].mean(axis=1)
            else:
                orig_df[new_col_name] = df[list(subset)].min(axis=1)

        # Return a dataframe with the id, target, and new fraction columns
        new_combined_columns = [("_".join(subset) + "_combined") for subset, _ in wanted_subsets]
        selected_columns = required_cols + new_combined_columns
        return orig_df[selected_columns]
    
    
    def create_one_hot_encode_ranges(self, df, colName, required_columns, breakpoints):
        self.loadedFunctions['FeatureCreation_Operartion_2'] = str("Function: create_one_hot_encode_ranges(df, colName, required_columns, breakpoints), Returns: new_df")
        # Ensure breakpoints are sorted
        breakpoints = sorted(breakpoints)

        # Create a new DataFrame with the required columns
        new_df = df[required_columns].copy()

        # Create one-hot encoded columns based on breakpoints
        for i in range(len(breakpoints) - 1):
            lower_bound = breakpoints[i]
            upper_bound = breakpoints[i + 1]
            col_name = f"{colName}_{lower_bound:.2f}to{upper_bound:.2f}"
            new_df[col_name] = np.where((df[colName] > lower_bound) & (df[colName] <= upper_bound), 1, 0)

        return new_df
    
    
    def train_subset_counts_oneHot(self, df, target, value, options, n_jobs=1):
        self.loadedFunctions['FeatureEngineering_Operartion_1'] = str("Function: train_subset_counts_oneHot(df, target, value, options= COUNT, WIG, empericalProb , n_jobs=1), Returns: subset_counts")
        # Filter rows where the target equals value
        df_target_value = df[df[target] == value].copy()
        valSum = df_target_value[target].sum()

        # Drop target column to get only the one-hot encoded columns
        one_hot_columns = df_target_value.drop(columns=[target]).columns

        # Dictionary to store subsets and their counts
        subset_counts = defaultdict(int)

        # Helper function to check if a subset has more than two one-hot columns from the same original column
        def valid_subset(subset):
            original_cols = [col.split('_')[0] for col in subset]
            return all(original_cols.count(col) <= 1 for col in original_cols)

        # Function to calculate counts for a subset
        def calculate_count(subset):
            subset_df = df_target_value[list(subset)]
            count = (subset_df.sum(axis=1) == len(subset)).sum()

            if options == 'COUNT':
                return subset, count
            elif options == 'WIG':
                wig_value = (1 / (len(subset) + 1)) * (count / len(df_target_value)) - (1 / 2 ** len(subset))
                return subset, wig_value
            elif options == 'empericalProb':
                emp_prob = count / valSum
                return subset, emp_prob
            else:
                return subset, count

        all_combinations = []
        for r in range(1, len(one_hot_columns) + 1):
            for subset in combinations(one_hot_columns, r):
                if valid_subset(subset):
                    all_combinations.append(subset)

        with ThreadPoolExecutor(max_workers=n_jobs) as executor:
            futures = [executor.submit(calculate_count, subset) for subset in all_combinations]
            for future in as_completed(futures):
                subset, count = future.result()
                subset_counts[subset] = count

        return dict(subset_counts)
    
    
    def train_LeastSquareSpline_fit(self, df, target, variable, degree):
        self.loadedFunctions['FeatureEngineering_Operartion_2'] = str("Function: train_LeastSquareSpline_fit(df, target, variable, degree), Returns: breakpoints_original")
        # 1. Sort the dataframe based on the variable column.
        df_sorted = df.sort_values(by=variable)

        # 2. Convert all values in the sorted variable column to values between [0,1].
        df_sorted['TranformedVariable'] = (df_sorted[variable] - df_sorted[variable].min()) / (df_sorted[variable].max() - df_sorted[variable].min())

        # 3. Convert the target column to Cumulative Sum divided By Total Sum so that it is also between [0,1].
        df_sorted[target] = df_sorted[target].cumsum() / df_sorted[target].sum()

        # 4. Fit the best linear spline on the modified target based on the modified variable.
        # Define knot points (as degree + 1 points excluding the endpoints)
        num_knots = degree 
        knots = np.linspace(0, 1, num_knots + 2)[1:-1]  # exclude 0 and 1 as knots

        spline = LSQUnivariateSpline(df_sorted['TranformedVariable'], df_sorted[target], t=knots, k=degree)

        # 5. Return a list which contains all the break points of the fitted spline based on the original variable column
        breakpoints = spline.get_knots()
        breakpoints_original = df_sorted[variable].min() + breakpoints * (df_sorted[variable].max() - df_sorted[variable].min())
        return breakpoints_original
    
    
    def train_UnivariateSpline_fit(self, df, target, variable, threshold):
        self.loadedFunctions['FeatureEngineering_Operartion_3'] = str("Function: train_UnivariateSpline_fit(df, target, variable, threshold), Returns: breakpoints_original")

        # 1. Sort the dataframe based on the variable column.
        df_sorted = df.sort_values(by=variable)

        # 2. Convert all values in the sorted variable column to values between [0,1].
        df_sorted['TranformedVariable'] = (df_sorted[variable] - df_sorted[variable].min()) / (df_sorted[variable].max() - df_sorted[variable].min())

        # 3. Convert the target column to Cumulative Sum divided By Total Sum so that it is also between [0,1].
        df_sorted[target] = df_sorted[target].cumsum() / df_sorted[target].sum()

        spline = UnivariateSpline(df_sorted['TranformedVariable'], df_sorted[target], s=threshold)

        # 5. Return a list which contains all the break points of the fitted spline based on the original variable column
        breakpoints = spline.get_knots()
        breakpoints_original = df_sorted[variable].min() + breakpoints * (df_sorted[variable].max() - df_sorted[variable].min())

        return breakpoints_original
    
    
    def train_cart_bins_with_plot(self, df, variableCol, targetCol, max_n_bins, n_jobs=1):
        """Derive bin edges for one variable from single-feature decision trees, and plot the fit.

        One ``DecisionTreeClassifier`` is fitted per leaf budget, from
        ``max_n_bins`` down to 2, using ``variableCol`` as the only feature. Each
        tree is scored by ROC AUC on the same data it was trained on. The split
        thresholds of the selected tree, framed by the column's minimum and
        maximum, are returned as bin edges.

        Args:
            df: DataFrame with the feature and the target.
            variableCol: Name of the feature column.
            targetCol: Name of the target column.
            max_n_bins: Largest ``max_leaf_nodes`` value to try.
            n_jobs: Number of worker threads used to fit the trees.

        Returns:
            List of bin edges (empty if no model was selected).
        """
        self.loadedFunctions['FeatureEngineering_Operartion_4'] = str("Function: train_cart_bins_with_plot(df, variableCol, targetCol, max_n_bins, n_jobs=1), Returns: breakpoints_original")
        best_auc = 0
        best_bins = []
        best_model = None

        def fit_cart_model(leaf_nodes):
            # Fit and score on the full frame (training AUC, no hold-out).
            cart_model = DecisionTreeClassifier(max_leaf_nodes=leaf_nodes, random_state=42)
            cart_model.fit(df[[variableCol]], df[targetCol])
            predictions = cart_model.predict_proba(df[[variableCol]])[:, 1]
            auc = roc_auc_score(df[targetCol], predictions)
            thresholds = cart_model.tree_.threshold
            # Entries equal to -2 are dropped; presumably these are the
            # placeholder thresholds of leaf nodes - verify against scikit-learn.
            thresholds = thresholds[thresholds != -2]  # Remove dummy thresholds
            bins = sorted(thresholds)
            bins = [df[variableCol].min()] + bins + [df[variableCol].max()]
            return auc, cart_model, bins

        with ThreadPoolExecutor(max_workers=n_jobs) as executor:
            futures = {executor.submit(fit_cart_model, leaf_nodes): leaf_nodes for leaf_nodes in range(max_n_bins, 1, -1)}

            # Results are consumed in submission order (largest leaf budget first).
            # NOTE(review): the loop stops at the first model that does not beat
            # the best AUC so far. Since the largest tree is evaluated first, the
            # search may end after the second model - confirm this is intended.
            # All submitted fits still run to completion when the `with` exits.
            for future in futures:
                try:
                    auc, cart_model, bins = future.result()
                    if auc > best_auc:
                        best_auc = auc
                        best_model = cart_model
                        best_bins = bins
                    else:
                        break
                except Exception as e:
                    print(f"An error occurred with leaf_nodes {futures[future]}: {e}")

        # Plotting
        plt.figure(figsize=(10, 6))
        plt.scatter(df[variableCol], df[targetCol], color='blue', label='Actual Values')

        # One red segment per bin, joining the model's predicted class at the
        # bin's two edges.
        for i in range(len(best_bins) - 1):
            df_pred = pd.DataFrame({variableCol: [best_bins[i], best_bins[i+1]]})
            plt.plot([best_bins[i], best_bins[i+1]], [best_model.predict(df_pred)[0], best_model.predict(df_pred)[1]], color='red', linewidth=2)

        plt.xlabel(variableCol)
        plt.ylabel(targetCol)
        plt.title('CART Fitted Model vs Actual Values')
        plt.legend()
        plt.show()
        return best_bins
    
    
    def logistic_regression_with_roc(self, X, y):
        """Fit a statsmodels logit on an 80/20 split, report ROC AUC and plot the curves.

        Args:
            X: Feature matrix; a constant column is added before fitting.
            y: Binary target.

        Returns:
            ROC AUC on the held-out 20 %.

        Side effects:
            Prints the model summary and both AUC values, and shows a figure
            with the train and test ROC curves.
        """
        # Imported locally: in this notebook these names are only imported in
        # other cells, so the method would otherwise depend on which cells ran.
        import statsmodels.api as sm
        from sklearn.metrics import roc_curve
        from sklearn.model_selection import train_test_split

        self.loadedFunctions['Plot_Operartion_2'] = str("Function: logistic_regression_with_roc(X, y), Returns: test_roc_auc")
        # 1. Random 80/20 split with a fixed seed.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # 2. Fit sm.Logit on the training part with the lbfgs optimiser and print the summary.
        logit_model = sm.Logit(y_train, sm.add_constant(X_train)).fit(method='lbfgs')
        print(logit_model.summary())

        # 3. Predicted probabilities for both parts.
        y_train_pred = logit_model.predict(sm.add_constant(X_train))
        y_test_pred = logit_model.predict(sm.add_constant(X_test))

        # 4. Train and test ROC AUC.
        train_roc_auc = roc_auc_score(y_train, y_train_pred)
        test_roc_auc = roc_auc_score(y_test, y_test_pred)
        print(f"Train ROC AUC: {train_roc_auc}")
        print(f"Test ROC AUC: {test_roc_auc}")

        # 5. ROC curves, with the diagonal as the no-skill reference.
        fpr_train, tpr_train, _ = roc_curve(y_train, y_train_pred)
        fpr_test, tpr_test, _ = roc_curve(y_test, y_test_pred)

        plt.figure(figsize=(10, 6))
        plt.plot(fpr_train, tpr_train, label=f"Train ROC AUC = {train_roc_auc:.2f}", color='blue')
        plt.plot(fpr_test, tpr_test, label=f"Test ROC AUC = {test_roc_auc:.2f}", color='red')
        plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend()
        plt.show()

        # 6. Test ROC AUC as a float.
        return test_roc_auc
    
    
    def cart_with_roc(self, X, y):
        """Fit a decision tree on an 80/20 split, report ROC AUC and plot the curves.

        The tree is limited to twice as many leaves as there are feature columns.

        Args:
            X: Feature matrix.
            y: Binary target.

        Returns:
            ROC AUC on the held-out 20 %.

        Side effects:
            Prints both AUC values and shows a figure with the train and test
            ROC curves.
        """
        # Imported locally: in this notebook these names are only imported in
        # other cells, so the method would otherwise depend on which cells ran.
        from sklearn.metrics import roc_curve
        from sklearn.model_selection import train_test_split

        self.loadedFunctions['Plot_Operartion_3'] = str("Function: cart_with_roc(X, y), Returns: test_roc_auc")
        # 1. Random 80/20 split with a fixed seed.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # 2. Fit the tree on the training part.
        cart_model = DecisionTreeClassifier(max_leaf_nodes=2*X_train.shape[1], random_state=42)
        cart_model.fit(X_train, y_train)

        # 3. Predicted probability of the positive class for both parts.
        y_train_pred = cart_model.predict_proba(X_train)[:, 1]
        y_test_pred = cart_model.predict_proba(X_test)[:, 1]

        # 4. Train and test ROC AUC.
        train_roc_auc = roc_auc_score(y_train, y_train_pred)
        test_roc_auc = roc_auc_score(y_test, y_test_pred)
        print(f"Train ROC AUC: {train_roc_auc}")
        print(f"Test ROC AUC: {test_roc_auc}")

        # 5. ROC curves, with the diagonal as the no-skill reference.
        fpr_train, tpr_train, _ = roc_curve(y_train, y_train_pred)
        fpr_test, tpr_test, _ = roc_curve(y_test, y_test_pred)

        plt.figure(figsize=(10, 6))
        plt.plot(fpr_train, tpr_train, label=f"Train ROC AUC = {train_roc_auc:.2f}", color='blue')
        plt.plot(fpr_test, tpr_test, label=f"Test ROC AUC = {test_roc_auc:.2f}", color='red')
        plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend()
        plt.show()

        # 6. Test ROC AUC as a float.
        return test_roc_auc

In [None]:
space_titanic_utils = generic_Utilities()
space_titanic_utils.allFunctions

In [None]:
space_titanic_utils.columns_with_missing_values(test_data)

In [None]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, roc_curve


class logitModelBuilder_BinaryClassification:
    '''Takes in the train, test , target and identifier columns

    Stacks train on top of test, one-hot encodes the non-numeric columns into a
    single "master" frame, and lets the caller grow the model feature list
    (``baselineCols``) with binned-numeric and interaction one-hot columns
    before fitting and saving baseline Logit and decision-tree models.

    Training rows are always selected from ``masterDF`` with the label slice
    ``.loc[:len(train) - 1]``. NOTE(review): this assumes ``masterDF`` has a
    0..N-1 index with the train rows first -- confirm against
    ``generic_Utilities.oneHotEncoded`` and how train/test were prepared.

    Attributes set by ``__init__``:
        origTrain, origTest: the frames passed in, kept untouched.
        target: name of the target column.
        identifiers: identifier column names (stored, not used by this class).
        genericUtils: a fresh ``generic_Utilities`` helper instance.
        numericCols, nonNumericCols: column split computed on the test frame.
        masterDF: one-hot encoded concatenation of train and test.
        baselineCols: feature columns currently used for model fitting.
        loadedModels: dict of model name -> model filled by ``load_SavedModels``.
    '''
    def __init__(self, train, test, target, identifier_columns):
        self.origTrain = train
        self.origTest = test
        self.target = target
        self.identifiers = identifier_columns
        self.genericUtils = generic_Utilities()
        # Column typing is derived from the test frame, presumably because it
        # does not contain the target column -- TODO confirm.
        self.numericCols, self.nonNumericCols = self.genericUtils.get_numeric_and_non_numeric_columns(self.origTest)
        self.masterDF = self.genericUtils.oneHotEncoded(pd.concat([self.origTrain, self.origTest], axis=0), self.nonNumericCols)
        # Baseline features = every master column that is neither numeric nor the target.
        self.baselineCols = self.genericUtils.difference_of_lists(self.masterDF.columns, self.numericCols+[self.target])
        # Must be a dict: load_SavedModels stores models in it by name.  It was
        # previously bound to a utility method, so every store raised and was
        # silently swallowed.
        self.loadedModels = {}

    def _train_xy(self):
        '''Return (features, target) restricted to the training rows of masterDF.'''
        last_train_label = len(self.origTrain) - 1
        features = self.masterDF[self.baselineCols].loc[:last_train_label]
        labels = self.masterDF[self.target].loc[:last_train_label]
        return features, labels

    def _register_new_features(self, feature_df):
        '''Append feature_df's non-target columns to baselineCols, skipping names already present.

        Skipping known names keeps baselineCols free of duplicates when a
        feature-building cell is re-run.
        '''
        candidates = self.genericUtils.difference_of_lists(list(feature_df.columns), [self.target])
        known = set(self.baselineCols)
        fresh = []
        for col in candidates:
            if col not in known:
                known.add(col)
                fresh.append(col)
        self.baselineCols = list(self.baselineCols) + fresh

    def update_BaselineLogit(self):
        '''Refit the logistic-regression and CART baselines on the current baselineCols.

        Stores the values returned by the two utility helpers in
        ``baseline_logit_ROC_AUC`` and ``baseline_cart_ROC_AUC``.
        '''
        # Slice separately for each helper so neither sees an object the other touched.
        logit_X, logit_y = self._train_xy()
        self.baseline_logit_ROC_AUC = self.genericUtils.logistic_regression_with_roc(logit_X, logit_y)
        cart_X, cart_y = self._train_xy()
        self.baseline_cart_ROC_AUC = self.genericUtils.cart_with_roc(cart_X, cart_y)

    def merge_to_Master(self, master_df, new_df):
        '''Return master_df with new_df's columns attached side by side.

        If the row counts differ, a message is printed and master_df is
        returned unchanged.  For column names present in both frames the
        master_df values win; such columns end up at the end of the result.
        '''
        if len(master_df) != len(new_df):
            print("The lengths of the DataFrames do not match.")
            return master_df

        # Concatenate DataFrames side by side
        combined_df = pd.concat([master_df, new_df], axis=1)

        # Remove duplicate columns, keeping only the column from master_df
        combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]

        for col in new_df.columns:
            if col in master_df.columns:
                combined_df = combined_df.drop(columns=[col])
                combined_df[col] = master_df[col]

        return combined_df

    def addNumericalBinHots(self, colName, max_n_bins, mode, n_jobs=1):
        '''Bin a numeric column with CART-derived breakpoints and add the one-hot range columns.

        Args:
            colName: numeric column to bin.
            max_n_bins: passed to ``train_cart_bins_with_plot``.
            mode: 'Research' only computes the breakpoints (master frame is
                left unchanged); any other value merges the new columns into
                ``masterDF`` and ``baselineCols``.
            n_jobs: forwarded to ``train_cart_bins_with_plot`` (it was
                previously ignored and always 1).
        '''
        breakpoints = self.genericUtils.train_cart_bins_with_plot(self.origTrain, colName, self.target, max_n_bins, n_jobs=n_jobs)
        if mode == 'Research':
            print("Only for checks. Master Dataframe not changed")
            return None

        self.currentNumericBinnedDF = self.genericUtils.create_one_hot_encode_ranges(self.masterDF, colName, [self.target], breakpoints)
        self.masterDF = self.merge_to_Master(self.masterDF, self.currentNumericBinnedDF)
        self._register_new_features(self.currentNumericBinnedDF)
        print("Done merging with Master Dataframe!")
        return None

    def addInteractionOneHots(self, colNames, threshold, options, mode):
        '''Find frequent one-hot subsets on the training rows and add them as interaction columns.

        Args:
            colNames: accepted for interface compatibility.
                NOTE(review): not used to restrict the subset search -- confirm
                whether ``train_subset_counts_oneHot`` should receive it.
            threshold: passed to ``filter_and_sort_subsets``.
            options: passed to ``train_subset_counts_oneHot``.
            mode: 'Research' leaves the master frame unchanged; any other value
                merges the interaction columns into ``masterDF`` and
                ``baselineCols``.
        '''
        # Training rows are derived from the train frame length, not a fixed row number.
        train_rows = self.masterDF.loc[:len(self.origTrain)-1]
        wanted_subsets = self.genericUtils.train_subset_counts_oneHot(train_rows, self.target, 1, options, n_jobs=1)
        sorted_subsets = self.genericUtils.filter_and_sort_subsets(wanted_subsets, threshold)
        if mode == 'Research':
            print("Only for checks. Master Dataframe not changed")
            return None

        # Build the features from the master frame (the original referenced an
        # undefined global `df`); merge_to_Master requires matching row counts.
        self.currentInteractionDF = self.genericUtils.create_combinedFeatures_df(self.masterDF, [], sorted_subsets,  'INT')
        self.masterDF = self.merge_to_Master(self.masterDF, self.currentInteractionDF)
        self._register_new_features(self.currentInteractionDF)
        print("Done merging with Master Dataframe!")
        return None

    def saveMaster_to_Path(self, path):
        '''Write masterDF as ``Master_Data.csv`` inside the directory ``path``.

        A falsy ``path`` falls back to the previous hard-coded "/working/"
        location.
        '''
        target_dir = path if path else "/working/"
        fileLoc = os.path.join(target_dir, "Master_Data.csv")
        self.masterDF.to_csv(fileLoc, index=False)
        print("Saved the master data!")
        return

    def fitOsave_BaselineModel(self, save_path):
        '''Fit Logit and decision-tree models on the training rows, save both, and return their scores.

        Models are dumped with joblib to ``{save_path}_LogitProb.joblib`` and
        ``{save_path}_DecisionTreeClassifier.joblib``.

        Returns:
            dict mapping ``"<ModelName>_Features_<n>"`` to that model's ROC AUC
            on the training rows (in-sample, not a held-out estimate).  The
            same dict is stored in ``currentFittedModelsDict``.
        '''
        fitted_models = {}
        train_X, train_y = self._train_xy()
        train_X_const = sm.add_constant(train_X)

        print("Fitting Logit using StatsModels...")
        logit_model = sm.Logit(train_y, train_X_const).fit(method='lbfgs')
        print(logit_model.summary())
        model_filename = f"{save_path}_LogitProb.joblib"
        joblib.dump(logit_model, model_filename)

        logit_y_train_pred = logit_model.predict(train_X_const)
        logit_roc_auc = roc_auc_score(train_y, logit_y_train_pred)
        fitted_models[f"LogitProb_Features_{len(self.baselineCols)}"] = logit_roc_auc

        print("Fitting Decision Tree Classifier using Scipy...")
        cart_model = DecisionTreeClassifier(max_leaf_nodes=3*len(self.baselineCols), random_state=42)
        cart_model.fit(train_X, train_y)
        model_filename = f"{save_path}_DecisionTreeClassifier.joblib"
        joblib.dump(cart_model, model_filename)

        # roc_auc_score needs a 1-D score for a binary target, so take the
        # second predict_proba column (same convention as cart_with_roc).
        cart_y_train_pred = cart_model.predict_proba(train_X)[:, 1]
        cart_roc_auc = roc_auc_score(train_y, cart_y_train_pred)
        fitted_models[f"DecisionTreeClassifier_Features_{len(self.baselineCols)}"] = cart_roc_auc

        self.currentFittedModelsDict = fitted_models
        print("Done saving the current version of the fits! Fetched the roc_auc for each model.")
        return fitted_models

    def load_SavedModels(self, models_dict, save_path='/working/'):
        '''Load previously saved models into ``self.loadedModels`` (best effort).

        For each key of ``models_dict`` two file names are tried:
        ``{save_path}{model_name}.joblib`` and the name written by
        ``fitOsave_BaselineModel``, ``{save_path}_{base}.joblib`` where ``base``
        is the key without its ``_Features_<n>`` suffix.  Missing or unreadable
        files are reported and skipped; nothing is raised.

        joblib files are pickles: only load files you created yourself.
        '''
        for model_name in models_dict.keys():
            base_name = model_name.split("_Features_")[0]
            candidates = [
                f"{save_path}{model_name}.joblib",
                f"{save_path}_{base_name}.joblib",
            ]
            for model_filename in candidates:
                try:
                    # Load the fitted model
                    self.loadedModels[model_name] = joblib.load(model_filename)
                    break
                except FileNotFoundError:
                    continue
                except Exception as exc:
                    # Keep going for the remaining models, but say what failed.
                    print(f"Could not load {model_filename}: {exc}")
                    break
            else:
                print(f"No saved model file found for {model_name}; tried {candidates}")
        return None

In [None]:
# Build the model builder on the prepared train/test frames; 'Transported' is the target
# and no identifier columns are passed.
spaceTitanic = logitModelBuilder_BinaryClassification(train_data, test_data, 'Transported', [])

In [None]:
# Number of training rows; the builder slices masterDF up to label len(origTrain) - 1 for fitting.
len(spaceTitanic.origTrain)

In [None]:
# Fit the logistic-regression and CART baselines on the initial one-hot feature set.
spaceTitanic.update_BaselineLogit()

In [None]:
# Bin 'num' (max_n_bins=5); any mode other than 'Research' merges the range one-hots into masterDF.
spaceTitanic.addNumericalBinHots('num', 5, 'Dev', n_jobs=2)

In [None]:
# Bin 'RoomService' (max_n_bins=3) and merge the range one-hots into masterDF.
spaceTitanic.addNumericalBinHots('RoomService', 3, 'Dev', n_jobs=2)

In [None]:
# Bin 'Age' (max_n_bins=4) and merge the range one-hots into masterDF.
spaceTitanic.addNumericalBinHots('Age', 4, 'Dev', n_jobs=2)

In [None]:
# Bin 'ExpenseInShip' (max_n_bins=3) and merge the range one-hots into masterDF.
spaceTitanic.addNumericalBinHots('ExpenseInShip', 3, 'Dev', n_jobs=2)

In [None]:
# Bin 'Spa' (max_n_bins=5) and merge the range one-hots into masterDF.
spaceTitanic.addNumericalBinHots('Spa', 5, 'Dev', n_jobs=2)

In [None]:
# Bin 'VRDeck' (max_n_bins=3) and merge the range one-hots into masterDF.
spaceTitanic.addNumericalBinHots('VRDeck', 3, 'Dev', n_jobs=2)

In [None]:
# Bin 'FoodCourt' (max_n_bins=6) and merge the range one-hots into masterDF.
spaceTitanic.addNumericalBinHots('FoodCourt', 6, 'Dev', n_jobs=2)

In [None]:
# Bin 'ShoppingMall' (max_n_bins=7) and merge the range one-hots into masterDF.
spaceTitanic.addNumericalBinHots('ShoppingMall', 7, 'Dev', n_jobs=2)

In [None]:
# Refit both baselines now that the binned columns have been added to baselineCols.
spaceTitanic.update_BaselineLogit()

In [None]:
# Inspect the feature columns currently used for fitting.
spaceTitanic.baselineCols

In [None]:
# List every column of the master frame (train and test rows stacked).
list(spaceTitanic.masterDF.columns)

In [None]:
# One-hot feature names (categorical dummies and binned numeric ranges) handed to
# addInteractionOneHots below.
# NOTE(review): the range labels look copied from an earlier baselineCols output and will go
# stale if the binning breakpoints change — confirm.
interactions = ['HomePlanet_Europa',
 'HomePlanet_Mars',
 'CryoSleep_True',
 'Destination_PSO J318.5-22',
 'Destination_TRAPPIST-1e',
 'VIP_True',
 'SibSp_2',
 'SibSp_3',
 'SibSp_4',
 'SibSp_5',
 'SibSp_6',
 'SibSp_7',
 'SibSp_8',
 'deck_B',
 'deck_C',
 'deck_D',
 'deck_E',
 'deck_F',
 'deck_G',
 'deck_T',
 'side_S',
 'num_0.00to334.50',
 'num_334.50to602.50',
 'num_602.50to824.50',
 'num_824.50to1162.50',
 'num_1162.50to1894.00',
 'RoomService_0.00to0.50',
 'RoomService_0.50to346.50',
 'RoomService_346.50to14327.00',
 'Age_0.00to4.50',
 'Age_4.50to17.50',
 'Age_17.50to24.50',
 'Age_24.50to79.00',
 'ExpenseInShip_0.00to0.50',
 'ExpenseInShip_0.50to2384.50',
 'ExpenseInShip_2384.50to35987.00',
 'Spa_0.00to0.50',
 'Spa_0.50to266.50',
 'Spa_266.50to2446.50',
 'Spa_2446.50to2462.50',
 'Spa_2462.50to22408.00',
 'VRDeck_0.00to0.50',
 'VRDeck_0.50to613.50',
 'VRDeck_613.50to24133.00',
 'FoodCourt_0.00to0.50',
 'FoodCourt_0.50to60.50',
 'FoodCourt_60.50to668.50',
 'FoodCourt_668.50to2507.50',
 'FoodCourt_2507.50to4071.00',
 'FoodCourt_4071.00to29813.00',
 'ShoppingMall_0.00to0.50',
 'ShoppingMall_0.50to130.50',
 'ShoppingMall_130.50to303.00',
 'ShoppingMall_303.00to627.50',
 'ShoppingMall_627.50to1248.50',
 'ShoppingMall_1248.50to1823.00',
 'ShoppingMall_1823.00to23492.00']

In [None]:
# 'Research' mode: the subset search runs, but masterDF and baselineCols are left unchanged.
spaceTitanic.addInteractionOneHots(interactions, 100, 'COUNT', 'Research')

In [None]:
# Inspect the feature list again after the interaction step.
spaceTitanic.baselineCols

In [None]:
# Derive the train/test boundary from the training frame instead of hard-coding row
# numbers: masterDF holds the training rows first, followed by the test rows.
n_train = len(spaceTitanic.origTrain)
train_X = spaceTitanic.masterDF[spaceTitanic.baselineCols].loc[:n_train - 1]
train_y = spaceTitanic.masterDF[spaceTitanic.target].loc[:n_train - 1]
test_X = spaceTitanic.masterDF[spaceTitanic.baselineCols].loc[n_train:]

# The Logit fit is only used for its coefficient summary; predictions come from the tree.
logit_model = sm.Logit(train_y, sm.add_constant(train_X)).fit(method='lbfgs')
print(logit_model.summary())
cart_model = DecisionTreeClassifier(max_leaf_nodes=180, random_state=42)
cart_model.fit(train_X, train_y)
# 3. Predict on the Test Data
y_pred = cart_model.predict(test_X)
# Alternative kept for reference: threshold the Logit probabilities instead.
# threshold = 0.39
# y_prob = logit_model.predict(sm.add_constant(test_X))
# y_pred = (y_prob >= threshold).astype(bool)

In [None]:
# Inspect the decision-tree predictions for the test rows.
y_pred

# Submission 🎉

In [None]:
# Passenger ids are read from the raw test file and used as the submission key.
test_id = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")["PassengerId"]
test_id.shape

In [None]:
# Two-column submission frame: ids from the test file plus the predictions cast to bool.
submit = test_id.to_frame(name='PassengerId')
submit['Transported'] = y_pred.astype(bool)

In [None]:
# Preview the submission frame before writing it out.
submit

In [None]:
# Write the submission file to the current working directory, without the index column.
submit.to_csv("submission.csv", index=False)