In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from scipy import stats
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import (
    HistGradientBoostingClassifier,  # Histogram-Based Gradient Boosting Classifier
    GradientBoostingClassifier,  # Gradient Boosting Classifier
    AdaBoostClassifier,  # AdaBoost Classifier
    RandomForestClassifier,  # Random Forest Classifier
    ExtraTreesClassifier,  # Extra Trees Classifier
    VotingClassifier,  # Ensemble Voting Classifier
    StackingClassifier,  # Stacking Classifier
)
from xgboost import XGBClassifier  # XGBoost Classifier
from lightgbm import LGBMClassifier # lightgbm Classifier

# Import evaluation metrics
from sklearn.metrics import f1_score

In [None]:
# Load the competition train/test splits and the separate "horse" dataset.
# `origin` is later flagged with is_generated = 0 and appended to the training rows.
train = pd.read_csv(r"/kaggle/input/playground-series-s3e22/train.csv")
test = pd.read_csv(r"/kaggle/input/playground-series-s3e22/test.csv")
origin = pd.read_csv(r"/kaggle/input/horse-survival-dataset/horse.csv")

In [None]:
# Preview the first rows of the competition training data.
train.head()

In [None]:
# Preview the first rows of the original dataset for comparison with `train`.
origin.head()

In [None]:
# Count distinct (non-null) values per column once and derive both lists from it.
distinct_counts = train.nunique()

# 'num_var': columns of 'train' with more than 10 distinct values
num_var = distinct_counts[distinct_counts > 10].index.tolist()

# 'bin_var': columns of 'train' with exactly 2 distinct values (binary variables)
bin_var = distinct_counts[distinct_counts == 2].index.tolist()

# 'cat_var': hand-picked categorical columns used for the chi-squared screening
cat_var = [
    'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane', 'capillary_refill_time', 'pain',
    'peristalsis', 'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux', 'rectal_exam_feces',
    'abdomen', 'abdomo_appearance', 'lesion_2', 'surgery', 'age', 'surgical_lesion', 'lesion_3', 'cp_data',
]

# Name of the column to predict
target = 'outcome'

In [None]:
# Flag row provenance: 1 for the competition train/test rows, 0 for the rows
# coming from the `origin` dataset.
train["is_generated"] = 1
test["is_generated"] = 1
origin["is_generated"] = 0

# Drop the 'id' column from 'train' and 'test'.
# `errors="ignore"` keeps this cell re-runnable: once 'id' is gone, a plain
# drop would raise KeyError on the second execution.
train = train.drop(columns="id", errors="ignore")
test = test.drop(columns="id", errors="ignore")

# Stack 'train' and 'origin' row-wise and remove exact duplicate rows.
train_total = pd.concat([train, origin], ignore_index=True).drop_duplicates()

# Append the test rows so that encoding and imputation see one combined frame.
total = pd.concat([train_total, test], ignore_index=True)

# Print the shapes of the three DataFrames: 'train', 'test', and 'total'
print('The shape of the train data:', train.shape)
print('The shape of the test data:', test.shape)
print('The shape of the total data:', total.shape)

In [None]:
def chi_squared_test(df, input_var, target_var, significance_level=0.05):
    """Print whether `input_var` and `target_var` are associated (chi-squared test).

    Builds the contingency table of the two columns of `df`, runs
    `scipy.stats.chi2_contingency` on it and prints a green message when the
    p-value is below `significance_level`, a red one otherwise.
    Returns nothing.
    """
    observed = pd.crosstab(df[input_var], df[target_var])
    _statistic, p_value, _dof, _expected = stats.chi2_contingency(observed)

    is_significant = p_value < significance_level
    verdict = (
        f'\033[32m{input_var} has a significant relationship with the target variable.\033[0m'
        if is_significant
        else f'\033[31m{input_var} does not have a significant relationship with the target variable.\033[0m'
    )
    print(verdict)

# Screen every categorical column of the training data against the target.
for cat_column in cat_var:
    chi_squared_test(df=train, input_var=cat_column, target_var=target)

In [None]:
# Encode the outcome labels as integers (test rows have no label and stay NaN).
# Guarded on dtype so that re-running this cell does not map the already
# encoded numbers through the dict again, which would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(total[target]):
    total[target] = total[target].map({'died':0,'euthanized':1,'lived':2})

In [None]:
def preprocessing(df, le_cols, ohe_cols):
    """Encode the categorical columns of `df` and return the encoded frame.

    Steps, in order:
      1. label-encode every column in `le_cols` with a fresh `LabelEncoder`;
      2. one-hot encode every column in `ohe_cols` via `pd.get_dummies`;
      3. merge a few stray levels into an existing level;
      4. fill missing values with a fixed level and map the levels to integer
         codes for the ordinal columns listed below.

    Levels that are not present in a column's code table become NaN after
    the mapping step.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified; the work happens on a copy.
    le_cols : list of str
        Columns to label-encode.
    ohe_cols : list of str
        Columns to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded columns.
    """
    # Work on a copy: previously the label-encoding loop wrote straight into
    # the caller's frame before get_dummies produced a new object.
    df = df.copy()

    # Label Encoding for binary cols
    for col in le_cols:
        df[col] = LabelEncoder().fit_transform(df[col])

    # OneHot Encoding for category cols
    df = pd.get_dummies(df, columns=ohe_cols)

    # column -> (stray level, level it is folded into)
    level_merges = {
        "pain": ('slight', 'moderate'),
        "peristalsis": ('distend_small', 'normal'),
        "rectal_exam_feces": ('serosanguious', 'absent'),
        "nasogastric_reflux": ('slight', 'none'),
    }
    for col, (stray, merged) in level_merges.items():
        df[col] = df[col].replace(stray, merged)

    # column -> (level used for missing values, level -> integer code)
    ordinal_specs = {
        "temp_of_extremities": ("normal", {'cold': 0, 'cool': 1, 'normal': 2, 'warm': 3}),
        "peripheral_pulse": ("normal", {'absent': 0, 'reduced': 1, 'normal': 2, 'increased': 3}),
        "capillary_refill_time": ("3", {'less_3_sec': 0, '3': 1, 'more_3_sec': 2}),
        "pain": ("depressed", {'alert': 0, 'depressed': 1, 'moderate': 2, 'mild_pain': 3,
                               'severe_pain': 4, 'extreme_pain': 5}),
        "peristalsis": ("hypomotile", {'hypermotile': 0, 'normal': 1, 'hypomotile': 2, 'absent': 3}),
        "abdominal_distention": ("none", {'none': 0, 'slight': 1, 'moderate': 2, 'severe': 3}),
        "nasogastric_tube": ("none", {'none': 0, 'slight': 1, 'significant': 2}),
        "nasogastric_reflux": ("none", {'less_1_liter': 0, 'none': 1, 'more_1_liter': 2}),
        "rectal_exam_feces": ("absent", {'absent': 0, 'decreased': 1, 'normal': 2, 'increased': 3}),
        "abdomen": ("distend_small", {'normal': 0, 'other': 1, 'firm': 2, 'distend_small': 3,
                                      'distend_large': 4}),
        "abdomo_appearance": ("serosanguious", {'clear': 0, 'cloudy': 1, 'serosanguious': 2}),
    }
    for col, (fill_level, codes) in ordinal_specs.items():
        df[col] = df[col].fillna(fill_level).map(codes)

    return df

In [None]:
# Encode the combined train + test frame in one pass.
total = preprocessing(
    total,
    le_cols=["surgery", "age", "surgical_lesion", "cp_data"],
    ohe_cols=["mucous_membrane"],
)

In [None]:
def features_engineering(df, target_col=None):
    """Return a copy of `df` with missing feature values replaced by the column mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified.
    target_col : str, optional
        Column that must be left untouched (its NaNs are kept). Defaults to
        the notebook-level `target` variable.

    Returns
    -------
    pandas.DataFrame
        Imputed copy. Columns that are entirely NaN have no mode and are
        left as they are.
    """
    if target_col is None:
        target_col = target

    data_preprocessed = df.copy()

    # Imputer: every column except the target that has at least one NaN
    has_nan = df.isna().any()
    cols_with_nan = [col for col, flag in has_nan.items() if flag and col != target_col]

    for feature in cols_with_nan:
        modes = data_preprocessed[feature].mode()
        if modes.empty:
            # all-NaN column: nothing to impute with
            continue
        # Assign the result back instead of `frame[col].fillna(..., inplace=True)`:
        # that chained in-place call acts on a temporary object and is a no-op
        # under pandas Copy-on-Write.
        data_preprocessed[feature] = data_preprocessed[feature].fillna(modes.iloc[0])

    return data_preprocessed

# Mode-impute the combined frame; the mode is therefore computed over train and test rows together.
total = features_engineering(total)

In [None]:
# Split the combined frame back apart: rows with a label are training rows,
# rows without one are the test rows.
# NOTE(review): this assumes no train/origin row has a missing outcome; any
# such row would end up in df_test — confirm against the data.
# `.copy()` and a non-inplace `drop` give two independent frames, so later
# column assignments do not write into a slice of `total`
# (SettingWithCopyWarning).
df_train = total[total[target].notna()].copy()
df_test = total[total[target].isna()].drop(columns=target)

In [None]:
# Every column of the test frame is a candidate feature.
full_features = df_test.columns.tolist()
# Boolean columns of the test frame, to be converted to 0/1 integers.
bin_features = df_test.select_dtypes('bool').columns

# `astype` returns new frames; rebinding the names avoids assigning columns
# into frames that may still be slices of `total` (SettingWithCopyWarning).
df_train = df_train.astype(dict.fromkeys(bin_features, 'int64'))
df_test = df_test.astype(dict.fromkeys(bin_features, 'int64'))

In [None]:
# Check the encoded training frame before modelling.
df_train.head()

In [None]:
def caculate_f1(y_true, y_pred):
    """Return the micro-averaged F1 score of `y_pred` against `y_true`."""
    micro_f1 = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    return micro_f1

In [None]:
# Baseline LightGBM model, refitted on every fold of the evaluation below.
lgbm_baseline = LGBMClassifier(n_estimators=80, max_depth=4, random_state=42)

# Running log of cross-validated scores, one row per evaluated feature set.
f1_results = pd.DataFrame(columns=['Selected_Features', 'F1'])

def evaluation(df, select_features, note):
    """Cross-validate `lgbm_baseline` on a feature subset and log the score.

    Runs a shuffled 3-fold `KFold` (random_state=42) over `df`, refits the
    notebook-level `lgbm_baseline` on each training fold and scores the
    held-out fold with `caculate_f1`.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled data; must contain `select_features` and the `target` column.
    select_features : list of str
        Feature columns to train on.
    note : str
        Label stored with the score and printed in the summary.

    Returns
    -------
    float
        Mean of the per-fold F1 scores.

    Side effects
    ------------
    Appends a row to the global `f1_results` frame and prints a summary.
    """
    global f1_results

    X = df[select_features]
    Y = df[target]

    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    f1_scores = []

    for train_idx, test_idx in kf.split(X):
        lgbm_baseline.fit(X.iloc[train_idx], Y.iloc[train_idx])
        y_hat = lgbm_baseline.predict(X.iloc[test_idx])
        f1_scores.append(caculate_f1(Y.iloc[test_idx], y_hat))

    average_f1 = np.mean(f1_scores)
    new_row = pd.DataFrame([{'Selected_Features': note, 'F1': average_f1}])
    # Concatenating onto an empty frame is deprecated in recent pandas
    # (FutureWarning about empty entries), so the first row replaces the
    # empty log instead of being concatenated to it.
    if f1_results.empty:
        f1_results = new_row
    else:
        f1_results = pd.concat([f1_results, new_row], ignore_index=True)

    print('====================================')
    print(note)
    print("Average F1:", average_f1)
    print('====================================')
    return average_f1
# Reference score using every available feature.
evaluation(
    df=df_train,
    select_features=full_features,
    note='Baseline',
)

In [None]:
def correlation(dataset, threshold):
    """Return the set of columns that correlate strongly with an earlier column.

    A column is included when the absolute value of its correlation (as given
    by `dataset.corr()`) with at least one column positioned before it is
    greater than or equal to `threshold`.
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    return {
        names[row]
        for row in range(len(names))
        if any(abs(corr_matrix.iloc[row, col]) >= threshold for col in range(row))
    }

# Flag every column whose absolute correlation with an earlier column is >= 0.35.
# NOTE(review): df_train still contains the target column at this point, so
# correlations that involve the target also take part in the flagging —
# confirm that this is intended.
corr_features = correlation(df_train, 0.35)
corr_features

In [None]:
# Features kept after removing the hard-coded list of columns below.
# Note that `corr_features` now holds the columns to KEEP (a list), replacing
# the set returned by `correlation` in the previous cell.
corr_features = df_test.drop(
    columns=[
        'abdominal_distention',
        'abdomo_protein',
        'capillary_refill_time',
        'cp_data',
        'lesion_3',
        'mucous_membrane_dark_cyanotic',
        'mucous_membrane_normal_pink',
        'packed_cell_volume',
        'peripheral_pulse',
        'peristalsis',
        'rectal_exam_feces',
        'respiratory_rate',
        'surgical_lesion',
        'temp_of_extremities',
        'total_protein',
    ]
).columns.tolist()

In [None]:
# Score the reduced feature set with the same cross-validation as the baseline.
evaluation(
    df=df_train,
    select_features=corr_features,
    note='Corr Features',
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
def f_importance_plot(f_imp):
    """Draw a horizontal bar chart of feature importances.

    `f_imp` is a DataFrame with a 'feature' column (bar labels) and an 'imp'
    column (bar lengths); rows are drawn in the order given. The figure
    height grows with the number of rows. Shows the figure and returns nothing.
    """
    plt.figure(figsize=(12, 0.20 * len(f_imp)))
    plt.title('Feature importances', size=16, y=1.05, fontweight='bold')

    bars = sns.barplot(
        data=f_imp,
        x='imp',
        y='feature',
        linestyle="-",
        linewidth=0.5,
        edgecolor="black",
        palette='GnBu',
    )

    # Strip the axis labels and the x ticks; only the feature names remain.
    plt.xlabel('')
    plt.xticks([])
    plt.ylabel('')
    plt.yticks(size=11)

    # Keep only a thin left spine.
    for side in ('right', 'top', 'bottom'):
        bars.spines[side].set_visible(False)
    bars.spines['left'].set_linewidth(0.5)

    plt.tight_layout()
    plt.show()

In [None]:
# Fit a larger LightGBM model on all training rows to rank the features.
clf = LGBMClassifier(n_estimators=1000, max_depth=10, random_state=42)
clf.fit(df_train.drop(columns=target), df_train[target])

# One row per feature, most important first.
f_imp_df = pd.DataFrame(
    {
        'feature': df_train.drop(columns=target).columns,
        'imp': clf.feature_importances_,
    }
).sort_values(by='imp', ascending=False)
f_importance_plot(f_imp_df)

In [None]:
# Hard-coded values: the search that produced them is not part of the visible notebook.
# NOTE(review): presumably best_score is a cross-validated F1 from an earlier run — confirm.
best_feature_num = 30
best_score = 0.7392406127690802
print(f'Best feature number is Top {best_feature_num}, Best score is {best_score}')

In [None]:
# Names of the top-ranked features (f_imp_df is sorted by importance, descending).
best_features = f_imp_df['feature'].head(best_feature_num).tolist()

In [None]:
# Final training matrix and labels, restricted to the selected features.
X = df_train[best_features]
y = df_train[target]

# Test matrix with the same columns in the same order.
test_df = df_test[best_features]

X

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import f1_score


# Final model: histogram-based gradient boosting.
hist_model = HistGradientBoostingClassifier(
    learning_rate=0.1,        # shrinkage applied to each boosting step
    max_iter=80,              # number of boosting iterations
    max_depth=4,              # maximum depth of each tree
    max_leaf_nodes=21,
    l2_regularization=0.1,
    scoring='f1_micro',
    random_state=42,
)
hist_model.fit(X, y)

# The score below is computed on the same rows the model was fitted on.
print(f"HistGradientBoosting Model: F1 Score (Micro-Average) = {f1_score(y, hist_model.predict(X), average='micro') * 100:.2f}%")

In [None]:
# Predict the encoded outcome for every test row and place it in the submission template.
submission = hist_model.predict(test_df)
sample_submission = pd.read_csv(
    r"/kaggle/input/playground-series-s3e22/sample_submission.csv"
).assign(outcome=submission)
sample_submission

In [None]:
# Inverse of the integer encoding applied to the target earlier in the notebook.
outcome_mapping = {0.0: 'died', 1.0: 'euthanized', 2.0: 'lived'}

# Decode from the raw predictions in `submission` rather than from the column
# itself: mapping the column in place turns every value into NaN when this
# cell is executed a second time.
sample_submission['outcome'] = pd.Series(
    submission, index=sample_submission.index
).map(outcome_mapping)
assert sample_submission['outcome'].notna().all(), "unmapped prediction in submission"
sample_submission

In [None]:
# Write the submission file to the working directory, without the index column.
sample_submission.to_csv('submission.csv', index=False)