In [8]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

# Step 2: Load the Training and Test Data
train_data = pd.read_csv("D:/IBA/ML/Competition/train.csv")
test_data = pd.read_csv("D:/IBA/ML/Competition/test.csv")

# Step 3: Handle Missing Values
columns_with_missing = train_data.columns[train_data.isnull().mean() > 0.3].tolist()

for col in columns_with_missing:
    train_data[col] = train_data.groupby('hospital_id')[col].transform(lambda x: x.fillna(x.mean()))
    test_data[col] = test_data.groupby('hospital_id')[col].transform(lambda x: x.fillna(x.mean()))

# Step 4: Split Data into Features and Target
X = train_data.drop(columns=["hospital_death"])
y = train_data["hospital_death"]

# Step 5: One-Hot Encoding for Categorical Features
categorical_features = ["ethnicity", "gender", "icu_admit_source", "icu_stay_type", "icu_type", "apache_3j_bodysystem", "apache_2_bodysystem"]
encoder = OneHotEncoder(drop='first', sparse=False)

X_encoded = encoder.fit_transform(X[categorical_features])
feature_names = encoder.get_feature_names_out(input_features=categorical_features)
X_categorical = pd.DataFrame(X_encoded, columns=feature_names)
X.drop(columns=categorical_features, inplace=True)
X = pd.concat([X, X_categorical], axis=1)

# Step 6: Scale the Features
numerical_features = [col for col in X.columns if col not in categorical_features]
scaler = StandardScaler()
X[numerical_features] = scaler.fit_transform(X[numerical_features])

# Step 7: Impute Missing Values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Step 8: Train the K-Nearest Neighbors Model
knn_model = KNeighborsClassifier(n_neighbors=21)
knn_model.fit(X, y)

# Step 9: Preprocess the Test Data
# Repeat the same preprocessing steps for the test data
test_encoded = encoder.transform(test_data[categorical_features])
test_categorical = pd.DataFrame(test_encoded, columns=feature_names)
test_data.drop(columns=categorical_features, inplace=True)
test_data = pd.concat([test_data, test_categorical], axis=1)
test_data[numerical_features] = scaler.transform(test_data[numerical_features])
test_data = imputer.transform(test_data)

# Step 10: Make Predictions on Test Data
test_predictions = knn_model.predict_proba(test_data)[:, 1]

# Step 11: Create Submission File
submission = pd.DataFrame({"RecordID": test_data["RecordID"].astype(int).values, "hospital_death": test_predictions})
submission.to_csv("submission_knn.csv", index=False)





IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [15]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer

# Load training and test data
train_data = pd.read_csv("D:/IBA/ML/Competition/train.csv")  # Replace "train.csv" with your training file path
test_data = pd.read_csv("D:/IBA/ML/Competition/test.csv")    # Replace "test.csv" with your test file path

# Data preprocessing
# Separate categorical and numerical features
categorical_features = ["ethnicity", "gender", "icu_admit_source", "icu_stay_type", "icu_type", "apache_3j_bodysystem", "apache_2_bodysystem"]
numerical_features = [col for col in train_data.columns if col not in ["RecordID", "hospital_death"] + categorical_features]

# One-hot encoding for categorical features
encoder = OneHotEncoder(drop='first', sparse=False)
X_categorical = encoder.fit_transform(train_data[categorical_features])
test_data_categorical = encoder.transform(test_data[categorical_features])

# Standardize numerical features
scaler = StandardScaler()
X_numerical = scaler.fit_transform(train_data[numerical_features])
test_data_numerical = scaler.transform(test_data[numerical_features])

# Impute missing values in both training and test data
imputer = SimpleImputer(strategy="mean")
X_numerical_imputed = imputer.fit_transform(X_numerical)
test_data_numerical_imputed = imputer.transform(test_data_numerical)

# Create feature names for one-hot encoded columns
encoded_feature_names = encoder.get_feature_names_out(categorical_features)

# Combine one-hot encoded categorical and standardized numerical features
X = pd.DataFrame(X_categorical, columns=encoded_feature_names)
X[numerical_features] = pd.DataFrame(X_numerical_imputed, columns=numerical_features)

test_data_processed = pd.DataFrame(test_data_categorical, columns=encoded_feature_names)
test_data_processed[numerical_features] = pd.DataFrame(test_data_numerical_imputed, columns=numerical_features)

# Split the training data into features and target variable
y = train_data["hospital_death"]

# Train K-Nearest Neighbor with 18 neighbors originally on the preprocessed data
knn_model = KNeighborsClassifier(n_neighbors=200)
knn_model.fit(X, y)

# Make predictions using the trained model
knn_predictions = knn_model.predict_proba(test_data_processed)

# Create submission DataFrames
submission_knn = pd.DataFrame({"RecordID": test_data["RecordID"], "hospital_death": knn_predictions[:, 1]})

# Save submission file to CSV
submission_knn.to_csv("submission_knn6 - 10.csv", index=False)




In [16]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer  # Import KNNImputer

# Load training and test data
train_data = pd.read_csv("D:/IBA/ML/Competition/train.csv")  # Replace "train.csv" with your training file path
test_data = pd.read_csv("D:/IBA/ML/Competition/test.csv")    # Replace "test.csv" with your test file path

# Data preprocessing
# Separate categorical and numerical features
categorical_features = ["ethnicity", "gender", "icu_admit_source", "icu_stay_type", "icu_type", "apache_3j_bodysystem", "apache_2_bodysystem"]
numerical_features = [col for col in train_data.columns if col not in ["RecordID", "hospital_death"] + categorical_features]

# One-hot encoding for categorical features
encoder = OneHotEncoder(drop='first', sparse=False)
X_categorical = encoder.fit_transform(train_data[categorical_features])
test_data_categorical = encoder.transform(test_data[categorical_features])

# Standardize numerical features
scaler = StandardScaler()
X_numerical = scaler.fit_transform(train_data[numerical_features])
test_data_numerical = scaler.transform(test_data[numerical_features])

# KNN impute missing values in both training and test data
knn_imputer = KNNImputer(n_neighbors=5)  # Set the number of neighbors
X_numerical_imputed = knn_imputer.fit_transform(X_numerical)
test_data_numerical_imputed = knn_imputer.transform(test_data_numerical)

# Create feature names for one-hot encoded columns
encoded_feature_names = encoder.get_feature_names_out(categorical_features)

# Combine one-hot encoded categorical and standardized numerical features
X = pd.DataFrame(X_categorical, columns=encoded_feature_names)
X[numerical_features] = pd.DataFrame(X_numerical_imputed, columns=numerical_features)

test_data_processed = pd.DataFrame(test_data_categorical, columns=encoded_feature_names)
test_data_processed[numerical_features] = pd.DataFrame(test_data_numerical_imputed, columns=numerical_features)

# Split the training data into features and target variable
y = train_data["hospital_death"]

# Train K-Nearest Neighbor with 200 neighbors on the preprocessed data
knn_model = KNeighborsClassifier(n_neighbors=200)
knn_model.fit(X, y)

# Make predictions using the trained model
knn_predictions = knn_model.predict_proba(test_data_processed)

# Create submission DataFrames
submission_knn = pd.DataFrame({"RecordID": test_data["RecordID"], "hospital_death": knn_predictions[:, 1]})

# Save submission file to CSV
submission_knn.to_csv("submission_knn7 - 11.csv", index=False)




In [19]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler  # Import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectKBest, chi2

# Load training and test data
train_data = pd.read_csv("D:/IBA/ML/Competition/train.csv")  # Replace with your training file path
test_data = pd.read_csv("D:/IBA/ML/Competition/test.csv")    # Replace with your test file path

# Data preprocessing
# Separate categorical and numerical features
categorical_features = ["ethnicity", "gender", "icu_admit_source", "icu_stay_type", "icu_type", "apache_3j_bodysystem", "apache_2_bodysystem"]
numerical_features = [col for col in train_data.columns if col not in ["RecordID", "hospital_death"] + categorical_features]

# One-hot encoding for categorical features
encoder = OneHotEncoder(drop='first', sparse=False)
X_categorical = encoder.fit_transform(train_data[categorical_features])
test_data_categorical = encoder.transform(test_data[categorical_features])

# Standardize numerical features
scaler = StandardScaler()
X_numerical = scaler.fit_transform(train_data[numerical_features])
test_data_numerical = scaler.transform(test_data[numerical_features])

# KNN impute missing values in both training and test data
knn_imputer = KNNImputer(n_neighbors=5)
X_numerical_imputed = knn_imputer.fit_transform(X_numerical)
test_data_numerical_imputed = knn_imputer.transform(test_data_numerical)

# Create feature names for one-hot encoded columns
encoded_feature_names = encoder.get_feature_names_out(categorical_features)

# Combine one-hot encoded categorical and standardized numerical features
X = pd.DataFrame(X_categorical, columns=encoded_feature_names)
X[numerical_features] = pd.DataFrame(X_numerical_imputed, columns=numerical_features)

test_data_processed = pd.DataFrame(test_data_categorical, columns=encoded_feature_names)
test_data_processed[numerical_features] = pd.DataFrame(test_data_numerical_imputed, columns=numerical_features)

# Split the training data into features and target variable
y = train_data["hospital_death"]

# Apply Min-Max scaling to ensure non-negative values
min_max_scaler = MinMaxScaler()
X_scaled = min_max_scaler.fit_transform(X)
test_data_scaled = min_max_scaler.transform(test_data_processed)

# Feature selection using SelectKBest with chi-squared statistics
# Select the top 15 features based on statistical significance
selector = SelectKBest(chi2, k=15)
X_selected = selector.fit_transform(X_scaled, y)
test_data_selected = selector.transform(test_data_scaled)

# Train K-Nearest Neighbor with 200 neighbors on the selected features
knn_model = KNeighborsClassifier(n_neighbors=200)
knn_model.fit(X_selected, y)

# Make predictions using the trained model
knn_predictions = knn_model.predict_proba(test_data_selected)

# Create submission DataFrames
submission_knn = pd.DataFrame({"RecordID": test_data["RecordID"], "hospital_death": knn_predictions[:, 1]})

# Save submission file to CSV
submission_knn.to_csv("submission_knn8 - 12.csv", index=False)




In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

# Load training and test data
train_data = pd.read_csv("D:/IBA/ML/Competition/train.csv")
test_data = pd.read_csv("D:/IBA/ML/Competition/test.csv")

# Data preprocessing
categorical_features = ["ethnicity", "gender", "icu_admit_source", "icu_stay_type", "icu_type", "apache_3j_bodysystem", "apache_2_bodysystem"]
numerical_features = [col for col in train_data.columns if col not in ["RecordID", "hospital_death"] + categorical_features]

# One-hot encoding for categorical features
encoder = OneHotEncoder(drop='first', sparse=False)
X_categorical = encoder.fit_transform(train_data[categorical_features])
test_data_categorical = encoder.transform(test_data[categorical_features])

# Standardize numerical features
scaler = StandardScaler()
X_numerical = scaler.fit_transform(train_data[numerical_features])
test_data_numerical = scaler.transform(test_data[numerical_features])

# KNN impute missing values in both training and test data
knn_imputer = KNNImputer(n_neighbors=7)
X_numerical_imputed = knn_imputer.fit_transform(X_numerical)
test_data_numerical_imputed = knn_imputer.transform(test_data_numerical)

# Combine one-hot encoded categorical and standardized numerical features
X = pd.DataFrame(X_categorical, columns=encoder.get_feature_names_out(categorical_features))
X[numerical_features] = pd.DataFrame(X_numerical_imputed, columns=numerical_features)

test_data_processed = pd.DataFrame(test_data_categorical, columns=encoder.get_feature_names_out(categorical_features))
test_data_processed[numerical_features] = pd.DataFrame(test_data_numerical_imputed, columns=numerical_features)

# Split the training data into features and target variable
y = train_data["hospital_death"]

# Use Decision Tree to obtain feature importance scores
dt_model = DecisionTreeClassifier()
dt_model.fit(X, y)

# Get feature importances
feature_importances = dt_model.feature_importances_

# Sort features by importance and select the top k features
k = 15  # Number of top features to select
selected_feature_indices = feature_importances.argsort()[-k:][::-1]

# Train KNN model on the selected features
knn_model = KNeighborsClassifier(n_neighbors=200)
knn_model.fit(X.iloc[:, selected_feature_indices], y)

# Make predictions using the trained KNN model
knn_predictions = knn_model.predict_proba(test_data_processed.iloc[:, selected_feature_indices])

# Create submission DataFrames
submission_knn = pd.DataFrame({"RecordID": test_data["RecordID"], "hospital_death": knn_predictions[:, 1]})

# Save submission file to CSV
submission_knn.to_csv("submission_knn9 - 15(200 neighbours).csv", index=False)


