In [9]:
from itertools import combinations
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE,RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix

# CSV file
file_path = 'data/iith_foml_2023_train.csv'
data = pd.read_csv(file_path)

# Assuming the target variable column is named "Target Variable (Discrete)"
X = data.drop(columns=['Target Variable (Discrete)'])
y = data['Target Variable (Discrete)']

# Impute missing values using SimpleImputer
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Function to create sub-samples with specific class distribution
def create_subsamples(X, y, num_samples_per_class=10):
    subsamples = []
    unique_classes = y.unique()

    # Classes with a single-digit number of instances
    single_digit_classes = y.value_counts()[y.value_counts() < 10].index.tolist()

    # For each combination of 1, 0, 2, 6, 5, select num_samples_per_class instances without replacement
    for class_combination in combinations([1, 0, 2, 6, 5], num_samples_per_class):
        # Include classes with a single-digit number of instances
        class_combination += tuple(single_digit_classes)

        # Select instances for the sub-sample
        sub_sample_indices = []
        for cls in class_combination:
            # If there are more instances than required, sample without replacement
            if y[y == cls].shape[0] > num_samples_per_class:
                sub_sample_indices.extend(y[y == cls].sample(num_samples_per_class, replace=False).index)
            else:
                sub_sample_indices.extend(y[y == cls].index)

        subsamples.append((X.iloc[sub_sample_indices], y.iloc[sub_sample_indices]))

    return subsamples

# Function to train a random forest classifier on each sub-sample
def train_random_forest_models(subsamples):
    models = []

    for X_sub, y_sub in subsamples:
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_sub, y_sub)
        models.append(model)

    return models

# Function to make predictions using the ensemble of models
def ensemble_predict(models, X):
    predictions = []

    for model in models:
        prediction = model.predict(X)
        predictions.append(prediction)

    # Convert predictions to integers
    predictions = np.array(predictions, dtype=int)

    # Use voting to get the final prediction
    final_predictions = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=predictions)

    return final_predictions


# Function to evaluate the ensemble on test data and write results to a CSV file
def predict_and_write_results_ensemble(models, test_data_path, output_file_path):
    # Load test data
    test_data = pd.read_csv(test_data_path)

    # Impute missing values in test data
    X_test_imputed = pd.DataFrame(imputer.transform(test_data), columns=test_data.columns)

    # Make predictions using the ensemble of models
    final_predictions = ensemble_predict(models, X_test_imputed)

    # Create a DataFrame for results with ID and Predicted Category columns
    results_df = pd.DataFrame({
        'ID': np.arange(1, len(final_predictions) + 1),  # Assuming IDs start from 1
        'Category': final_predictions
    })

    # Write the results to a CSV file
    results_df.to_csv(output_file_path, index=False)

# Create sub-samples
subsamples = create_subsamples(X_imputed, y)

# Train random forest models on sub-samples
random_forest_models = train_random_forest_models(subsamples)

# Example usage:
# Replace 'your_test_data.csv', 'output_results.csv' with your actual file paths
predict_and_write_results_ensemble(random_forest_models, 'data/iith_foml_2023_test.csv', 'output_results.csv')


ValueError: attempt to get argmax of an empty sequence