In [4]:
!pip install imbalanced-learn




In [5]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MaxAbsScaler
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from datetime import datetime

# Function to load and process data from TXT files
def load_and_process_data(file_path, is_train=True, n_features=None):
    """Load a whitespace-separated index file into a binary CSR matrix.

    Each non-blank line holds integers separated by whitespace. When
    ``is_train`` is true the first integer is the class label and the rest
    are feature indices; otherwise every integer is a feature index. Row
    ``i`` of the result has a 1 in every column listed on the ``i``-th
    non-blank line and 0 elsewhere (repeated indices still yield a single 1).

    Args:
        file_path: Path of the text file to read.
        is_train: Whether each line starts with a class label.
        n_features: Optional fixed number of columns. When given, indices
            ``>= n_features`` are dropped so that the result can be made to
            match the width of a previously loaded matrix. When omitted, the
            width is the largest index seen plus one.

    Returns:
        ``(csr_matrix, labels_array)`` when ``is_train`` is true, otherwise
        just the ``csr_matrix``.

    Raises:
        ValueError: If the file has no non-blank lines, a token is not an
            integer, or a feature index is negative.
    """
    labels = [] if is_train else None
    col_indices = []  # concatenated column indices of every row
    row_ptr = [0]     # CSR row pointer: row i spans row_ptr[i]:row_ptr[i + 1]

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            if not parts:  # Skip empty / whitespace-only lines
                continue
            if is_train:
                labels.append(int(parts[0]))  # Extract class label
                parts = parts[1:]
            # De-duplicate so a repeated index stays 1 (CSR construction
            # would otherwise sum duplicates), and sort for canonical order.
            row = sorted({int(token) for token in parts})
            if row and row[0] < 0:
                raise ValueError(
                    f"Negative feature index {row[0]} in {file_path!r}"
                )
            if n_features is not None:
                row = [idx for idx in row if idx < n_features]
            col_indices.extend(row)
            row_ptr.append(len(col_indices))

    n_rows = len(row_ptr) - 1
    if n_rows == 0:
        raise ValueError(f"No data rows found in {file_path!r}")
    if n_features is None:
        # A file whose rows all lack features yields a zero-width matrix.
        n_features = max(col_indices) + 1 if col_indices else 0

    # Build the sparse matrix directly instead of materialising a dense
    # (n_rows x n_features) array first.
    csr_feature_matrix = csr_matrix(
        (
            np.ones(len(col_indices), dtype=int),
            np.asarray(col_indices, dtype=np.int64),
            np.asarray(row_ptr, dtype=np.int64),
        ),
        shape=(n_rows, n_features),
        dtype=int,
    )
    if is_train:
        return csr_feature_matrix, np.array(labels)
    return csr_feature_matrix

# Feature preprocessing, applied in order: variance-based feature filter,
# truncated SVD down to 80 components, then max-abs scaling.
_preprocessing_steps = [
    ('var_thresh', VarianceThreshold(threshold=0.03)),
    ('svd', TruncatedSVD(n_components=80, random_state=42)),
    ('scaler', MaxAbsScaler()),
]
pre_processing_pipeline = Pipeline(_preprocessing_steps)

# Class-imbalance strategies to compare, keyed by the name used in the
# printed report and in the output file name. All share the same seed.
resampling_techniques = dict(
    RandomOversampling=RandomOverSampler(random_state=42),
    SMOTE=SMOTE(random_state=42),
    ADASYN=ADASYN(random_state=42),
    RandomUndersampling=RandomUnderSampler(random_state=42),
)

# Search space for the decision tree; the "classifier__" prefix routes each
# parameter to the pipeline step named 'classifier'.
param_grid = {}
param_grid['classifier__max_depth'] = range(2, 10)
param_grid['classifier__min_samples_leaf'] = range(1, 6)
param_grid['classifier__min_samples_split'] = range(2, 6)

# Load the raw training data
X, y = load_and_process_data('./data/train_data.txt', is_train=True)

# Train/validation split on the RAW features. Splitting before fitting the
# preprocessing keeps validation rows out of the fitted variance filter, SVD
# and scaler, so the validation score is not optimistically biased.
# stratify=y keeps the class ratio the same in both parts, which matters for
# imbalanced labels.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the preprocessing on the training part only, then apply it unchanged
X_train = pre_processing_pipeline.fit_transform(X_train_raw)
X_val = pre_processing_pipeline.transform(X_val_raw)

# Kept so that any code still referring to X_preprocessed continues to work;
# it is now produced by the pipeline fitted on the training part.
X_preprocessed = pre_processing_pipeline.transform(X)

# Function to train and predict with different imbalance techniques
def train_and_predict_with_resampling(technique_name, resampler):
    """Tune a decision tree behind ``resampler`` and write test predictions.

    Runs a 4-fold grid search (scored by F1) over ``param_grid`` on the
    module-level ``X_train`` / ``y_train``, prints the best parameters and
    the validation metrics on ``X_val`` / ``y_val``, then predicts
    ``./data/test_data.txt`` and writes one prediction per line to a
    timestamped file under ``./result``.

    Args:
        technique_name: Label used in the printed report and the file name.
        resampler: Resampling step placed before the classifier in the
            pipeline.

    Returns:
        None. Results are printed and written to disk.
    """
    import os  # local import: only needed to create the output directory

    pipeline = Pipeline([
        ('resampler', resampler),
        ('classifier', DecisionTreeClassifier(class_weight='balanced', random_state=42))
    ])

    # Hyperparameter tuning using GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid, cv=4, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_val_pred = best_model.predict(X_val)

    # Display results
    print(f"\nUsing {technique_name}:")
    print("Best parameters:", grid_search.best_params_)
    print(f"F1-Score (Validation): {f1_score(y_val, y_val_pred):.4f}")
    print(classification_report(y_val, y_val_pred))

    # Load the test data
    X_test = load_and_process_data('./data/test_data.txt', is_train=False)

    # The loader sizes the matrix from the largest index in the file it
    # reads, so the test matrix can be narrower or wider than the matrix the
    # preprocessing was fitted on. Align the width before transforming:
    # padding adds all-zero columns, truncating drops indices the fitted
    # preprocessing has never seen.
    expected_width = getattr(
        pre_processing_pipeline.steps[0][1], 'n_features_in_', None
    )
    if expected_width is not None and X_test.shape[1] != expected_width:
        X_test.resize((X_test.shape[0], expected_width))

    X_test_preprocessed = pre_processing_pipeline.transform(X_test)

    # Predict on the test data
    test_predictions = best_model.predict(X_test_preprocessed)

    # Save predictions to a TXT file, creating the output folder if needed
    output_file = f"./result/format_file_{technique_name}_{datetime.now().strftime('%Y%m%d-%H%M')}.txt"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w') as file:
        file.writelines(str(prediction) + '\n' for prediction in test_predictions)

    print(f"Predictions saved to {output_file}")

# Run the tune / evaluate / predict cycle once per configured technique
for technique_name in resampling_techniques:
    resampler = resampling_techniques[technique_name]
    train_and_predict_with_resampling(technique_name, resampler)


ModuleNotFoundError: No module named 'imblearn'