import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Load datasets
train_df = pd.read_csv("hacktrain.csv")
test_df = pd.read_csv("hacktest.csv")

# Drop unnecessary column if present
if "Unnamed: 0" in train_df.columns:
    train_df.drop(columns=["Unnamed: 0"], inplace=True)
if "Unnamed: 0" in test_df.columns:
    test_df.drop(columns=["Unnamed: 0"], inplace=True)

# Extract features and target
X = train_df.drop(columns=["ID", "class"])
y = train_df["class"]
X_test = test_df.drop(columns=["ID"])
test_ids = test_df["ID"]

# Preprocessing pipeline
numeric_features = X.columns.tolist()
preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]), numeric_features)
])

# Complete pipeline with logistic regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs'))
])

# Fit the model
pipeline.fit(X, y)

# Predict test data
preds = pipeline.predict(X_test)

# Create submission DataFrame
submission = pd.DataFrame({
    'ID': test_ids,
    'class': preds
})

# Save to CSV
submission.to_csv("submission.csv", index=False)
print("Submission saved as submission.csv")

In [3]:
import pandas as pd
import gdown
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Load datasets
train_url = 'https://drive.google.com/uc?id=1tKyMi7ERS8lkRr0BSRMYJn3tv3ryLtT3'
test_url = 'https://drive.google.com/uc?id=1Pd9a30DXPZHgSSzv2y5Hsc44A5oElxKV'

gdown.download(train_url, 'hacktrain.csv', quiet=False)
gdown.download(test_url, 'hacktest.csv', quiet=False)

train_df = pd.read_csv('hacktrain.csv')
test_df = pd.read_csv('hacktest.csv')

# Drop unnecessary column if present
if "Unnamed: 0" in train_df.columns:
    train_df.drop(columns=["Unnamed: 0"], inplace=True)
if "Unnamed: 0" in test_df.columns:
    test_df.drop(columns=["Unnamed: 0"], inplace=True)

# Extract features and target
X = train_df.drop(columns=["ID", "class"])
y = train_df["class"]
X_test = test_df.drop(columns=["ID"])
test_ids = test_df["ID"]

# Preprocessing pipeline
numeric_features = X.columns.tolist()
preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]), numeric_features)
])

# Complete pipeline with logistic regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs'))
])

# Fit the model
pipeline.fit(X, y)

# Predict test data
preds = pipeline.predict(X_test)

# Create submission DataFrame
submission = pd.DataFrame({
    'ID': test_ids,
    'class': preds
})

# Save to CSV
submission.to_csv("submission.csv", index=False)
print("Submission saved as submission.csv")

Downloading...
From: https://drive.google.com/uc?id=1tKyMi7ERS8lkRr0BSRMYJn3tv3ryLtT3
To: /content/hacktrain.csv
100%|██████████| 1.67M/1.67M [00:00<00:00, 145MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Pd9a30DXPZHgSSzv2y5Hsc44A5oElxKV
To: /content/hacktest.csv
100%|██████████| 634k/634k [00:00<00:00, 96.9MB/s]


Submission saved as submission.csv


In [5]:
import pandas as pd
import numpy as np
import gdown
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.metrics import classification_report, accuracy_score

# Load datasets
train_url = 'https://drive.google.com/uc?id=1tKyMi7ERS8lkRr0BSRMYJn3tv3ryLtT3'
test_url = 'https://drive.google.com/uc?id=1Pd9a30DXPZHgSSzv2y5Hsc44A5oElxKV'

gdown.download(train_url, 'hacktrain.csv', quiet=False)
gdown.download(test_url, 'hacktest.csv', quiet=False)

train_df = pd.read_csv('hacktrain.csv')
test_df = pd.read_csv('hacktest.csv')

# Drop unnecessary column if present
if "Unnamed: 0" in train_df.columns:
    train_df.drop(columns=["Unnamed: 0"], inplace=True)
if "Unnamed: 0" in test_df.columns:
    test_df.drop(columns=["Unnamed: 0"], inplace=True)

# Extract features and target
X = train_df.drop(columns=["ID", "class"])
y = train_df["class"]
X_test = test_df.drop(columns=["ID"])
test_ids = test_df["ID"]

print(f"Training data shape: {X.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Target classes: {y.unique()}")

# METHOD 1: Enhanced Preprocessing with Robust Scaling and Polynomial Features
print("\n=== METHOD 1: Enhanced Preprocessing ===")

numeric_features = X.columns.tolist()

# Use RobustScaler instead of StandardScaler (less sensitive to outliers)
enhanced_preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),  # median often better than mean
        ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)),
        ('scaler', RobustScaler())
    ]), numeric_features)
])

enhanced_pipeline = Pipeline(steps=[
    ('preprocessor', enhanced_preprocessor),
    ('classifier', LogisticRegression(max_iter=2000, multi_class='multinomial', solver='lbfgs'))
])

enhanced_pipeline.fit(X, y)
enhanced_preds = enhanced_pipeline.predict(X_test)

# METHOD 2: Hyperparameter Tuning with GridSearchCV
print("\n=== METHOD 2: Hyperparameter Tuning ===")

basic_preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]), numeric_features)
])

# Define parameter grid
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2', 'elasticnet'],
    'classifier__solver': ['liblinear', 'saga'],
    'classifier__max_iter': [1000, 2000]
}

# Create pipeline for grid search
grid_pipeline = Pipeline(steps=[
    ('preprocessor', basic_preprocessor),
    ('classifier', LogisticRegression(multi_class='multinomial'))
])

# Perform grid search
grid_search = GridSearchCV(
    grid_pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X, y)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

tuned_preds = grid_search.predict(X_test)

# METHOD 3: Feature Selection with Logistic Regression
print("\n=== METHOD 3: Feature Selection ===")

# Using SelectKBest for feature selection
feature_selector = SelectKBest(score_func=f_classif, k='all')  # Start with all, then optimize k

fs_preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('selector', feature_selector)
    ]), numeric_features)
])

# Try different numbers of features
best_score = 0
best_k = None
k_values = [int(len(numeric_features) * ratio) for ratio in [0.5, 0.7, 0.8, 0.9, 1.0]]

for k in k_values:
    fs_preprocessor.named_transformers_['num'].named_steps['selector'].k = k

    fs_pipeline = Pipeline(steps=[
        ('preprocessor', fs_preprocessor),
        ('classifier', LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs'))
    ])

    # Cross-validation score
    cv_scores = cross_val_score(fs_pipeline, X, y, cv=5, scoring='accuracy')
    mean_score = cv_scores.mean()

    print(f"k={k}: CV accuracy = {mean_score:.4f} (+/- {cv_scores.std() * 2:.4f})")

    if mean_score > best_score:
        best_score = mean_score
        best_k = k

print(f"Best k: {best_k}")

# Train with best k
fs_preprocessor.named_transformers_['num'].named_steps['selector'].k = best_k
fs_pipeline = Pipeline(steps=[
    ('preprocessor', fs_preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs'))
])

fs_pipeline.fit(X, y)
fs_preds = fs_pipeline.predict(X_test)

# METHOD 4: Recursive Feature Elimination
print("\n=== METHOD 4: Recursive Feature Elimination ===")

rfe_preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]), numeric_features)
])

# First fit preprocessor to get feature names
rfe_preprocessor.fit(X)
X_preprocessed = rfe_preprocessor.transform(X)

# Apply RFE
estimator = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
rfe = RFE(estimator, n_features_to_select=best_k if best_k else len(numeric_features)//2)
rfe.fit(X_preprocessed, y)

print(f"Selected {rfe.n_features_} features out of {X_preprocessed.shape[1]}")

# Create final RFE pipeline
rfe_pipeline = Pipeline(steps=[
    ('preprocessor', rfe_preprocessor),
    ('selector', rfe),
    ('classifier', LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs'))
])

rfe_pipeline.fit(X, y)
rfe_preds = rfe_pipeline.predict(X_test)

# METHOD 5: Ensemble of Different Configurations
print("\n=== METHOD 5: Ensemble Approach ===")

# Combine predictions from different methods using majority voting
from scipy import stats

# Stack all predictions
all_predictions = np.column_stack([
    enhanced_preds,
    tuned_preds,
    fs_preds,
    rfe_preds
])

# Majority vote for each sample
ensemble_preds = []
for i in range(len(all_predictions)):
    mode_result = stats.mode(all_predictions[i], keepdims=True)
    ensemble_preds.append(mode_result.mode[0])

ensemble_preds = np.array(ensemble_preds)

# METHOD 6: Class Weight Balancing
print("\n=== METHOD 6: Class Weight Balancing ===")

# Check class distribution
print(f"Class distribution: {y.value_counts().to_dict()}")

balanced_pipeline = Pipeline(steps=[
    ('preprocessor', basic_preprocessor),
    ('classifier', LogisticRegression(
        max_iter=2000,
        multi_class='multinomial',
        solver='lbfgs',
        class_weight='balanced'  # Automatically balance classes
    ))
])

balanced_pipeline.fit(X, y)
balanced_preds = balanced_pipeline.predict(X_test)

# Create submissions for all methods
methods = {
    'enhanced': enhanced_preds,
    'tuned': tuned_preds,
    'feature_selected': fs_preds,
    'rfe': rfe_preds,
    'ensemble': ensemble_preds,
    'balanced': balanced_preds
}

for method_name, preds in methods.items():
    submission = pd.DataFrame({
        'ID': test_ids,
        'class': preds
    })
    submission.to_csv(f"submission_{method_name}.csv", index=False)
    print(f"Submission saved as submission_{method_name}.csv")

print("\n=== Cross-Validation Scores Comparison ===")
pipelines = {
    'Enhanced': enhanced_pipeline,
    'Feature Selected': fs_pipeline,
    'RFE': rfe_pipeline,
    'Balanced': balanced_pipeline
}

for name, pipeline in pipelines.items():
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
    print(f"{name}: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

print("\nRecommendation: Try the ensemble approach first, then compare with individual methods!")

Downloading...
From: https://drive.google.com/uc?id=1tKyMi7ERS8lkRr0BSRMYJn3tv3ryLtT3
To: /content/hacktrain.csv
100%|██████████| 1.67M/1.67M [00:00<00:00, 43.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Pd9a30DXPZHgSSzv2y5Hsc44A5oElxKV
To: /content/hacktest.csv
100%|██████████| 634k/634k [00:00<00:00, 102MB/s]


Training data shape: (8000, 27)
Test data shape: (2845, 27)
Target classes: ['water' 'forest' 'impervious' 'farm' 'grass' 'orchard']

=== METHOD 1: Enhanced Preprocessing ===





=== METHOD 2: Hyperparameter Tuning ===
Fitting 5 folds for each of 60 candidates, totalling 300 fits


200 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, 

Best parameters: {'classifier__C': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
Best cross-validation score: 0.8555

=== METHOD 3: Feature Selection ===


AttributeError: 'ColumnTransformer' object has no attribute 'transformers_'

In [4]:
from google.colab import files
files.download("submission.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>