In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score


In [2]:
# URLs
train_url = "https://raw.githubusercontent.com/Tenacioussoul/Data-/refs/heads/main/hacktrain.csv"
test_url = "https://raw.githubusercontent.com/Tenacioussoul/Data-/refs/heads/main/hacktest.csv"

# Load CSV files
train = pd.read_csv(train_url)
test = pd.read_csv(test_url)

print("Train shape:", train.shape)
print("Test shape:", test.shape)
train.head()


Train shape: (8000, 30)
Test shape: (2845, 29)


Unnamed: 0.1,Unnamed: 0,ID,class,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,1,water,637.595,658.668,-1882.03,-1924.36,997.904,-1739.99,630.087,...,,-1043.16,-1942.49,267.138,,,211.328,-2203.02,-1180.19,433.906
1,1,2,water,634.24,593.705,-1625.79,-1672.32,914.198,-692.386,707.626,...,,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.0,-1360.56,524.075
2,3,4,water,58.0174,-1599.16,,-1052.63,,-1564.63,,...,-1025.88,368.622,,-1227.8,304.621,,369.214,-2202.12,,-1343.55
3,4,5,water,72.518,,380.436,-1256.93,515.805,-1413.18,-802.942,...,-1813.95,155.624,,-924.073,432.15,282.833,298.32,-2197.36,,-826.727
4,7,8,water,1136.44,,,1647.83,1935.8,,2158.98,...,1535.0,1959.43,-279.317,-384.915,-113.406,1020.72,1660.65,-116.801,-568.05,-1357.14


In [63]:
print("🔎 Raw class values before mapping:")
print(train['class'].unique())


🔎 Raw class values before mapping:
[]


In [65]:
print("🧹 Null class entries after mapping:", train['class'].isna().sum())


🧹 Null class entries after mapping: 0


In [66]:
print("🛑 Rows with unmapped class labels:")
print(train[train['class'].isna()])


🛑 Rows with unmapped class labels:
Empty DataFrame
Columns: [Unnamed: 0, ID, class, 20150720_N, 20150602_N, 20150517_N, 20150501_N, 20150415_N, 20150330_N, 20150314_N, 20150226_N, 20150210_N, 20150125_N, 20150109_N, 20141117_N, 20141101_N, 20141016_N, 20140930_N, 20140813_N, 20140626_N, 20140610_N, 20140525_N, 20140509_N, 20140423_N, 20140407_N, 20140322_N, 20140218_N, 20140202_N, 20140117_N, 20140101_N]
Index: []

[0 rows x 30 columns]


In [64]:
train['class'] = train['class'].astype(str).str.strip().str.lower()

label_map = {
    'water': 0,
    'forest': 1,
    'impervious': 2,
    'farm': 3,
    'grass': 4,
    'orchard': 5
}
train['class'] = train['class'].map(label_map)


In [67]:
# Standardize labels to lowercase
train['class'] = train['class'].astype(str).str.strip().str.lower()

# Label encoding dictionary
label_map = {
    'water': 0,
    'forest': 1,
    'impervious': 2,
    'farm': 3,
    'grass': 4,
    'orchard': 5
}

# Map classes
train['class'] = train['class'].map(label_map)

# Check for issues
print("Null values in class column after mapping:", train['class'].isna().sum())
print("🧾 Unique classes after mapping:", train['class'].unique())

# Drop rows with missing values
train = train.dropna()


Null values in class column after mapping: 0
🧾 Unique classes after mapping: []


In [3]:
# Drop 'seqn' if it exists
for col in ['seqn', 'SEQN']:
    if col in train.columns:
        train.drop(col, axis=1, inplace=True)
    if col in test.columns:
        test.drop(col, axis=1, inplace=True)

# Find the gender column
gender_col = None
for col in train.columns:
    if "gend" in col.lower():
        gender_col = col
        break

# Encode gender if found
if gender_col:
    train[gender_col] = train[gender_col].map({1: 0, 2: 1})
    test[gender_col] = test[gender_col].map({1: 0, 2: 1})
    print(f"✅ Encoded gender in column: {gender_col}")
else:
    print("⚠️ Gender column not found! Skipping encoding.")


⚠️ Gender column not found! Skipping encoding.


In [4]:
from sklearn.impute import SimpleImputer

# Drop unwanted columns first
cols_to_drop = ["ID", "Unnamed: 0"]
train = train.drop(columns=cols_to_drop, errors='ignore')
test = test.drop(columns=cols_to_drop, errors='ignore')

# Now select numeric columns after dropping
numeric_cols = train.select_dtypes(include=['number']).columns

# Median imputation on numeric columns
imputer = SimpleImputer(strategy="median")

train[numeric_cols] = imputer.fit_transform(train[numeric_cols])
test[numeric_cols] = imputer.transform(test[numeric_cols])

print("✅ Missing values in numeric columns handled successfully.")


✅ Missing values in numeric columns handled successfully.


In [5]:
print("🧾 Columns in your training dataset:")
print(train.columns.tolist())


🧾 Columns in your training dataset:
['class', '20150720_N', '20150602_N', '20150517_N', '20150501_N', '20150415_N', '20150330_N', '20150314_N', '20150226_N', '20150210_N', '20150125_N', '20150109_N', '20141117_N', '20141101_N', '20141016_N', '20140930_N', '20140813_N', '20140626_N', '20140610_N', '20140525_N', '20140509_N', '20140423_N', '20140407_N', '20140322_N', '20140218_N', '20140202_N', '20140117_N', '20140101_N']


In [41]:
# Check dtype of age_group
print(train["age_group"].unique())
print(train["age_group"].dtype)

# Apply string processing only if the column is of object/string type
if train["age_group"].dtype == "object":
    train["age_group"] = train["age_group"].str.strip().str.capitalize()
    train["age_group"] = train["age_group"].map({"Adult": 0, "Senior": 1})

# If it's already numeric (0/1), nothing more is needed
elif train["age_group"].dtype in ["int64", "float64"]:
    print("✅ 'age_group' is already numeric. No mapping needed.")
else:
    raise ValueError("❌ 'age_group' column is not in expected format.")


[]
int64
✅ 'age_group' is already numeric. No mapping needed.


In [None]:
from sklearn.impute import SimpleImputer

# Step 4: Separate features and target BEFORE imputation
X = train.drop("class", axis=1)
y = train["class"]

# Imputer for numeric features only
imputer = SimpleImputer(strategy="median")
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
test_imputed = pd.DataFrame(imputer.transform(test), columns=test.columns)

# Reattach target
train = X_imputed
train["class"] = y

# Replace test
test = test_imputed

print("✅ Missing values handled successfully.")


In [35]:
print("Total rows in train:", train.shape[0])
print("Number of missing values in 'class':", train['class'].isna().sum())
print("Unique values in 'class':", train['class'].unique())


Total rows in train: 0
Number of missing values in 'class': 0
Unique values in 'class': []


In [43]:
print("✅ Shape of X:", X.shape)
print("✅ Shape of y:", y.shape)
print("✅ y.value_counts():")
print(y.value_counts(dropna=False))


✅ Shape of X: (0, 27)
✅ Shape of y: (0,)
✅ y.value_counts():
Series([], Name: count, dtype: int64)


In [None]:
# Reload the data fresh
train = pd.read_csv("https://raw.githubusercontent.com/Tenacioussoul/Data-/refs/heads/main/hacktrain.csv")

# Drop rows where class is NaN BEFORE imputation
train = train.dropna(subset=["class"])

# Separate features and labels
X = train.drop("class", axis=1)
y = train["class"]

# Impute only the features
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Now you're safe to split
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X_imputed, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Initialize the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Step 2: Train the model
model.fit(X_train, y_train)

# Step 3: Predict on validation set
y_pred = model.predict(X_val)

# Step 4: Evaluate the model
print("✅ Model Evaluation Results:")
print("Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Plot the top 10 features
importances = model.feature_importances_
indices = np.argsort(importances)[::-1][:10]

plt.figure(figsize=(10, 6))
plt.title("Top 10 Feature Importances")
plt.bar(range(len(indices)), importances[indices], align="center")
plt.xticks(range(len(indices)), [X_train.columns[i] for i in indices], rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Drop ID and Unnamed: 0 from test before prediction (if present)
test = test.drop(columns=["ID", "Unnamed: 0"], errors='ignore')


In [None]:
cols_to_drop = ["ID", "Unnamed: 0"]
test = test.drop(columns=cols_to_drop, errors='ignore')


In [None]:
print(train.columns.tolist())


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Converts 'forest', 'urban' to 0, 1, ...

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Split features and labels
X = train.drop("age_group", axis=1)
y = train["age_group"]

# Train/test split (optional, for validation)
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [11]:
# Check your columns first
print(train.columns)

# Use correct column name
X = train.drop("class", axis=1)
y = train["class"]

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


Index(['class', '20150720_N', '20150602_N', '20150517_N', '20150501_N',
       '20150415_N', '20150330_N', '20150314_N', '20150226_N', '20150210_N',
       '20150125_N', '20150109_N', '20141117_N', '20141101_N', '20141016_N',
       '20140930_N', '20140813_N', '20140626_N', '20140610_N', '20140525_N',
       '20140509_N', '20140423_N', '20140407_N', '20140322_N', '20140218_N',
       '20140202_N', '20140117_N', '20140101_N'],
      dtype='object')


In [12]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [13]:
test_predictions = model.predict(test)


In [37]:
train["age_group"] = train["age_group"].astype(str).str.strip().str.capitalize()


In [38]:
train["age_group"] = train["age_group"].map({"Adult": 0, "Senior": 1})


In [50]:
print("🔍 Columns in train.csv:")
print(train.columns.tolist())


🔍 Columns in train.csv:
['class', '20150720_N', '20150602_N', '20150517_N', '20150501_N', '20150415_N', '20150330_N', '20150314_N', '20150226_N', '20150210_N', '20150125_N', '20150109_N', '20141117_N', '20141101_N', '20141016_N', '20140930_N', '20140813_N', '20140626_N', '20140610_N', '20140525_N', '20140509_N', '20140423_N', '20140407_N', '20140322_N', '20140218_N', '20140202_N', '20140117_N', '20140101_N']


In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Load
train = pd.read_csv("https://raw.githubusercontent.com/Tenacioussoul/Data-/refs/heads/main/hacktrain.csv")
test = pd.read_csv("https://raw.githubusercontent.com/Tenacioussoul/Data-/refs/heads/main/hacktest.csv")

# Drop identifier columns
train.drop(columns=["Unnamed: 0", "ID"], errors='ignore', inplace=True)
test.drop(columns=["Unnamed: 0", "ID"], errors='ignore', inplace=True)

# Clean string labels
train["class"] = train["class"].astype(str).str.strip().str.capitalize()

# Encode labels
le = LabelEncoder()
train["class_encoded"] = le.fit_transform(train["class"])

# Print mapping
print("🧭 Label Mapping:")
for i, label in enumerate(le.classes_):
    print(f"{label} → {i}")

# Prepare data
X = train.drop(columns=["class", "class_encoded"])
y = train["class_encoded"]

# Train/test split
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, random_state=42)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on test
test_predictions = model.predict(test)

# Decode predictions to original class names
predicted_labels = le.inverse_transform(test_predictions)

# Create submission
submission = pd.DataFrame({"land_cover_type": predicted_labels})
submission.to_csv("hackathon-submission.csv", index=False)

print("✅ Submission file created.")
print(submission["land_cover_type"].value_counts())


🧭 Label Mapping:
Farm → 0
Forest → 1
Grass → 2
Impervious → 3
Orchard → 4
Water → 5
✅ Submission file created.
land_cover_type
Forest        1662
Farm           579
Impervious     365
Water          120
Grass          119
Name: count, dtype: int64


In [54]:
print("📋 Columns in train:", train.columns.tolist())
print("🔢 Shape of train:", train.shape)


📋 Columns in train: ['class', '20150720_N', '20150602_N', '20150517_N', '20150501_N', '20150415_N', '20150330_N', '20150314_N', '20150226_N', '20150210_N', '20150125_N', '20150109_N', '20141117_N', '20141101_N', '20141016_N', '20140930_N', '20140813_N', '20140626_N', '20140610_N', '20140525_N', '20140509_N', '20140423_N', '20140407_N', '20140322_N', '20140218_N', '20140202_N', '20140117_N', '20140101_N', 'class_encoded']
🔢 Shape of train: (8000, 29)


In [55]:
X = train.drop(columns=["class"])
y = train["class"]


In [57]:
print(train.shape)
print(train.columns)


(8000, 29)
Index(['class', '20150720_N', '20150602_N', '20150517_N', '20150501_N',
       '20150415_N', '20150330_N', '20150314_N', '20150226_N', '20150210_N',
       '20150125_N', '20150109_N', '20141117_N', '20141101_N', '20141016_N',
       '20140930_N', '20140813_N', '20140626_N', '20140610_N', '20140525_N',
       '20140509_N', '20140423_N', '20140407_N', '20140322_N', '20140218_N',
       '20140202_N', '20140117_N', '20140101_N', 'class_encoded'],
      dtype='object')


In [60]:
import pandas as pd

# Replace these with correct filenames or URLs
train = pd.read_csv("https://raw.githubusercontent.com/Tenacioussoul/Data-/refs/heads/main/hacktrain.csv")
test = pd.read_csv("https://raw.githubusercontent.com/Tenacioussoul/Data-/refs/heads/main/hacktest.csv")

# Inspect structure
print("Train columns:", train.columns)
print("Sample of train data:")
print(train.head())


Train columns: Index(['Unnamed: 0', 'ID', 'class', '20150720_N', '20150602_N', '20150517_N',
       '20150501_N', '20150415_N', '20150330_N', '20150314_N', '20150226_N',
       '20150210_N', '20150125_N', '20150109_N', '20141117_N', '20141101_N',
       '20141016_N', '20140930_N', '20140813_N', '20140626_N', '20140610_N',
       '20140525_N', '20140509_N', '20140423_N', '20140407_N', '20140322_N',
       '20140218_N', '20140202_N', '20140117_N', '20140101_N'],
      dtype='object')
Sample of train data:
   Unnamed: 0  ID  class  20150720_N  20150602_N  20150517_N  20150501_N  \
0           0   1  water    637.5950     658.668   -1882.030    -1924.36   
1           1   2  water    634.2400     593.705   -1625.790    -1672.32   
2           3   4  water     58.0174   -1599.160         NaN    -1052.63   
3           4   5  water     72.5180         NaN     380.436    -1256.93   
4           7   8  water   1136.4400         NaN         NaN     1647.83   

   20150415_N  20150330_N  2015031

In [71]:
print("✅ Original shape:", train.shape)

# Step 1: Standardize class labels
train['class'] = train['class'].astype(str).str.strip().str.lower()
print("✅ After standardizing class labels:", train['class'].unique())

# Step 2: Mapping labels to integers
train['class'] = train['class'].map(label_map)
print("✅ After mapping:", train['class'].unique())
print("✅ Shape after mapping:", train.shape)

# Step 3: Check for unmapped
print("🧪 Null classes count:", train['class'].isna().sum())

# Step 4: Drop remaining NaNs
train = train.dropna()
print("✅ Final shape after dropna():", train.shape)


✅ Original shape: (0, 30)
✅ After standardizing class labels: []
✅ After mapping: []
✅ Shape after mapping: (0, 30)
🧪 Null classes count: 0
✅ Final shape after dropna(): (0, 30)


In [74]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Load the data
train_url = "https://raw.githubusercontent.com/Tenacioussoul/Data-/refs/heads/main/hacktrain.csv"
test_url = "https://raw.githubusercontent.com/Tenacioussoul/Data-/refs/heads/main/hacktest.csv"

train = pd.read_csv(train_url)
test = pd.read_csv(test_url)

print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Data

Train shape: (8000, 30)
Test shape: (2845, 29)


In [84]:
import pandas as pd

df = pd.read_csv("submission.csv")

# Check shape
print("Shape:", df.shape)  # Expected: (2844, 2) or similar

# Check column names
print("Columns:", df.columns.tolist())  # Must be ['ID', 'age_group']

# Check for invalid values
print("Invalid age_group values:", df[~df['age_group'].isin([0, 1])])

# Check ID sequence
print("Missing IDs:", set(range(1, len(df)+1)) - set(df['ID']))

Shape: (2845, 2)
Columns: ['ID', 'age_group']
Invalid age_group values: Empty DataFrame
Columns: [ID, age_group]
Index: []
Missing IDs: set()


In [86]:
# Generate corrected submission file
with open('submission_fixed.csv', 'w') as f:
    # Write header
    f.write("ID,age_group\n")
    
    # Write 2844 rows of data (IDs 1 to 2844)
    for i in range(1, 2845):
        # Use your original age_group values here
        # This is a template - insert your actual predictions
        age_group = 0  # Replace with your actual prediction for ID=i
        f.write(f"{i},{age_group}\n")

Train shape: (8000, 29), Test shape: (2845, 28)

Final submission shape: (2845, 2)
Columns: ['ID', 'age_group']
ID example: 1
Prediction distribution:
age_group
0    2162
1     683
Name: count, dtype: int64

Submission file created with correct format!


In [87]:
# Read original file
with open('submission.csv', 'r') as f:
    lines = f.readlines()

# Remove last row (keep header + 2844 data rows)
corrected_lines = lines[:-1]

# Write corrected file
with open('submission_fixed.csv', 'w') as f:
    f.writelines(corrected_lines)

print("Fixed file saved as submission_fixed.csv")

Fixed file saved as submission_fixed.csv


In [88]:
# Generate corrected submission file
with open('submission_fixed.csv', 'w') as f:
    # Write header
    f.write("ID,age_group\n")
    
    # Write 2844 rows of data (IDs 1 to 2844)
    for i in range(1, 2845):
        # Use your original age_group values here
        # This is a template - insert your actual predictions
        age_group = 0  # Replace with your actual prediction for ID=i
        f.write(f"{i},{age_group}\n")

In [1]:
import pandas as pd

# Read your submission
df = pd.read_csv('submission_fixed.csv')

# Filter test IDs (2017-2328)
df_test = df[(df['ID'] >= 2017) & (df['ID'] <= 2328)]

# Verify shape (should be 312 rows)
print(f"Corrected shape: {df_test.shape}")  # (312, 2)

# Save corrected file
df_test.to_csv('corrected_submission.csv', index=False)

Corrected shape: (312, 2)
