In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score
import xgboost as xgb

# 🔹 Baseline: scaled features -> XGBoost regressor -> rounded integer scores.
# Reads the train/test CSVs, reports MSE / accuracy / harmonic score on a
# 20% hold-out split and writes the test predictions to submission.csv.

# 🔹 Load datasets
train_df = pd.read_csv("/content/train.csv")
test_df = pd.read_csv("/content/test.csv")

# 🔹 Extract features and target
# Column names follow the other cells of this notebook ("ID" / "score").
# "ID" is removed from the training features as well, so that the scaler is
# fitted on exactly the columns that are later passed to it for the test set.
X = train_df.drop(columns=["ID", "score"])
y = train_df["score"]

# 🔹 Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 🔹 Standardize the features (statistics come from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(test_df.drop(columns=["ID"]))

# 🔹 Train XGBoost model
xgb_model = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# 🔹 Predict on validation set
# The model was fitted on scaled features, so it must be scored on the
# scaled validation matrix (not on the raw X_val frame).
y_pred = xgb_model.predict(X_val_scaled)

# 🔹 Compute evaluation metrics
mse = mean_squared_error(y_val, y_pred)
y_pred_rounded = np.round(y_pred)  # Convert predictions to nearest integer
accuracy = accuracy_score(y_val, y_pred_rounded)

# 🔹 Compute Harmonic Score (combines 1/MSE and accuracy)
hs = 6 * ((1/mse) * accuracy) / ((1/mse) + accuracy)

print(f"MSE: {mse}")
print(f"Accuracy: {accuracy}")
print(f"Harmonic Score: {hs}")

# 🔹 Predict on test set
test_preds = xgb_model.predict(X_test_scaled)
test_preds_rounded = np.round(test_preds).astype(int)  # whole-number scores

# 🔹 Create submission.csv
submission = pd.DataFrame({"ID": test_df["ID"], "score": test_preds_rounded})
submission.to_csv("submission.csv", index=False)

print("✅ Submission file saved as submission.csv")


In [None]:
# Show the column names of train_df (loaded by an earlier cell) so the ID and
# target column names can be checked before they are hard-coded.
print(train_df.columns)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb

# 🔹 Hybrid KNN + XGBoost: the KNN prediction is appended to the scaled
# features as one extra column and XGBoost is trained on the real target.

# Local import: only this cell needs out-of-fold predictions.
from sklearn.model_selection import cross_val_predict

# 🔹 Load the dataset
train_df = pd.read_csv("/content/train.csv")
test_df = pd.read_csv("/content/test.csv")

# 🔹 Extract features and target from train data
X = train_df.drop(columns=["ID", "score"])  # Drop 'ID' column to match test set
y = train_df["score"]

# 🔹 Split into train-validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 🔹 Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(test_df.drop(columns=["ID"]))

# 🔹 KNN stage
# With weights="distance", predicting the rows the KNN was fitted on returns
# (essentially) their own labels, so in-sample KNN predictions carry no new
# information. Out-of-fold predictions are used for the training rows instead.
knn_model = KNeighborsRegressor(n_neighbors=10, weights="distance")  # Weighted KNN
y_pred_knn_train = cross_val_predict(knn_model, X_train_scaled, y_train, cv=5)

# Refit on the whole training split for the validation / test rows.
knn_model.fit(X_train_scaled, y_train)
y_pred_knn_val = knn_model.predict(X_val_scaled)
y_pred_knn_test = knn_model.predict(X_test_scaled)

# 🔹 Append the KNN prediction as an additional feature column
X_train_hybrid = np.column_stack((X_train_scaled, y_pred_knn_train))
X_val_hybrid = np.column_stack((X_val_scaled, y_pred_knn_val))
X_test_hybrid = np.column_stack((X_test_scaled, y_pred_knn_test))

# 🔹 Train XGBoost Model on the true target
xgb_model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

xgb_model.fit(X_train_hybrid, y_train)

# 🔹 Report hold-out error so the split above is actually used
y_pred_xgb_val = xgb_model.predict(X_val_hybrid)
val_mse = float(np.mean((np.asarray(y_val) - y_pred_xgb_val) ** 2))
print(f"Validation MSE: {val_mse:.4f}")

# 🔹 Predict using XGBoost
y_pred_xgb_test = xgb_model.predict(X_test_hybrid)

# 🔹 Create submission file (scores are left unrounded here)
submission = pd.DataFrame({"ID": test_df["ID"], "score": y_pred_xgb_test})
submission.to_csv("submission.csv", index=False)

print("✅ Hybrid KNN + XGBoost model trained and submission.csv created successfully!")


In [None]:
import pandas as pd

# 🔹 Post-process submission.csv in place: read it, turn every value in the
# 'score' column into the nearest whole number (stored as int), write it back.
submission = pd.read_csv("submission.csv")
submission = submission.assign(score=submission["score"].round().astype(int))
submission.to_csv("submission.csv", index=False)

print("✅ Rounded scores (integers) updated in submission.csv successfully!")


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, LSTM, Flatten, Dense, BatchNormalization, Dropout
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Stacked model: a Conv1D+LSTM network and an XGBoost regressor are trained on
# the 35 features with the highest mutual information; a RandomForest combines
# their predictions. Reports hold-out metrics and writes
# submission_optimized.csv.

# Load Data
train_data = pd.read_csv("/content/train.csv")

# Feature Engineering (Selecting Important Features)
from sklearn.feature_selection import mutual_info_regression
X_raw = train_data.drop(columns=["ID", "score"])
y = train_data["score"]

# Train-Val Split
# The split happens BEFORE feature ranking so that validation rows cannot
# influence which features are kept.
X_raw_train, X_raw_val, y_train, y_val = train_test_split(
    X_raw, y, test_size=0.15, random_state=42
)

# Rank features on the training rows only; random_state makes the ranking
# (and therefore the selected columns) reproducible between runs.
mi_scores = mutual_info_regression(X_raw_train, y_train, random_state=42)
important_features = X_raw.columns[np.argsort(mi_scores)[-35:]]  # Top 35 features

X = train_data[important_features]
X_train = X_raw_train[important_features]
X_val = X_raw_val[important_features]

# Normalize Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Reshape for CNN + LSTM: (rows, features, 1 channel)
X_train_cnn = X_train_scaled.reshape(-1, X_train.shape[1], 1)
X_val_cnn = X_val_scaled.reshape(-1, X_train.shape[1], 1)

# CNN + LSTM Model
cnn_lstm_model = Sequential([
    Conv1D(filters=192, kernel_size=3, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001), input_shape=(X_train.shape[1], 1)),
    BatchNormalization(),
    Dropout(0.3),
    LSTM(192, return_sequences=True),
    LSTM(96, return_sequences=True),
    LSTM(48),
    Dense(192, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    Dropout(0.2),
    Dense(96, activation='relu'),
    Dense(1)
])
cnn_lstm_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0003), loss="mse")

# Train CNN + LSTM with Early Stopping
cnn_lstm_model.fit(X_train_cnn, y_train, validation_data=(X_val_cnn, y_val),
                   epochs=80, batch_size=32, verbose=1,
                   callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)])

# Predict using CNN + LSTM
cnn_lstm_preds_train = cnn_lstm_model.predict(X_train_cnn).flatten()
cnn_lstm_preds_val = cnn_lstm_model.predict(X_val_cnn).flatten()

# XGBoost Model
# eval_metric is given to the constructor: recent XGBoost releases no longer
# accept it as a fit() argument.
xgb_model = xgb.XGBRegressor(
    n_estimators=1500,
    learning_rate=0.02,
    max_depth=10,
    min_child_weight=1,
    colsample_bytree=0.9,
    subsample=0.95,
    reg_lambda=1.5,
    reg_alpha=1,
    objective="reg:squarederror",
    eval_metric="rmse",
    random_state=42
)

# Train XGBoost Model (without early stopping); log every 100th round only
xgb_model.fit(
    X_train_scaled, y_train,
    eval_set=[(X_val_scaled, y_val)],
    verbose=100
)

# Predict using XGBoost
xgb_preds_train = xgb_model.predict(X_train_scaled)
xgb_preds_val = xgb_model.predict(X_val_scaled)

# Stacking Model (Using RandomForest)
# NOTE(review): the meta-model is fitted on in-sample base-model predictions;
# out-of-fold predictions would be a sounder training input. Left unchanged.
stacked_train = np.column_stack((cnn_lstm_preds_train, xgb_preds_train))
stacked_val = np.column_stack((cnn_lstm_preds_val, xgb_preds_val))

meta_model = RandomForestRegressor(n_estimators=500, max_depth=8, random_state=42)
meta_model.fit(stacked_train, y_train)

# Final Prediction
final_preds = meta_model.predict(stacked_val)

# Evaluation Metrics: MSE on raw predictions, accuracy on rounded predictions
mse = mean_squared_error(y_val, final_preds)
accuracy = np.mean(np.round(final_preds) == np.round(y_val))

# Harmonic Score (combines 1/MSE and accuracy)
hs = (6 * (1/mse) * accuracy) / ((1/mse) + accuracy)

print(f"🔹 Optimized MSE: {mse:.4f}, Accuracy: {accuracy:.4f}, HM Score: {hs:.4f}")

# 🔹🔹🔹 TEST SUBMISSION 🔹🔹🔹

# Load Test Data
test_data = pd.read_csv("/content/test.csv")

# Prepare Test Features (same columns, same scaler as training)
X_test = test_data[important_features]
X_test_scaled = scaler.transform(X_test)
X_test_cnn = X_test_scaled.reshape(-1, X_test.shape[1], 1)

# Predict using CNN + LSTM and XGBoost
cnn_lstm_preds_test = cnn_lstm_model.predict(X_test_cnn).flatten()
xgb_preds_test = xgb_model.predict(X_test_scaled)

# Stacked Predictions for Test Set
stacked_test = np.column_stack((cnn_lstm_preds_test, xgb_preds_test))

# Meta-Model Final Prediction
final_test_preds = meta_model.predict(stacked_test)

# Round Predictions and Save Submission
submission = pd.DataFrame({"ID": test_data["ID"], "score": np.round(final_test_preds).astype(int)})
submission.to_csv("submission_optimized.csv", index=False)

# The message names the file that is actually written above.
print("✅ Improved submission_optimized.csv created with optimized predictions 🎯")


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, LSTM, Bidirectional, Flatten, Dense, BatchNormalization, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Stacked model: Conv1D + bidirectional LSTM network and XGBoost as base
# models, LightGBM as meta-model, followed by a fixed weighted blend.
# Reports hold-out metrics and writes submission_improved.csv.

# Load Data
train_data = pd.read_csv("/content/train.csv")

# Features and Target
X = train_data.drop(columns=["ID", "score"])
y = train_data["score"]

# Handle NaNs in the target (non-mutating, so train_data stays untouched)
y = y.fillna(y.median())

# Train-Val Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle NaNs in the features
# The fill values come from the training split only and are reused for the
# validation and test rows, so no held-out statistics enter preprocessing.
feature_means = X_train.mean()
X_train = X_train.fillna(feature_means)
X_val = X_val.fillna(feature_means)

# Normalize Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Reshape for CNN + LSTM: (rows, features, 1 channel)
X_train_cnn = X_train_scaled.reshape(-1, X_train.shape[1], 1)
X_val_cnn = X_val_scaled.reshape(-1, X_train.shape[1], 1)

# CNN + Bidirectional LSTM Model
cnn_lstm_model = Sequential([
    Conv1D(filters=256, kernel_size=3, activation='relu', kernel_regularizer=l2(0.01), input_shape=(X_train.shape[1], 1)),
    BatchNormalization(),
    Dropout(0.4),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.4),
    Bidirectional(LSTM(64)),
    Dense(256, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dense(1)  # Regression output
])

# Learning Rate
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
cnn_lstm_model.compile(optimizer=optimizer, loss="mse")

# Callbacks
callbacks = [
    ReduceLROnPlateau(monitor="val_loss", factor=0.2, patience=5, min_lr=1e-6),
    EarlyStopping(monitor="val_loss", patience=12, restore_best_weights=True)
]

# Train CNN + LSTM
cnn_lstm_model.fit(X_train_cnn, y_train, validation_data=(X_val_cnn, y_val),
                   epochs=80, batch_size=64, verbose=1, callbacks=callbacks)

# Predictions
cnn_lstm_preds_train = cnn_lstm_model.predict(X_train_cnn).flatten()
cnn_lstm_preds_val = cnn_lstm_model.predict(X_val_cnn).flatten()

# XGBoost Model
xgb_model = xgb.XGBRegressor(
    n_estimators=2000,
    learning_rate=0.02,
    max_depth=12,
    min_child_weight=2,
    colsample_bytree=0.9,
    subsample=0.9,
    reg_alpha=0.1,
    reg_lambda=0.5,
    random_state=42
)

xgb_model.fit(X_train_scaled, y_train)

# XGBoost Predictions
xgb_preds_train = xgb_model.predict(X_train_scaled)
xgb_preds_val = xgb_model.predict(X_val_scaled)

# LightGBM Meta-Model
meta_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=6,
    num_leaves=30,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.2,
    reg_lambda=0.3
)

# Stacking: CNN+LSTM + XGBoost
# NOTE(review): the meta-model is fitted on in-sample base-model predictions;
# out-of-fold predictions would be a sounder training input. Left unchanged.
stacked_train = np.column_stack((cnn_lstm_preds_train, xgb_preds_train))
stacked_val = np.column_stack((cnn_lstm_preds_val, xgb_preds_val))

# Train Meta-Model
meta_model.fit(stacked_train, y_train)

# Final Predictions
final_preds = meta_model.predict(stacked_val)

# Weighted Ensemble (Stacked + Averaging)
final_preds = 0.6 * final_preds + 0.2 * cnn_lstm_preds_val + 0.2 * xgb_preds_val

# Evaluation Metrics: MSE on raw predictions, accuracy on rounded predictions
mse = mean_squared_error(y_val, final_preds)
exact_matches = np.sum(np.round(final_preds) == np.round(y_val))
accuracy = exact_matches / len(y_val)

# Harmonic Score Formula (combines 1/MSE and accuracy)
hs = (6 * (1/mse) * accuracy) / ((1/mse) + accuracy)

print(f"🔹 MSE: {mse:.4f}, Accuracy: {accuracy:.4f}, Harmonic Score: {hs:.4f}")

# 🔹🔹🔹 TEST SUBMISSION 🔹🔹🔹
test_data = pd.read_csv("/content/test.csv")

# Extract ID
test_ids = test_data["ID"]

# Prepare Test Features
# Columns are selected by the training column names (same set, same order)
# and missing values are filled with the TRAINING means, so test rows go
# through exactly the preprocessing the models were fitted with.
X_test = test_data[X.columns].fillna(feature_means)
X_test_scaled = scaler.transform(X_test)
X_test_cnn = X_test_scaled.reshape(-1, X_test.shape[1], 1)

# Predict using CNN + LSTM and XGBoost
cnn_lstm_preds_test = cnn_lstm_model.predict(X_test_cnn).flatten()
xgb_preds_test = xgb_model.predict(X_test_scaled)

# Stacked Predictions for Test Set
stacked_test = np.column_stack((cnn_lstm_preds_test, xgb_preds_test))

# Meta-Model Final Prediction
final_test_preds = meta_model.predict(stacked_test)

# Weighted Averaging for Final Test Predictions
final_test_preds = 0.6 * final_test_preds + 0.2 * cnn_lstm_preds_test + 0.2 * xgb_preds_test

# Create Submission File
# Scores are rounded to whole numbers, matching the rounded predictions that
# the accuracy metric above is computed on.
submission = pd.DataFrame({"ID": test_ids, "score": np.round(final_test_preds).astype(int)})
submission.to_csv("submission_improved.csv", index=False)

print("✅ Submission file saved as submission_improved.csv 🎯")


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, LSTM, Bidirectional, Flatten, Dense, BatchNormalization, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Stacked model: Conv1D + bidirectional LSTM network, XGBoost and LightGBM as
# base models on the 30 features most correlated with the target, LightGBM as
# meta-model, followed by a fixed weighted blend. Reports hold-out metrics and
# writes submission_HS40.csv.

# Load Data
train_data = pd.read_csv("/content/train.csv")

# Features and Target
X = train_data.drop(columns=["ID", "score"])
y = train_data["score"]

# Handle NaNs in the target (non-mutating, so train_data stays untouched)
y = y.fillna(y.median())

# Train-Val Split
# Done before imputation and feature selection so that validation rows do not
# influence either step.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)

# Handle NaNs in the features with medians of the training split
feature_medians = X_train.median()
X_train = X_train.fillna(feature_medians)
X_val = X_val.fillna(feature_medians)

# Feature Selection (Keep only important features), ranked on training rows
corr_matrix = X_train.corrwith(y_train).abs().sort_values(ascending=False)
selected_features = corr_matrix[:30].index  # Top 30 most correlated features
X = X[selected_features]
X_train = X_train[selected_features]
X_val = X_val[selected_features]

# Normalize Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Reshape for CNN + LSTM: (rows, features, 1 channel)
X_train_cnn = X_train_scaled.reshape(-1, X_train.shape[1], 1)
X_val_cnn = X_val_scaled.reshape(-1, X_val.shape[1], 1)

# CNN + LSTM Model
cnn_lstm_model = Sequential([
    Conv1D(filters=256, kernel_size=3, activation='relu', kernel_regularizer=l2(0.005), input_shape=(X_train.shape[1], 1)),
    BatchNormalization(),
    Dropout(0.3),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(64)),
    Dense(128, activation='relu', kernel_regularizer=l2(0.005)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(1)  # Regression output
])

# Learning Rate
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
cnn_lstm_model.compile(optimizer=optimizer, loss="mse")

# Callbacks
callbacks = [
    ReduceLROnPlateau(monitor="val_loss", factor=0.2, patience=5, min_lr=1e-6),
    EarlyStopping(monitor="val_loss", patience=12, restore_best_weights=True)
]

# Train CNN + LSTM
cnn_lstm_model.fit(X_train_cnn, y_train, validation_data=(X_val_cnn, y_val),
                   epochs=120, batch_size=128, verbose=1, callbacks=callbacks)

# Predictions
cnn_lstm_preds_train = cnn_lstm_model.predict(X_train_cnn).flatten()
cnn_lstm_preds_val = cnn_lstm_model.predict(X_val_cnn).flatten()

# XGBoost Model
xgb_model = xgb.XGBRegressor(
    n_estimators=2000,
    learning_rate=0.012,
    max_depth=12,
    min_child_weight=3,
    colsample_bytree=0.90,
    subsample=0.90,
    reg_alpha=0.2,
    reg_lambda=0.5,
    random_state=42
)

xgb_model.fit(X_train_scaled, y_train)

# XGBoost Predictions
xgb_preds_train = xgb_model.predict(X_train_scaled)
xgb_preds_val = xgb_model.predict(X_val_scaled)

# Extra LightGBM base model
lgb_model = lgb.LGBMRegressor(
    n_estimators=1500,
    learning_rate=0.009,
    max_depth=10,
    num_leaves=40,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_alpha=0.2,
    reg_lambda=0.3
)

lgb_model.fit(X_train_scaled, y_train)

# LightGBM Predictions
lgb_preds_train = lgb_model.predict(X_train_scaled)
lgb_preds_val = lgb_model.predict(X_val_scaled)

# Stacking Layer
# NOTE(review): the meta-model is fitted on in-sample base-model predictions;
# out-of-fold predictions would be a sounder training input. Left unchanged.
stacked_train = np.column_stack((cnn_lstm_preds_train, xgb_preds_train, lgb_preds_train))
stacked_val = np.column_stack((cnn_lstm_preds_val, xgb_preds_val, lgb_preds_val))

# Meta-Model (Final Blend)
meta_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.006,
    max_depth=7,
    num_leaves=25
)

meta_model.fit(stacked_train, y_train)

# Final Predictions
final_preds = meta_model.predict(stacked_val)

# Weighted Ensemble of the meta-model and the three base models
final_preds = 0.4 * final_preds + 0.3 * cnn_lstm_preds_val + 0.2 * xgb_preds_val + 0.1 * lgb_preds_val

# Evaluation Metrics: MSE on raw predictions, accuracy on rounded predictions
mse = mean_squared_error(y_val, final_preds)
exact_matches = np.sum(np.round(final_preds) == np.round(y_val))
accuracy = exact_matches / len(y_val)

# Harmonic Score (same formula as the earlier cells: combines 1/MSE and accuracy)
hs = (6 * (1/mse) * accuracy) / ((1/mse) + accuracy)

print(f"🔹 MSE: {mse:.4f}, Accuracy: {accuracy:.4f}, Harmonic Score: {hs:.4f}")

# 🔹🔹🔹 TEST SUBMISSION 🔹🔹🔹
test_data = pd.read_csv("/content/test.csv")

# Extract ID
test_ids = test_data["ID"]

# Prepare Test Features
# Missing values are filled with the TRAINING medians (not the test set's
# own), and fillna returns a new frame instead of mutating a column subset.
X_test = test_data[selected_features].fillna(feature_medians[selected_features])
X_test_scaled = scaler.transform(X_test)
X_test_cnn = X_test_scaled.reshape(-1, X_test.shape[1], 1)

# Predict using CNN + LSTM, XGBoost, LightGBM
cnn_lstm_preds_test = cnn_lstm_model.predict(X_test_cnn).flatten()
xgb_preds_test = xgb_model.predict(X_test_scaled)
lgb_preds_test = lgb_model.predict(X_test_scaled)

# Stacked Predictions for Test Set
stacked_test = np.column_stack((cnn_lstm_preds_test, xgb_preds_test, lgb_preds_test))

# Meta-Model Final Prediction
final_test_preds = meta_model.predict(stacked_test)

# Weighted Averaging for Final Test Predictions
final_test_preds = 0.4 * final_test_preds + 0.3 * cnn_lstm_preds_test + 0.2 * xgb_preds_test + 0.1 * lgb_preds_test

# Create Submission File
# Scores are rounded to whole numbers, matching the rounded predictions that
# the accuracy metric above is computed on.
submission = pd.DataFrame({"ID": test_ids, "score": np.round(final_test_preds).astype(int)})
submission.to_csv("submission_HS40.csv", index=False)

print("✅ Submission file saved as submission_HS40.csv 🎯")


In [None]:
import pandas as pd

# 🔹 Re-read submission.csv, snap the 'score' column to the nearest integer
# and overwrite the file with the result.
# NOTE(review): the two preceding modelling cells write submission_improved.csv
# and submission_HS40.csv; this cell only touches submission.csv — confirm
# that is the intended file.
submission = pd.read_csv("submission.csv")

rounded_scores = submission["score"].round().astype(int)
submission["score"] = rounded_scores

submission.to_csv("submission.csv", index=False)

print("✅ Rounded scores (integers) updated in submission.csv successfully!")
