<a href="https://colab.research.google.com/github/Panda1304/Emotion_Embedding_Ratings/blob/main/Ratings_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load the labelled training rows and the unlabelled test rows from /content.
train_data, test_data = pd.read_csv('/content/train.csv'), pd.read_csv('/content/test.csv')

# Everything except the identifier (and, for train, the target) is a feature column.
y_train = train_data['score'].to_numpy()
X_train = train_data.drop(columns=['ID', 'score']).to_numpy()
X_test = test_data.drop(columns=['ID']).to_numpy()

# Standardise using statistics learned from the training features only,
# then apply the same transform to the test features.
scaler = StandardScaler().fit(X_train)
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)

# CNN regressor: the 64 input values are reshaped into a length-64,
# single-channel sequence so that 1-D convolutions can slide over them.
cnn_input = layers.Input(shape=(64,))  # 64-dimensional embeddings
x = layers.Reshape(target_shape=(64, 1))(cnn_input)

# Three Conv1D -> BatchNormalization -> MaxPooling1D stages with growing
# filter counts; each pooling layer uses a window of 2.
for n_filters, kernel_width in ((256, 5), (512, 5), (1024, 3)):
    x = layers.Conv1D(n_filters, kernel_width, activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling1D(pool_size=2)(x)

# Regression head: flatten, regularise with dropout, one hidden dense layer,
# and a single linear unit that emits the predicted score.
x = layers.Flatten()(x)
x = layers.Dropout(rate=0.5)(x)
x = layers.Dense(units=128, activation='relu')(x)
x = layers.Dense(units=1, activation='linear')(x)

# Wire the graph into a model trained with Adam on mean squared error.
cnn_model = models.Model(inputs=cnn_input, outputs=x)
cnn_model.compile(optimizer='adam', loss='mse')

# Training callbacks shared by the cnn_model.fit calls in this cell.
#
# EarlyStopping's patience is deliberately larger than ReduceLROnPlateau's.
# With equal patience values the learning rate is halved on the very epoch at
# which training is halted, so the reduced rate is never actually trained
# with. Here the rate is halved after 3 stagnant epochs, and training gets 3
# further epochs at the lower rate before it stops and the best weights (by
# val_loss) are restored.
early_stopping = EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

def train_evaluate_model(X, y, n_splits=8, n_neighbors=25190):
    """Cross-validate the CNN -> PCA -> KNN pipeline and return its mean MSE.

    For every fold the features are standardised with a scaler fitted on that
    fold's training rows only, a freshly initialised copy of the module-level
    ``cnn_model`` architecture is trained, the network's outputs are projected
    with PCA, and a distance-weighted KNN regressor is scored on the held-out
    rows.

    A new model is cloned for each fold on purpose: fitting the shared
    ``cnn_model`` in every iteration would carry its weights and optimizer
    state from one fold into the next, so later folds would be validated on
    rows the network had already been trained on. The module-level
    ``cnn_model`` and ``scaler`` are left untouched by this function.

    Args:
        X: 2-D NumPy array of raw (unscaled) features, one row per sample.
        y: 1-D NumPy array of targets aligned with the rows of ``X``.
        n_splits: Number of shuffled folds (the split seed is fixed at 2522).
        n_neighbors: Neighbours requested from the KNN regressor. The value
            is capped at the fold's training size, because scikit-learn
            raises an error when asked for more neighbours than fitted rows.

    Returns:
        The mean of the per-fold validation MSE values.

    Note:
        The held-out fold also drives the early-stopping / learning-rate
        callbacks, so the reported score is somewhat optimistic.
    """
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=2522)
    fold_mse_scores = []

    for train_idx, val_idx in kfold.split(X):
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]

        # Fold-local scaler: validation rows never influence the statistics,
        # and the module-level scaler is not refitted as a side effect.
        fold_scaler = StandardScaler()
        X_train_fold_norm = fold_scaler.fit_transform(X_train_fold)
        X_val_fold_norm = fold_scaler.transform(X_val_fold)

        # clone_model rebuilds the architecture with newly initialised
        # weights; compiling again gives the fold its own optimizer state.
        fold_model = models.clone_model(cnn_model)
        fold_model.compile(optimizer='adam', loss='mse')

        # The shared callbacks reset their internal counters when a new fit
        # starts, so reusing the same instances across folds is fine.
        fold_model.fit(X_train_fold_norm, y_train_fold, epochs=100, batch_size=72, verbose=1,
                       validation_data=(X_val_fold_norm, y_val_fold), callbacks=[early_stopping, reduce_lr])

        train_cnn_features = fold_model.predict(X_train_fold_norm)
        val_cnn_features = fold_model.predict(X_val_fold_norm)

        # The component count can never exceed the width of the CNN output.
        # With the single-unit output layer defined in this notebook that
        # width is 1, so PCA keeps exactly one component.
        n_components = min(60, train_cnn_features.shape[1])
        pca = PCA(n_components=n_components)
        train_cnn_features_pca = pca.fit_transform(train_cnn_features)
        val_cnn_features_pca = pca.transform(val_cnn_features)

        # Cap k at the number of training rows in this fold.
        fold_neighbors = min(n_neighbors, len(train_idx))
        knn = KNeighborsRegressor(n_neighbors=fold_neighbors, p=2, weights='distance')
        knn.fit(train_cnn_features_pca, y_train_fold)

        y_pred_val = knn.predict(val_cnn_features_pca)
        mse = mean_squared_error(y_val_fold, y_pred_val)
        fold_mse_scores.append(mse)

    avg_mse = np.mean(fold_mse_scores)
    return avg_mse

# Run 8-fold cross-validation on the raw training arrays (scaling happens
# inside the helper) and report the averaged validation error.
avg_mse = train_evaluate_model(X_train, y_train, 8)
print(f"Average MSE from k-fold cross-validation: {avg_mse:.4f}")

# ---- Final fit on the full training set, then predict the test set ----

# Refit the shared scaler on every training row and reuse it for the test rows.
X_train_normalized_final = scaler.fit_transform(X_train)
X_test_normalized_final = scaler.transform(X_test)

# validation_split=0.4 makes Keras hold out the last 40% of the rows (taken
# before shuffling) as the validation set that the callbacks monitor.
cnn_model.fit(X_train_normalized_final, y_train, epochs=10, batch_size=292, verbose=1, validation_split=0.4,
              callbacks=[early_stopping, reduce_lr])

# Outputs of the trained CNN for both sets; these feed the PCA + KNN stage.
train_cnn_features_final = cnn_model.predict(X_train_normalized_final)
test_cnn_features_final = cnn_model.predict(X_test_normalized_final)

# PCA cannot keep more components than the CNN output has columns.
n_components = min(60, train_cnn_features_final.shape[1])
pca = PCA(n_components=n_components)
train_cnn_features_pca_final = pca.fit_transform(train_cnn_features_final)
test_cnn_features_pca_final = pca.transform(test_cnn_features_final)

# scikit-learn raises a ValueError at predict time when n_neighbors exceeds
# the number of fitted samples, so the requested k is capped at the size of
# the training set instead of assuming the data has at least 30009 rows.
knn_final = KNeighborsRegressor(n_neighbors=min(30009, len(y_train)), p=2, weights='distance')
knn_final.fit(train_cnn_features_pca_final, y_train)

final_predictions = knn_final.predict(test_cnn_features_pca_final)

# Pair each test ID with its predicted score, keeping the test frame's row order.
submission = test_data[['ID']].assign(score=final_predictions)

# Persist without the pandas index so the file holds only the ID and score columns.
submission.to_csv('submission.csv', index=False)

print("Submission file created: 'submission.csv'")


Epoch 1/100
383/383 ━━━━━━━━━━━━━━━━━━━━ 274s 708ms/step - loss: 13.7913 - val_loss: 12.9426 - learning_rate: 0.0010
Epoch 2/100
383/383 ━━━━━━━━━━━━━━━━━━━━ 316s 694ms/step - loss: 0.8173 - val_loss: 0.7790 - learning_rate: 0.0010
Epoch 3/100
383/383 ━━━━━━━━━━━━━━━━━━━━ 326s 704ms/step - loss: 0.7377 - val_loss: 0.9440 - learning_rate: 0.0010
Epoch 4/100
383/383 ━━━━━━━━━━━━━━━━━━━━ 324s 709ms/step - loss: 0.6744 - val_loss: 0.7800 - learning_rate: 0.0010
Epoch 5/100
383/383 ━━━━━━━━━━━━━━━━━━━━ 321s 708ms/step - loss: 0.6400 - val_loss: 0.7585 - learning_rate: 0.0010
Epoch 6/100
383/383 ━━━━━━━━━━━━━━━━━━━━ 321s 705ms/step - loss: 0.5992 - val_loss: 0.7653 - learning_rate: 0.0010
Epoch 7/100
383/383 ━━━━━━━━━━━━━━━━━━━━ 322s 707ms/step - loss: 0.581

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
import xgboost as xgb
import joblib

In [None]:
# Switch TensorFlow to eager execution, but only when it reports that eager
# mode is not already active.
if not tf.executing_eagerly():
    tf.compat.v1.enable_eager_execution()

In [None]:
# Read the labelled training set from /content.
train_data = pd.read_csv('/content/train.csv')

In [None]:
# Normalise the headers: trim surrounding whitespace and lower-case them, so
# the columns are addressed as 'id' and 'score' in the cells that follow.
train_data.columns = train_data.columns.str.strip().str.lower()

In [None]:
# Target vector first, then the feature matrix (every column except the
# identifier and the target).
y = train_data['score'].to_numpy()
X = train_data.drop(columns=['id', 'score']).to_numpy()  # Features

In [None]:
# Hold out 20% of the rows for validation; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Learn per-column means from the training split only, then use them to fill
# missing values in both the training and the validation split.
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_val_imputed = imputer.transform(X_val)

In [None]:
# Standardise with statistics estimated on the imputed training split, and
# apply that same transform to the imputed validation split.
scaler = StandardScaler().fit(X_train_imputed)
X_train_normalized = scaler.transform(X_train_imputed)
X_val_normalized = scaler.transform(X_val_imputed)

In [None]:
# Compact CNN regressor: the 64 input values become a length-64,
# single-channel sequence for the 1-D convolutions.
cnn_input = layers.Input(shape=(64,))  # 64-dimensional embeddings
x = layers.Reshape(target_shape=(64, 1))(cnn_input)

# Two Conv1D -> MaxPooling1D stages (kernel width 3, pooling window 2).
for n_filters in (32, 64):
    x = layers.Conv1D(n_filters, 3, activation='relu')(x)
    x = layers.MaxPooling1D(pool_size=2)(x)

# Head: flatten, one hidden dense layer, dropout, and a single linear output.
x = layers.Flatten()(x)
x = layers.Dense(units=32, activation='relu')(x)
x = layers.Dropout(rate=0.4)(x)
x = layers.Dense(units=1, activation='linear')(x)

In [None]:
# Assemble the layer graph into a model and train it with the Adam optimizer
# on mean squared error.
cnn_model = models.Model(inputs=cnn_input, outputs=x)
cnn_model.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Stop training once val_loss has gone 20 epochs without improving, and roll
# the model back to the weights from its best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)


In [None]:
# Train for at most 50 epochs, validating on the held-out split after each
# one. The returned History object is this cell's displayed value.
cnn_model.fit(
    x=X_train_normalized,
    y=y_train,
    validation_data=(X_val_normalized, y_val),
    epochs=50,
    batch_size=32,
    verbose=1,
    callbacks=[early_stopping],
)

Epoch 1/50
788/788 ━━━━━━━━━━━━━━━━━━━━ 8s 8ms/step - loss: 1.9005 - val_loss: 0.7977
Epoch 2/50
788/788 ━━━━━━━━━━━━━━━━━━━━ 9s 6ms/step - loss: 1.0841 - val_loss: 0.8596
Epoch 3/50
788/788 ━━━━━━━━━━━━━━━━━━━━ 6s 8ms/step - loss: 1.0552 - val_loss: 0.7429
Epoch 4/50
788/788 ━━━━━━━━━━━━━━━━━━━━ 10s 8ms/step - loss: 0.9444 - val_loss: 0.7934
Epoch 5/50
788/788 ━━━━━━━━━━━━━━━━━━━━ 9s 6ms/step - loss: 0.9155 - val_loss: 0.7530
Epoch 6/50
788/788 ━━━━━━━━━━━━━━━━━━━━ 6s 8ms/step - loss: 0.8415 - val_loss: 0.7470
Epoch 7/50
788/788 ━━━━━━━━━━━━━━━━━━━━ 10s 8ms/step - loss: 0.8245 - val_loss: 0.7299
Epoch 8/50
788/788 ━━━━━━━━━━━━━━━━━━━━ 5s 6ms/step - loss: 0.7925 - val_loss: 0.7222
Epoch 9/50
788/788 ━━━━━━

<keras.src.callbacks.history.History at 0x7acaf31ad950>

In [None]:
# Run the trained CNN over both splits. The network ends in a single linear
# unit, so each "features" array holds the CNN's one output value per row;
# these values are what the XGBoost stage is trained and evaluated on.
train_cnn_features = cnn_model.predict(X_train_normalized)
val_cnn_features = cnn_model.predict(X_val_normalized)

788/788 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step
197/197 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step


In [None]:
# Gradient-boosted regressor stacked on top of the CNN outputs.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    # boosting schedule
    n_estimators=400,
    learning_rate=0.09,
    # per-tree settings
    max_depth=9,
    min_child_weight=1,
    colsample_bytree=0.8,
)
# Fit on the training-split CNN outputs; the fitted estimator is the cell's value.
xgb_model.fit(train_cnn_features, y_train)


In [None]:
# Score the stacked CNN + XGBoost pipeline on the held-out validation split.
val_predictions = xgb_model.predict(val_cnn_features)
mse = mean_squared_error(y_val, val_predictions)
print(f"Mean Squared Error on Validation Set: {mse}")

Mean Squared Error on Validation Set: 0.7056103436331987


In [None]:
# Persist every fitted artefact needed at inference time (CNN, XGBoost model,
# scaler, imputer). The files are written relative to the current working
# directory, while the later load cells read them back from /content, so this
# assumes the notebook runs with /content as its working directory.
cnn_model.save('cnn_model.h5')
joblib.dump(xgb_model, 'xgb_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(imputer, 'imputer.pkl')



['imputer.pkl']

In [None]:
# Read the unlabelled test set from /content.
test_data = pd.read_csv('/content/test.csv')

In [None]:
# Feature matrix for inference: every test column except the identifier.
# The test frame keeps its raw header ('ID'); only the training frame had
# its column names stripped and lower-cased.
X_test = test_data.drop(columns=['ID']).to_numpy()

In [None]:
# Reload the imputer saved earlier and fill missing test values with the
# statistics it stored when it was fitted.
imputer = joblib.load('/content/imputer.pkl')
X_test_imputed = imputer.transform(X_test)

In [None]:
# Reload the saved scaler and apply it to the imputed test features.
scaler = joblib.load('/content/scaler.pkl')
X_test_normalized = scaler.transform(X_test_imputed)

In [None]:
# Restore the CNN from the HDF5 file saved earlier.
cnn_model = tf.keras.models.load_model('/content/cnn_model.h5')




In [None]:
# CNN outputs for the test rows; these are the inputs to the XGBoost stage.
test_cnn_features = cnn_model.predict(X_test_normalized)


[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [None]:
# Reload the fitted XGBoost regressor saved earlier.
xgb_model = joblib.load('/content/xgb_model.pkl')


In [None]:
# Final score predictions for the test rows from the stacked pipeline.
test_predictions = xgb_model.predict(test_cnn_features)


In [None]:
# Pair each test ID with its predicted score, keeping the test frame's row order.
submission = test_data[['ID']].assign(score=test_predictions)

In [None]:
# Write the submission to submission_37.csv, omitting the pandas index column.
submission.to_csv('submission_37.csv', index=False)


In [None]:
# Report the file that was actually written by the preceding cell
# (submission_37.csv, not submission.csv).
print("Test predictions saved to 'submission_37.csv'")


Test predictions saved to 'submission.csv'
