Import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [None]:
# Download the 'vader_lexicon' resource through NLTK's downloader.
nltk.download('vader_lexicon')

In [None]:
# Build the analyzer and keep a reference to its `lexicon` attribute;
# sentence_to_vader_scores looks each lower-cased word up in it via .get().
analyzer = SentimentIntensityAnalyzer()
vader_lexicon = analyzer.lexicon

In [None]:
# Load the training data. Later cells read its "Vader Sentiment", "cleanText"
# and "desc_id" columns.
file_path = "train.csv"
df = pd.read_csv(file_path)

In [None]:
# Load the test data. Later cells read its "cleanText" and "desc_id" columns.
file_path_test = "test.csv"
df_testset = pd.read_csv(file_path_test)

In [None]:
# Keep only rows whose VADER label is not "Neutral". The explicit .copy() makes
# the result an independent frame, so the column assignments in later cells
# write to it directly instead of to a slice of the original
# (which triggers pandas' SettingWithCopyWarning).
df = df[df["Vader Sentiment"] != "Neutral"].copy()


In [None]:
# Encode the remaining labels as integers: Positive -> 1, Negative -> 0.
df["Vader_Binary_Sentiment"] = df["Vader Sentiment"].map(
    {"Positive": 1, "Negative": 0}
)


In [None]:
# Number of whitespace-separated tokens in each cleaned training sentence.
df["sentence_length"] = df["cleanText"].str.split().str.len()
max_length = df["sentence_length"].max()  # longest training sentence; sentence_to_vader_scores pads/truncates to this length

print(f"Max length: {max_length}")

In [None]:
# Number of whitespace-separated tokens in each cleaned test sentence.
df_testset["sentence_length"] = df_testset["cleanText"].str.split().str.len()
max_length_test = df_testset["sentence_length"].max()  # longest test sentence

# NOTE(review): this prints the same "Max length" label as the training cell, and
# max_length_test is not referenced by any other cell visible in this notebook.
print(f"Max length: {max_length_test}")

In [None]:
def sentence_to_vader_scores(sentence, target_length=None, lexicon=None):
    """Convert a sentence into a fixed-length list of per-word lexicon scores.

    Each whitespace-separated token is lower-cased and looked up in the
    lexicon; tokens that are not in the lexicon score 0. The result is
    right-padded with 0 or truncated so it always has ``target_length`` entries.

    Args:
        sentence: Text to score. Non-string values (for example the NaN that
            pandas yields for an empty CSV field) are treated as an empty
            sentence instead of raising AttributeError.
        target_length: Length of the returned list. Defaults to the
            notebook-level ``max_length``.
        lexicon: Mapping of lower-case word -> score. Defaults to the
            notebook-level ``vader_lexicon``.

    Returns:
        list: exactly ``target_length`` numeric scores.
    """
    if target_length is None:
        target_length = max_length
    if lexicon is None:
        lexicon = vader_lexicon
    # A pandas .max() can hand back a NumPy integer or a float; slicing and
    # list repetition below need a plain int.
    target_length = int(target_length)

    # Missing text has no .split(); score it as a sentence with no words.
    words = sentence.split() if isinstance(sentence, str) else []

    # Tokens beyond target_length would be truncated anyway, so skip their lookups.
    scores = [lexicon.get(word.lower(), 0) for word in words[:target_length]]

    # Pad so that every vector has the same length.
    scores.extend([0] * (target_length - len(scores)))
    return scores

In [None]:
# Turn every cleaned sentence into a score vector. Train and test rows go through
# the same function, so both are padded/truncated to the same length.
df["sentiment_vector"] = df["cleanText"].apply(sentence_to_vader_scores)
df_testset["sentiment_vector"] = df_testset["cleanText"].apply(sentence_to_vader_scores)



In [None]:
# Persist the ids and score vectors (plus the binary label for the training rows).
# The vectors are Python lists, so they land in the CSV as text; the
# clean_sentiment_array cell further down parses that text back into numbers.
df[["desc_id", "sentiment_vector","Vader_Binary_Sentiment"]].to_csv("processed_train.csv",index=False)
df_testset[["desc_id", "sentiment_vector"]].to_csv("processed_test.csv", index=False)


Load the processed dataset


In [None]:

# Read back the two files written by the preprocessing step.
file_path = "processed_train.csv"
df_processed = pd.read_csv(file_path)
df_test = pd.read_csv("processed_test.csv")

Function to clean and convert sentiment_vector to a list of floats


In [None]:
def clean_sentiment_array(array_str, target_length=236):
    """Parse a stringified score vector into a fixed-length float array.

    Newlines and commas are treated as separators and surrounding brackets are
    stripped, so both ``"[0.5, 1.0]"`` and ``"[0.5 1.0]"`` style text parse.

    Args:
        array_str: Text form of a numeric vector, as read back from the CSV.
        target_length: Length of the returned array. Shorter inputs are
            zero-padded on the right, longer ones are truncated. The default
            of 236 is the length this notebook previously hard-coded
            (presumably the training ``max_length`` - confirm against the
            preprocessing output).

    Returns:
        np.ndarray: float array of shape ``(target_length,)``.

    Raises:
        ValueError: if a token cannot be converted to float.
    """
    cleaned = array_str.replace("\n", " ").replace(",", " ").strip("[], ")
    # str.split() with no separator never yields empty tokens.
    values = [float(token) for token in cleaned.split()]

    # Start from zeros and copy in as many values as fit: pad and truncate in one step.
    result = np.zeros(target_length, dtype=float)
    n_copy = min(len(values), target_length)
    result[:n_copy] = values[:n_copy]
    return result

Apply cleaning function

In [None]:
# The vectors come back from CSV as text; parse each one into a float array.
df_processed["sentiment_vector"] = df_processed["sentiment_vector"].apply(clean_sentiment_array)

df_test["sentiment_vector"] = df_test["sentiment_vector"].apply(clean_sentiment_array)

 Extract features (X) and labels (y)

In [None]:
# Stack the per-row vectors into one 2-D feature matrix; labels stay 1-D.
X = np.vstack(df_processed["sentiment_vector"].to_numpy())
y = df_processed["Vader_Binary_Sentiment"].to_numpy()

Split the data into training and validation sets (80%-20%)


In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


 Feature scaling (Standardization)


In [None]:
# Fit the scaler on the training split only, then apply that same transform to
# both splits so the validation rows do not influence the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

## Logistic Regression Model

Initialize parameters

In [None]:
# Initialize weights with He initialization
def initialize_weights_he(n_features, rng=None):
    """Draw initial weights from N(0, 2 / n_features) and a zero bias.

    Args:
        n_features: Number of input features (length of the weight vector).
        rng: Optional random source exposing ``standard_normal(size)``, such as
            ``np.random.default_rng(seed)``. Pass one to get reproducible
            weights. When omitted, NumPy's global RNG is used, exactly as
            before this parameter existed.

    Returns:
        tuple: ``(weights, bias)`` - a float array of shape ``(n_features,)``
        and the float ``0.0``.
    """
    scale = np.sqrt(2.0 / n_features)
    if rng is None:
        draws = np.random.randn(n_features)
    else:
        draws = rng.standard_normal(n_features)
    weights = draws * scale
    # A float keeps the bias type stable once training starts updating it.
    bias = 0.0
    return weights, bias

In [None]:
# Activation function
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)).

    The textbook form overflows in ``exp(-z)`` for large negative ``z``
    (RuntimeWarning, or FloatingPointError under ``np.errstate(over="raise")``).
    Here the exponent is always non-positive, so it cannot overflow.

    Args:
        z: Scalar or array-like of real values.

    Returns:
        A NumPy float for scalar input, otherwise an array shaped like ``z``,
        with values in [0, 1].
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) lies in [0, 1]; pick the algebraically equivalent branch per sign.
    decay = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Unwrap 0-d arrays so scalar input still yields a scalar.
    return result if result.ndim else result[()]

In [None]:
def compute_loss(y_true, y_pred, weights, lamb):
    """Mean binary cross-entropy plus an L2 penalty on the weights.

    Args:
        y_true: Array of target labels.
        y_pred: Array of predicted probabilities, same shape as ``y_true``.
        weights: Weight vector that the L2 penalty is computed from.
        lamb: L2 regularization strength.

    Returns:
        The scalar loss: cross-entropy averaged over the samples, plus
        ``lamb / (2 * n_samples)`` times the sum of squared weights.
    """
    n_samples = len(y_true)
    tiny = 1e-9  # keeps both log() arguments strictly positive
    log_likelihood = y_true * np.log(y_pred + tiny) + (1 - y_true) * np.log(1 - y_pred + tiny)
    data_term = -(1 / n_samples) * np.sum(log_likelihood)
    penalty = (lamb / (2 * n_samples)) * np.sum(weights**2)
    return data_term + penalty

In [None]:
def adam_optimizer(X, y, weights, bias, learning_rate, epochs, lamb, batch_size, beta1=0.9, beta2=0.999, epsilon=1e-8, early_stopping_patience=5, X_valid=None, y_valid=None):
    """Fit logistic-regression parameters with mini-batch Adam and early stopping.

    Args:
        X: Training features; rows are selected with ``X[indices]`` and
            multiplied with ``weights`` via ``np.dot``.
        y: Training labels aligned with the rows of ``X``.
        weights: Initial weight vector. It is copied, so the caller's array is
            not modified; use the returned weights.
        bias: Initial bias.
        learning_rate: Adam step size.
        epochs: Maximum number of passes over the training data.
        lamb: L2 regularization strength.
        batch_size: Rows per mini-batch (the last batch of an epoch may be smaller).
        beta1: Decay rate of the first-moment (mean) estimate.
        beta2: Decay rate of the second-moment (uncentered variance) estimate.
        epsilon: Added to the update's denominator for numerical stability.
        early_stopping_patience: Stop after this many consecutive epochs
            without a new best validation loss.
        X_valid: Validation features. Defaults to the notebook-level
            ``X_val_scaled``, which this function always read before the
            parameter existed.
        y_valid: Validation labels. Defaults to the notebook-level ``y_val``.

    Returns:
        tuple: ``(weights, bias, val_losses, val_accuracies)``; the two lists
        hold one entry per completed epoch.
    """
    # Fall back to the notebook's validation split so existing calls keep working.
    if X_valid is None:
        X_valid = X_val_scaled
    if y_valid is None:
        y_valid = y_val

    # Work on private copies instead of mutating the caller's array in place.
    weights = np.array(weights, dtype=float)
    bias = float(bias)

    m = X.shape[0]
    best_loss = float('inf')
    patience_counter = 0
    val_losses = []
    val_accuracies = []

    # Adam moment estimates
    m_w, v_w = np.zeros_like(weights), np.zeros_like(weights)
    m_b, v_b = 0.0, 0.0
    # Number of parameter updates performed so far. Adam's bias correction is
    # defined per update step, not per epoch: with several batches per epoch,
    # correcting by the epoch index over-scales the early updates.
    step = 0

    for epoch in range(epochs):
        indices = np.random.permutation(m)
        X_shuffled = X[indices]
        y_shuffled = y[indices]

        for start in range(0, m, batch_size):
            X_batch = X_shuffled[start:start + batch_size]
            y_batch = y_shuffled[start:start + batch_size]
            # The final batch can be short; average over the rows actually present.
            n_batch = X_batch.shape[0]

            # Forward pass
            error = sigmoid(np.dot(X_batch, weights) + bias) - y_batch

            # Gradients (with L2 regularization on the weights)
            dw = (np.dot(X_batch.T, error) + lamb * weights) / n_batch
            db = np.sum(error) / n_batch

            # Adam moment updates
            step += 1
            m_w = beta1 * m_w + (1 - beta1) * dw
            v_w = beta2 * v_w + (1 - beta2) * (dw ** 2)
            m_b = beta1 * m_b + (1 - beta1) * db
            v_b = beta2 * v_b + (1 - beta2) * (db ** 2)

            # Bias-corrected moments
            correction1 = 1 - beta1 ** step
            correction2 = 1 - beta2 ** step
            m_w_hat = m_w / correction1
            v_w_hat = v_w / correction2
            m_b_hat = m_b / correction1
            v_b_hat = v_b / correction2

            weights = weights - learning_rate * m_w_hat / (np.sqrt(v_w_hat) + epsilon)
            bias = bias - learning_rate * m_b_hat / (np.sqrt(v_b_hat) + epsilon)

        # Validation loss and accuracy
        y_val_pred = sigmoid(np.dot(X_valid, weights) + bias)
        val_loss = compute_loss(y_valid, y_val_pred, weights, lamb)
        val_losses.append(val_loss)

        y_val_pred_binary = (y_val_pred >= 0.5).astype(int)
        val_accuracy = accuracy_score(y_valid, y_val_pred_binary)
        val_accuracies.append(val_accuracy)

        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}")

        # Early stopping
        if val_loss < best_loss:
            best_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= early_stopping_patience:
                print("Early stopping")
                break

    # With epochs == 0 nothing was evaluated, so there is no accuracy to report.
    if val_accuracies:
        print(f"Final Validation Accuracy: {val_accuracies[-1]:.4f}")
    return weights, bias, val_losses, val_accuracies

## Train the Logistic Regression Model

In [None]:
# Hyperparameters
lamb = 0.0001  # L2 regularization strength
learning_rate = 0.6
batch_size = 1024
epochs = 500
early_stopping_patience = 10

# Weight initialization (np.random.randn) and the per-epoch shuffle
# (np.random.permutation) both draw from NumPy's global RNG. Seed it so that a
# Restart & Run All trains the same model every time.
np.random.seed(42)

weights, bias = initialize_weights_he(X_train_scaled.shape[1])
weights, bias, _, _ = adam_optimizer(
    X_train_scaled,
    y_train,
    weights,
    bias,
    learning_rate,
    epochs,
    lamb,
    batch_size,
    early_stopping_patience=early_stopping_patience,
)

Save model weights and bias in a CSV file


In [None]:
# One row per weight; the scalar bias is broadcast into every row of its column.
model_df = pd.DataFrame({"weights": weights, "bias": bias})
model_df.to_csv("model_weights_bias.csv", index=False)


In [None]:
# Score the test set with the trained weights, reusing the scaler fitted on the
# training split.
X_test = np.vstack(df_test["sentiment_vector"].values)
X_test_scaled = scaler.transform(X_test)

y_test_pred = sigmoid(np.dot(X_test_scaled, weights) + bias)
y_test_pred_binary = (y_test_pred >= 0.5).astype(int)

# processed_test.csv is written with a "desc_id" column and no "id" column, so
# looking up only "id" always fell back to 0..n-1 and dropped the real row
# identifiers. Prefer "id" if it exists, then "desc_id", then positional ids.
if "id" in df_test.columns:
    submission_ids = df_test["id"].to_numpy()
elif "desc_id" in df_test.columns:
    submission_ids = df_test["desc_id"].to_numpy()
else:
    submission_ids = np.arange(len(y_test_pred_binary))

# NOTE(review): the output header stays "id" as before - confirm that this is
# the column name the submission format expects.
submission_df = pd.DataFrame({"id": submission_ids, "Vader_Binary_Sentiment": y_test_pred_binary})
submission_df.to_csv("sub.csv", index=False)