In [8]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
import re

# Load data
data = pd.read_csv("D:\Mini Project\minipj\data\Suicide_Detection.csv")

# Preprocess text (remove special characters, convert to lowercase)
def clean_text(text):
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

data["text"] = data["text"].apply(clean_text)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data["text"], data["class"], test_size=0.2, random_state=42)

word2vec_model = KeyedVectors.load_word2vec_format("D:\Mini Project\minipj\models\GoogleNews-vectors-negative300.bin", binary=True)

# Function to convert text to Word2Vec embeddings (Gensim 4.0.0 compatible)
def text_to_word2vec(text, model):
    embeddings = []
    for word in text.split():
        if word in model.key_to_index:  # Check if word is in vocabulary
            embeddings.append(model.word_vec(word))  # Access word vector
    if embeddings:
        return sum(embeddings) / len(embeddings)  # Average embedding
    else:
        return None  # Handle out-of-vocabulary words

# Create Word2Vec embeddings for training and testing data
X_train_word2vec = [text_to_word2vec(text, word2vec_model) for text in X_train]
X_test_word2vec = [text_to_word2vec(text, word2vec_model) for text in X_test]

import numpy as np


# Reshape embeddings to 2D arrays
X_train_word2vec = np.array(X_train_word2vec).reshape(-1, 300)  # Assuming 300-dimensional embeddings
X_test_word2vec = np.array(X_test_word2vec).reshape(-1, 300)


# Instantiate CatBoost classifier with modified hyperparameters
model = CatBoostClassifier(iterations=2000,  # Increase iterations for potential better convergence
                            loss_function='Logloss',
                            learning_rate=0.03,  # Experiment with lower learning rates
                            depth=6,  # Adjust tree depth to balance complexity and overfitting
                            l2_leaf_reg=5,  # Increase regularization to prevent overfitting
                            logging_level='Verbose',
                            eval_metric='Accuracy'
                            )

# Train the model with Word2Vec embeddings
model.fit(X_train_word2vec, y_train, eval_set=(X_test_word2vec, y_test))

# Evaluate accuracy
accuracy_word2vec = model.score(X_test_word2vec, y_test)
print("Test accuracy (Word2Vec):", accuracy_word2vec)


  embeddings.append(model.word_vec(word))  # Access word vector
  X_train_word2vec = np.array(X_train_word2vec).reshape(-1, 300)  # Assuming 300-dimensional embeddings


ValueError: cannot reshape array of size 185659 into shape (300)

In [9]:
print(X_train_word2vec.shape)

AttributeError: 'list' object has no attribute 'shape'

In [18]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
import re

# Define paths and model parameters
data_path = "D:/Mini Project/minipj/data/Suicide_Detection.csv"
word2vec_path = "D:/Mini Project/minipj/models/GoogleNews-vectors-negative300.bin"
embedding_dim = 300  # Adjust if using different model

# Define text preprocessing functions
def clean_text(text):
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = text.lower()
    return text



def text_to_embeddings(text, model, embedding_dim):
    embeddings = []
    for word in text.split():
        if word in model.key_to_index:
            embeddings.append(model.word_vec(word))
    if embeddings:
        return np.mean(embeddings, axis=0)  # Average word vectors
    else:
        return np.zeros(embedding_dim)  # Handle out-of-vocabulary words

# Load data and preprocess text
data = pd.read_csv(data_path)
data["text"] = data["text"].apply(clean_text)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data["text"], data["class"], test_size=0.2, random_state=42)

# Load Word2Vec model
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Create Word2Vec embeddings for training and testing data
X_train_embeddings = np.array([text_to_embeddings(text, word2vec_model, embedding_dim) for text in X_train])
X_test_embeddings = np.array([text_to_embeddings(text, word2vec_model, embedding_dim) for text in X_test])

# Define CatBoost training and evaluation pools
train_pool = Pool(data=X_train_embeddings, label=y_train)
test_pool = Pool(data=X_test_embeddings, label=y_test)

# Define CatBoost model parameters
model_params = {
    "iterations": 10000,
    "loss_function": "Logloss",
    "learning_rate": 0.03,
    "depth": 6,
    "l2_leaf_reg": 5,
    "eval_metric": "Accuracy",
}

# Train CatBoost model
model = CatBoostClassifier(**model_params)
model.fit(train_pool)

evaluation_results = model.get_best_score()
best_accuracy = evaluation_results["learn"]["Accuracy"]
print(f"Best Test Accuracy: {best_accuracy:.4f}")

# Evaluate performance metrics
print(model.score(test_pool))

  embeddings.append(model.word_vec(word))


0:	learn: 0.8066455	total: 121ms	remaining: 20m 13s
1:	learn: 0.8136476	total: 209ms	remaining: 17m 23s
2:	learn: 0.8208167	total: 287ms	remaining: 15m 57s
3:	learn: 0.8217215	total: 368ms	remaining: 15m 19s
4:	learn: 0.8275548	total: 450ms	remaining: 14m 59s
5:	learn: 0.8291114	total: 531ms	remaining: 14m 44s
6:	learn: 0.8334312	total: 623ms	remaining: 14m 49s
7:	learn: 0.8376109	total: 700ms	remaining: 14m 33s
8:	learn: 0.8411550	total: 774ms	remaining: 14m 18s
9:	learn: 0.8409827	total: 852ms	remaining: 14m 10s
10:	learn: 0.8443060	total: 931ms	remaining: 14m 5s
11:	learn: 0.8463258	total: 1.01s	remaining: 14m
12:	learn: 0.8481086	total: 1.1s	remaining: 14m 2s
13:	learn: 0.8503547	total: 1.17s	remaining: 13m 56s
14:	learn: 0.8516258	total: 1.25s	remaining: 13m 48s
15:	learn: 0.8525846	total: 1.32s	remaining: 13m 45s
16:	learn: 0.8541358	total: 1.4s	remaining: 13m 41s
17:	learn: 0.8552131	total: 1.48s	remaining: 13m 41s
18:	learn: 0.8558917	total: 1.56s	remaining: 13m 42s
19:	learn: 

In [24]:
print(model.get_final_evaluation()["test"]["Accuracy"])

AttributeError: 'CatBoostClassifier' object has no attribute 'get_final_evaluation'

In [15]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_metrics(model, test_pool, labels):
    preds = model.predict(test_pool)
    # precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    return recall, f1

precision, recall, f1 = evaluate_metrics(model, test_pool, y_test)
# print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


ValueError: pos_label=1 is not a valid label. It should be one of ['non-suicide', 'suicide']

In [23]:
evaluation_results = model.get_best_score()
best_accuracy = evaluation_results["learn"]["Accuracy"]
print(f"Best Test Accuracy: {best_accuracy:.4f}")


Best Test Accuracy: 0.9871


In [25]:

model.save_model("D:\Mini Project\minipj\models\catboost3.h5")  # Saves as a CatBoost model file (.cbm)


In [14]:
import pandas as pd
import catboost
from gensim.models import KeyedVectors
import numpy as np

def text_to_embeddings(text, model, embedding_dim):
    embeddings = []
    for word in text.split():
        if word in model.key_to_index:
            embeddings.append(model.word_vec(word))
    if embeddings:
        return np.mean(embeddings, axis=0)  # Average word vectors
    else:
        return np.zeros(embedding_dim)  # Handle out-of-vocabulary words
    
# Load the saved model
loaded_model = catboost.CatBoost()
loaded_model.load_model("D:\Mini Project\minipj\models\catboost3.h5")

# Load the Word2Vec model (same as used for training)
word2vec_model = KeyedVectors.load_word2vec_format("D:\Mini Project\minipj\models\GoogleNews-vectors-negative300.bin", binary=True)
embedding_dim = 300  # Adjust if using different model

# Load the Suicide and Depression Detection dataset
new_data = pd.read_csv("D:\Mini Project\minipj\data\Suicide_Detection.csv")  # Assuming it's in the same directory

# Preprocess the text data (same as done during training)
def clean_text(text):
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = text.lower()
    return text

new_data["text"] = new_data["text"].apply(clean_text)

# Create Word2Vec embeddings for the new text data
new_embeddings = np.array([text_to_embeddings(text, word2vec_model, embedding_dim) for text in new_data["text"]])

# Make predictions using the loaded model
new_data = pd.DataFrame(new_embeddings, columns=loaded_model.feature_names_)
predictions = loaded_model.predict(new_data)

# If you have true labels for the new data:
if "class" in new_data.columns:
    true_labels = new_data["class"]  # Assuming the label column is "class"
    accuracy = (predictions == true_labels).mean()
    print("Accuracy on new data:", accuracy)
else:
    print("Predictions:", predictions)  # Print predictions if labels are not available



  embeddings.append(model.word_vec(word))


Predictions: [ 4.50284185 -5.41404672 -3.47752337 ... -6.36098679  3.87433641
 -1.30050485]


In [11]:
print(loaded_model.feature_names_)  # Check features expected by the model
print(new_data["text"].head())  # Inspect sample texts from your new data


['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157', '15

In [15]:
print(new_data.columns)

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '290', '291', '292', '293', '294', '295', '296', '297', '298', '299'],
      dtype='object', length=300)


In [19]:
true_labels = new_data[0:300]  # Assuming the label column is "class"
accuracy = (predictions == true_labels).mean()
print("Accuracy on new data:", accuracy)


  accuracy = (predictions == true_labels).mean()


ValueError: Unable to coerce to Series, length must be 300: given 232074