In [None]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Load data
data = pd.read_csv("D:\Mini Project\minipj\data\Suicide_Detection.csv")

# Preprocess text (remove special characters, convert to lowercase)
def clean_text(text):
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

data["text"] = data["text"].apply(clean_text)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data["text"], data["class"], test_size=0.2, random_state=42)

# Create a TF-IDF vectorizer (optional for numerical representation)
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


# Instantiate CatBoost classifier with a tokenizer
model = CatBoostClassifier(iterations=1000,
                            loss_function='Logloss',
                            logging_level='Silent')  # Can be 'Space' or a custom tokenizer function

# Option 2: Using TF-IDF vectors (if preprocessing didn't resolve the error)
model.fit(X_train_tfidf, y_train)


# Option 2: Using TF-IDF vectors
accuracy_tfidf = model.score(X_test_tfidf, y_test)
print("Test accuracy (TF-IDF):", accuracy_tfidf)


In [None]:
model.save_model('D:\Mini Project\minipj\models\CatBoost1.h5')


In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Load data
data = pd.read_csv("D:\Mini Project\minipj\data\Suicide_Detection.csv")

# Preprocess text (remove special characters, convert to lowercase)
def clean_text(text):
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

data["text"] = data["text"].apply(clean_text)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data["text"], data["class"], test_size=0.2, random_state=42)

# Create a TF-IDF vectorizer (optional for numerical representation)
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


# Instantiate CatBoost classifier with modified hyperparameters
model = CatBoostClassifier(iterations=1500,  # Increased iterations for potential better convergence
                            loss_function='Logloss',
                            learning_rate=0.05,  # Experiment with different learning rates
                            depth=8,  # Adjust tree depth to balance complexity and overfitting
                            l2_leaf_reg=3,  # Regularization to prevent overfitting
                            logging_level='Verbose',
                            eval_metric='Accuracy'  # Set evaluation metric explicitly
                            )

# Train the model with TF-IDF vectors
model.fit(X_train_tfidf, y_train, eval_set=(X_test_tfidf, y_test))

# Option 2: Using TF-IDF vectors
accuracy_tfidf = model.score(X_test_tfidf, y_test)
print("Test accuracy (TF-IDF):", accuracy_tfidf)

0:	learn: 0.8453293	test: 0.8427017	best: 0.8427017 (0)	total: 12.3s	remaining: 5h 8m 12s
1:	learn: 0.8481356	test: 0.8450285	best: 0.8450285 (1)	total: 22.7s	remaining: 4h 42m 49s
2:	learn: 0.8482110	test: 0.8459981	best: 0.8459981 (2)	total: 28.7s	remaining: 3h 59m 4s
3:	learn: 0.8514104	test: 0.8486265	best: 0.8486265 (3)	total: 33.9s	remaining: 3h 31m 6s
4:	learn: 0.8522452	test: 0.8505225	best: 0.8505225 (4)	total: 39.1s	remaining: 3h 15m 4s
5:	learn: 0.8538665	test: 0.8519013	best: 0.8519013 (5)	total: 44.2s	remaining: 3h 3m 20s
6:	learn: 0.8537480	test: 0.8519013	best: 0.8519013 (5)	total: 49s	remaining: 2h 54m 2s
7:	learn: 0.8549491	test: 0.8532586	best: 0.8532586 (7)	total: 53.8s	remaining: 2h 47m 7s
8:	learn: 0.8552292	test: 0.8537757	best: 0.8537757 (8)	total: 58.6s	remaining: 2h 41m 54s
9:	learn: 0.8552023	test: 0.8537326	best: 0.8537757 (8)	total: 1m 3s	remaining: 2h 37m 51s
10:	learn: 0.8564519	test: 0.8547452	best: 0.8547452 (10)	total: 1m 8s	remaining: 2h 34m 47s
11:	le

In [2]:
model.save_model('D:\Mini Project\minipj\models\CatBoost2_93.h5')


In [3]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")