In [1]:
# Import necessary libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
#loading the dataset
df = pd.read_csv('/content/Suicide_Ideation_Dataset(Twitter-based).csv')
df.head()

Unnamed: 0,Tweet,Suicide
0,making some lunch,Not Suicide post
1,@Alexia You want his money.,Not Suicide post
2,@dizzyhrvy that crap took me forever to put to...,Potential Suicide post
3,@jnaylor #kiwitweets Hey Jer! Since when did y...,Not Suicide post
4,Trying out &quot;Delicious Library 2&quot; wit...,Not Suicide post


In [3]:
# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@w+|\#','', text)  # Remove mentions and hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return " ".join(tokens)

In [7]:
df['Suicide'].isna().sum()


0

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1787 entries, 0 to 1786
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Tweet    1785 non-null   object
 1   Suicide  1787 non-null   object
dtypes: object(2)
memory usage: 28.1+ KB


In [11]:
import numpy as np

# Replace any "NaN" string values with actual np.nan
df.replace("NaN", np.nan, inplace=True)

# Now drop NaN values
df.dropna(subset=['Suicide'], inplace=True)
df.dropna(subset=['Tweet'], inplace=True)

In [12]:
print(df.isna().sum())  # Should print 0 for all columns


Tweet      0
Suicide    0
dtype: int64


In [13]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [14]:
df['clean_tweet'] = df['Tweet'].astype(str).apply(clean_text)

# Encode target variable
df['Suicide'] = df['Suicide'].map({'Not Suicide post': 0, 'Potential Suicide post': 1})

In [16]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(df['Tweet'], df['Suicide'], test_size=0.2, random_state=42)


In [17]:
# Convert text data to numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [18]:
# Train different ML models
models = {
    "Logistic Regression": LogisticRegression(),
    "Naïve Bayes": MultinomialNB(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier()
}



In [20]:
# Train Naive Bayes Model
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

ValueError: Input y contains NaN.

In [21]:
# Predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate Model
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

NotFittedError: This MultinomialNB instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [22]:
print("NaN values in y_train:", y_train.isna().sum())
print("NaN values in y_test:", y_test.isna().sum())
print(y_train.head(20))  # Check first 20 values

NaN values in y_train: 518
NaN values in y_test: 141
1725    NaN
175     0.0
887     NaN
1606    0.0
481     0.0
1085    NaN
1749    NaN
1475    0.0
880     NaN
450     0.0
198     NaN
15      0.0
265     NaN
549     NaN
681     0.0
1781    NaN
1383    0.0
1001    0.0
746     0.0
530     NaN
Name: Suicide, dtype: float64


In [23]:
# Drop NaN values from 'Suicide' column before splitting
df = df.dropna(subset=['Suicide']).reset_index(drop=True)


In [24]:
print("NaN values in y_train:", y_train.isna().sum())
print("NaN values in y_test:", y_test.isna().sum())
print(y_train.head(20))  # Check first 20 values

NaN values in y_train: 518
NaN values in y_test: 141
1725    NaN
175     0.0
887     NaN
1606    0.0
481     0.0
1085    NaN
1749    NaN
1475    0.0
880     NaN
450     0.0
198     NaN
15      0.0
265     NaN
549     NaN
681     0.0
1781    NaN
1383    0.0
1001    0.0
746     0.0
530     NaN
Name: Suicide, dtype: float64
