In [43]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [44]:
#printing stopwords
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
#data preprocessing
mh_data=pd.read_csv('suicide.csv',encoding='unicode_escape')
#printing first five lines
print(mh_data.head())

   Unnamed: 0                                               text        class
0           2  Ex Wife Threatening SuicideRecently I left my ...      suicide
1           3  Am I weird I don't get affected by compliments...  non-suicide
2           4  Finally 2020 is almost over... So I can never ...  non-suicide
3           8          i need helpjust help me im crying so hard      suicide
4           9  Iâm so lostHello, my name is Adam (16) and I...      suicide


In [46]:
#checking for missing values
print(mh_data.isnull().sum())

Unnamed: 0    0
text          0
class         0
dtype: int64


In [47]:
#replacing missing values with null string
mh_data=mh_data.fillna('')
print(mh_data.isnull().sum())

Unnamed: 0    0
text          0
class         0
dtype: int64


In [48]:
#separating feature and target
X=mh_data.drop(columns='class',axis=1)
Y=mh_data['class']
print(Y)

0           suicide
1       non-suicide
2       non-suicide
3           suicide
4           suicide
           ...     
3361    non-suicide
3362        suicide
3363        suicide
3364        suicide
3365        suicide
Name: class, Length: 3366, dtype: object


In [49]:
print(mh_data['text'])

0       Ex Wife Threatening SuicideRecently I left my ...
1       Am I weird I don't get affected by compliments...
2       Finally 2020 is almost over... So I can never ...
3               i need helpjust help me im crying so hard
4       Iâm so lostHello, my name is Adam (16) and I...
                              ...                        
3361    How do I change my reddit username? Is so crin...
3362    WelpIâve never felt so close to going thru w...
3363    i really wish to feel happy again soon, i have...
3364    Is there actually any hope?Everything seems pr...
3365    Planning on bringing a weapon to schoolJust th...
Name: text, Length: 3366, dtype: object


In [50]:
#stemming
port_stem=PorterStemmer()
def stemming(content):
    stemmed_content=re.sub('[^a-zA-Z]',' ',content)
    stemmed_content=stemmed_content.lower()
    stemmed_content=stemmed_content.split()
    #stemmed_content=[port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content=' '.join(stemmed_content)
    return stemmed_content
mh_data['text']=mh_data['text'].apply(stemming)
print(mh_data['text'])

0       ex wife threatening suiciderecently i left my ...
1       am i weird i don t get affected by compliments...
2       finally is almost over so i can never hear has...
3               i need helpjust help me im crying so hard
4       i m so losthello my name is adam and i ve been...
                              ...                        
3361      how do i change my reddit username is so cringe
3362    welpi ve never felt so close to going thru wit...
3363    i really wish to feel happy again soon i haven...
3364    is there actually any hope everything seems pr...
3365    planning on bringing a weapon to schooljust th...
Name: text, Length: 3366, dtype: object


In [51]:
#X has features and Y has labes
X=mh_data['text'].values
Y=mh_data['class'].values
print(X)
print(Y)

['ex wife threatening suiciderecently i left my wife for good because she has cheated on me twice and lied to me so much that i have decided to refuse to go back to her as of a few days ago she began threatening suicide i have tirelessly spent these paat few days talking her out of it and she keeps hesitating because she wants to believe i ll come back i know a lot of people will threaten this in order to get their way but what happens if she really does what do i do and how am i supposed to handle her death on my hands i still love my wife but i cannot deal with getting cheated on again and constantly feeling insecure i m worried today may be the day she does it and i hope so much it doesn t happen'
 'am i weird i don t get affected by compliments if it s coming from someone i know irl but i feel really good when internet strangers do it'
 'finally is almost over so i can never hear has been a bad year ever again i swear to fucking god it s so annoying'
 ...
 'i really wish to feel ha

In [52]:
print(X.shape)

(3366,)


In [53]:
print(Y.shape)

(3366,)


In [54]:
#converting textual data to numberical data
vectorizer=TfidfVectorizer()
vectorizer.fit(X)
X=vectorizer.transform(X)
print(X)

  (0, 14209)	0.09855531763979
  (0, 14138)	0.038245607740739165
  (0, 14095)	0.04938588283799427
  (0, 14085)	0.3103227913868508
  (0, 14015)	0.084105049417921
  (0, 13921)	0.058202334089033365
  (0, 13874)	0.08089270211024918
  (0, 13272)	0.09705288555116932
  (0, 12958)	0.06684312335589175
  (0, 12949)	0.17397379241312555
  (0, 12922)	0.1505978632497014
  (0, 12827)	0.24077620210884765
  (0, 12825)	0.1451979612531907
  (0, 12795)	0.034454642327197374
  (0, 12754)	0.07015021230104589
  (0, 12717)	0.06440902125485007
  (0, 12711)	0.027259889059033298
  (0, 12707)	0.033343940235258746
  (0, 12540)	0.07729325731019923
  (0, 12376)	0.08782897902672163
  (0, 12323)	0.1505978632497014
  (0, 12314)	0.06063082974421
  (0, 12076)	0.060169869537910925
  (0, 11865)	0.08593986174123383
  (0, 11680)	0.07053832063544524
  :	:
  (3365, 8580)	0.06738745598278126
  (3365, 8563)	0.15284914695273985
  (3365, 8298)	0.19985231024935204
  (3365, 8294)	0.0863397200611804
  (3365, 7803)	0.0464253457258241
  

In [55]:
#converting the labels values into numberics
from sklearn import preprocessing
label_encoder=preprocessing.LabelEncoder()
mh_data['class']=label_encoder.fit_transform(mh_data['class'])
mh_data['class'].unique()


array([2, 1, 0])

In [56]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [57]:
model = KNeighborsClassifier()

In [58]:
model.fit(X_train, Y_train)

KNeighborsClassifier()

In [59]:
from sklearn.metrics import accuracy_score

In [60]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [61]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9453937592867756


In [62]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [63]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.8323442136498517
