In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
#printing stopwords
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to C:\Users\SUDHIR
[nltk_data]     KUTRE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#data preprocessing
mh_data=pd.read_csv('Sdata2.csv',encoding='unicode_escape')
#printing first five lines
print(mh_data.head())

       id                                               text        class  \
0  302034  I made a grave mistake I donât remember the ...  non-suicide   
1  302035  What series you like. I have watched all my fa...  non-suicide   
2  302036  Guys I did it! I lost my virginity but it wasn...  non-suicide   
3  302037  This guy like me or no? So, basically I have t...  non-suicide   
4  302040  I have no hopeMy ex boyfriend cheated on me an...      suicide   

  Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7  
0        NaN        NaN        NaN        NaN        NaN  
1        NaN        NaN        NaN        NaN        NaN  
2        NaN        NaN        NaN        NaN        NaN  
3        NaN        NaN        NaN        NaN        NaN  
4        NaN        NaN        NaN        NaN        NaN  


In [5]:
#checking for missing values
print(mh_data.isnull().sum())

id               69
text            107
class           129
Unnamed: 3    30928
Unnamed: 4    30931
Unnamed: 5    30932
Unnamed: 6    30932
Unnamed: 7    30932
dtype: int64


In [6]:
#replacing missing values with null string
mh_data=mh_data.fillna('')
print(mh_data.isnull().sum())

id            0
text          0
class         0
Unnamed: 3    0
Unnamed: 4    0
Unnamed: 5    0
Unnamed: 6    0
Unnamed: 7    0
dtype: int64


In [7]:
#separating feature and target
X=mh_data.drop(columns='class',axis=1)
Y=mh_data['class']
print(Y)

0        non-suicide
1        non-suicide
2        non-suicide
3        non-suicide
4            suicide
            ...     
30928    non-suicide
30929    non-suicide
30930    non-suicide
30931        suicide
30932    non-suicide
Name: class, Length: 30933, dtype: object


In [8]:
print(mh_data['text'])

0        I made a grave mistake I donât remember the ...
1        What series you like. I have watched all my fa...
2        Guys I did it! I lost my virginity but it wasn...
3        This guy like me or no? So, basically I have t...
4        I have no hopeMy ex boyfriend cheated on me an...
                               ...                        
30928    If you don't like rock then your not going to ...
30929    You how you can tell i have so many friends an...
30930    pee probably tastes like salty teaðð¦â¼ï...
30931    The usual stuff you find hereI'm not posting t...
30932    I still haven't beaten the first boss in Hollo...
Name: text, Length: 30933, dtype: object


In [9]:
#stemming
port_stem=PorterStemmer()
def stemming(content):
    stemmed_content=re.sub('[^a-zA-Z]',' ',content)
    stemmed_content=stemmed_content.lower()
    stemmed_content=stemmed_content.split()
    stemmed_content=[port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content=' '.join(stemmed_content)
    return stemmed_content
mh_data['text']=mh_data['text'].apply(stemming)
print(mh_data['text'])

0        made grave mistak rememb post know someth lgbt...
1        seri like watch favorit seri multipl time wond...
2        guy lost virgin cool thought pp went soft minu...
3        guy like basic friend keep give littl hint lik...
4        hopemi ex boyfriend cheat gave genit herp hard...
                               ...                        
30928    like rock go get anyth go http musictast space...
30929    tell mani friend lone everyth depriv pre bough...
30930    pee probabl tast like salti tea someon drank p...
30931    usual stuff find herei post sympathi piti know...
30932    still beaten first boss hollow knight fought t...
Name: text, Length: 30933, dtype: object


In [10]:
#X has features and Y has labes
X=mh_data['text'].values
Y=mh_data['class'].values
print(X)
print(Y)

['made grave mistak rememb post know someth lgbt commun made comment tri say someth along line straight like thought gay person whatev hell want came across homophob lost know care whether peopl get exactli word well'
 'seri like watch favorit seri multipl time wonder guy favorit netflix mabi calm mabi watch brand new netflix seri'
 'guy lost virgin cool thought pp went soft minut fuck' ...
 'pee probabl tast like salti tea someon drank pee confirm'
 'usual stuff find herei post sympathi piti know far wors situat mine want get stuff seem life point everyth done life ruin quit isol everyon even famili even like tell famili would help consid psychot probabl right know sens fuck univers want think seem like univers fuck made know made fuck think know want get peopl tri help went rough patch got tough done tough life tough look around famili sinc youngest seen ridicul shit happen fuck post area first time felt like tri take life bitch know mean serious cruel joke play despis life want know

In [11]:
print(X.shape)

(30933,)


In [12]:
print(Y.shape)

(30933,)


In [13]:
#converting textual data to numberical data
vectorizer=TfidfVectorizer()
vectorizer.fit(X)
X=vectorizer.transform(X)
print(X)

  (0, 33346)	0.16764732931605258
  (0, 32969)	0.20013566234915206
  (0, 32927)	0.16813274028133982
  (0, 32847)	0.12860531529127242
  (0, 32561)	0.07493371584028177
  (0, 30598)	0.09616400940755265
  (0, 29776)	0.11056144127731336
  (0, 28207)	0.18697310766669029
  (0, 27382)	0.22563559355654228
  (0, 25706)	0.10499459779454338
  (0, 24693)	0.15744866718336942
  (0, 23015)	0.12062648687212743
  (0, 22282)	0.11710097501358104
  (0, 22177)	0.09234323013754775
  (0, 19071)	0.19111574650662655
  (0, 17964)	0.25139820058018997
  (0, 17695)	0.14116393706275365
  (0, 17301)	0.1947647240016349
  (0, 17256)	0.07267108614971887
  (0, 17106)	0.27394605424338037
  (0, 16550)	0.16030322715683049
  (0, 14030)	0.2550972372929163
  (0, 13456)	0.16037074750533517
  (0, 12596)	0.2712413080802721
  (0, 12046)	0.07943307302901155
  :	:
  (30931, 5587)	0.052526327876709016
  (30931, 5546)	0.06661680997928646
  (30931, 4323)	0.0562801442275847
  (30931, 3629)	0.0887660890769362
  (30931, 3211)	0.08481780824

In [14]:
#converting the labels values into numberics
#from sklearn import preprocessing
#label_encoder=preprocessing.LabelEncoder()
#mh_data['class']=label_encoder.fit_transform(mh_data['class'])
#mh_data['class'].unique()


In [21]:
#splitting dataset into training and testing
X_train,X_test,Y_train,Y_test= train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [17]:
#importing libraries to fit the model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [18]:
#training logistic regression model
model=LogisticRegression()
model.fit(X_train,Y_train)

NameError: name 'X_train' is not defined

In [None]:
#evaluating accuracy score on training data
X_train_prediction=model.predict(X_train)
training_data_accuracy=accuracy_score(X_train_prediction,Y_train)

In [None]:
print("Accuracy score of the training data ",training_data_accuracy)

In [None]:
# evaluating accuarcy score on the testing data
X_test_prediction=model.predict(X_test)
testing_data_accuracy=accuracy_score(X_test_prediction,Y_test)

In [None]:
print("Accuracy score of the testing data ",testing_data_accuracy)

In [None]:
#Making a predictive system
X_new=X_test[55]
prediction=model.predict(X_new)
print(prediction)
print(Y_test[55])