In [1]:
# import nltk
import os

import numpy as np
import pandas as pd
from gensim.utils import simple_preprocess
from keras.callbacks import ModelCheckpoint
from keras.layers import Embedding,LSTM,Dense,BatchNormalization,ReLU,Softmax
from keras.models import Sequential,load_model
from keras.preprocessing.text import one_hot
from keras.utils import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [2]:
# nltk.download('wordnet')

In [2]:
# Checkpoint callback handed to model.fit: 'nlp_model.h5' is (re)written only
# when the monitored validation accuracy beats the best value seen so far.
checkpoint = ModelCheckpoint(
    'nlp_model.h5',
    monitor='val_accuracy', mode='max',
    save_best_only=True, verbose=1,
)

In [3]:
# Location of the dataset CSV. The original local path stays as the default so
# existing runs behave the same; set the ANXIETY_DATASET_CSV environment
# variable to point the notebook at a copy elsewhere without editing this cell.
DATASET_CSV = os.environ.get(
    "ANXIETY_DATASET_CSV",
    "D:\\Data sets\\Students anxiety and depression dataset\\dataset.csv",
)

# The file is decoded as ISO-8859-1; its two columns are renamed to the names
# every later cell relies on ('text' = sentence, 'result' = label).
df = pd.read_csv(DATASET_CSV, encoding="ISO-8859-1")
df.columns = ['text', 'result']
df.head(10)

Unnamed: 0,text,result
0,"trouble sleeping, confused mind, restless hear...",1.0
1,"All wrong, back off dear, forward doubt. Stay ...",1.0
2,I've shifted my focus to something else but I'...,1.0
3,"I'm restless and restless, it's been a month n...",1.0
4,"every break, you must be nervous, like somethi...",1.0
5,"I feel scared, anxious, what can I do? And may...",1.0
6,Have you ever felt nervous but didn't know why?,1.0
7,"I haven't slept well for 2 days, it's like I'm...",1.0
8,"I'm really worried, I want to cry.",1.0
9,"always restless every night, even though I don...",1.0


In [5]:
# df.info()

In [4]:
# Tokenise each raw sentence with gensim's simple_preprocess; afterwards the
# 'text' column holds one list of tokens per row (see the preview below).
# Note: this overwrites the raw text, so the cell is not safe to run twice.
df['text'] = df['text'].map(simple_preprocess)
df.head()

Unnamed: 0,text,result
0,"[trouble, sleeping, confused, mind, restless, ...",1.0
1,"[all, wrong, back, off, dear, forward, doubt, ...",1.0
2,"[ve, shifted, my, focus, to, something, else, ...",1.0
3,"[restless, and, restless, it, been, month, now...",1.0
4,"[every, break, you, must, be, nervous, like, s...",1.0


In [5]:
lemmertizer=WordNetLemmatizer()

# Built once up front. Previously the stop-word list was re-read and re-wrapped
# in a set for every single token inside the comprehension.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def lem(list):
    """Drop English stop words, lemmatize the rest, and join into one string.

    Args:
        list: Iterable of word tokens. The parameter name is kept for
            backward compatibility; it shadows the builtin ``list`` inside
            this function only.

    Returns:
        str: The surviving lemmas joined by single spaces ('' when every
        token is a stop word or the input is empty).
    """
    return ' '.join(
        lemmertizer.lemmatize(word)
        for word in list
        if word not in _ENGLISH_STOPWORDS
    )

In [6]:
# Collapse each token list back into a single cleaned string
# (stop words removed, remaining words lemmatized by lem).
df['text'] = df['text'].map(lem)
df.head()

Unnamed: 0,text,result
0,trouble sleeping confused mind restless heart ...,1.0
1,wrong back dear forward doubt stay restless re...,1.0
2,shifted focus something else still worried,1.0
3,restless restless month boy mean,1.0
4,every break must nervous like something wrong ...,1.0


In [9]:
# Vocabulary size shared by the one_hot encoding and the Embedding layer.
voc_size = 5_000

In [10]:
# Encode every cleaned sentence as a list of integer word ids below voc_size.
# NOTE(review): keras' one_hot is documented as hashing words with Python's
# built-in hash(), which is salted per interpreter process unless
# PYTHONHASHSEED is fixed. Confirm the ids are reproducible before reusing a
# saved model from a fresh kernel.
one_hot_rep = list(map(lambda sentence: one_hot(sentence, voc_size), df['text']))
len(one_hot_rep)

6981

In [11]:
# Fixed sequence length fed to the model; sequences shorter than this are
# zero-padded at the front ('pre').
em_len = 50
embeded_list = pad_sequences(one_hot_rep, maxlen=em_len, padding='pre')
embeded_list

array([[   0,    0,    0, ..., 2837, 1239, 3204],
       [   0,    0,    0, ..., 2837, 2837, 4089],
       [   0,    0,    0, ..., 2238, 3421, 3098],
       ...,
       [   0,    0,    0, ...,    0,    0, 3254],
       [   0,    0,    0, ..., 3603, 2938, 1195],
       [   0,    0,    0, ..., 1363,  348, 4029]])

In [12]:
# Width of each learned word vector.
embedding_vec_features = 50

# Embedding -> two stacked LSTMs (the first returns the full sequence so the
# second can consume it) -> small dense head ending in a single sigmoid unit.
# Batch normalisation follows each LSTM and the hidden Dense layer.
model = Sequential([
    Embedding(voc_size, embedding_vec_features, input_length=em_len),
    LSTM(128, return_sequences=True),
    BatchNormalization(),
    LSTM(128),
    BatchNormalization(),
    Dense(64),
    BatchNormalization(),
    ReLU(),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 50)            250000    
                                                                 
 lstm (LSTM)                 (None, 50, 128)           91648     
                                                                 
 batch_normalization (BatchN  (None, 50, 128)          512       
 ormalization)                                                   
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 batch_normalization_1 (Batc  (None, 128)              512       
 hNormalization)                                                 
                                                                 
 dense (Dense)               (None, 64)                8

In [13]:
# Features: the padded id matrix. Labels: the 'result' column as integers,
# with missing values treated as 0.
# NOTE(review): fillna(0) silently labels rows with no 'result' as class 0.
# Confirm that is intended rather than dropping those rows.
X = np.array(embeded_list)
y = df['result'].fillna(0).map(int)
X.shape

(6981, 50)

In [14]:
# Sanity check: the label vector should have one entry per row of X.
y.shape

(6981,)

In [15]:
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [16]:
# Train for 30 epochs in mini-batches of 32. The held-out split doubles as the
# validation set, so the checkpoint callback picks its "best" model on it.
# NOTE(review): the recorded output below shows "/40" epochs, so it appears to
# come from a run made before epochs was set to 30. Re-run to refresh it.
model.fit(
    x_train, y_train,
    batch_size=32,
    epochs=30,
    validation_data=(x_test, y_test),
    callbacks=[checkpoint],
)

Epoch 1/40
Epoch 1: val_accuracy improved from -inf to 0.88618, saving model to nlp_model.h5
Epoch 2/40
Epoch 2: val_accuracy did not improve from 0.88618
Epoch 3/40
Epoch 3: val_accuracy improved from 0.88618 to 0.91696, saving model to nlp_model.h5
Epoch 4/40
Epoch 4: val_accuracy improved from 0.91696 to 0.94345, saving model to nlp_model.h5
Epoch 5/40
Epoch 5: val_accuracy improved from 0.94345 to 0.98282, saving model to nlp_model.h5
Epoch 6/40
Epoch 6: val_accuracy did not improve from 0.98282
Epoch 7/40
Epoch 7: val_accuracy improved from 0.98282 to 0.98425, saving model to nlp_model.h5
Epoch 8/40
Epoch 8: val_accuracy did not improve from 0.98425
Epoch 9/40
Epoch 9: val_accuracy did not improve from 0.98425
Epoch 10/40
Epoch 10: val_accuracy did not improve from 0.98425
Epoch 11/40
Epoch 11: val_accuracy did not improve from 0.98425
Epoch 12/40
Epoch 12: val_accuracy did not improve from 0.98425
Epoch 13/40
Epoch 13: val_accuracy did not improve from 0.98425
Epoch 14/40
Epoch 1

<keras.callbacks.History at 0x24b7cd88d30>

In [17]:
# Sample sentence for a quick end-to-end check of the trained model
# (the leading and trailing newline of the original literal are preserved).
test_sent = (
    "\n"
    "I've been the most anxious person before I became the most sincere person, I've also been the most worried person before it all ended"
    "\n"
)

In [18]:
def preprocess_text(text):
    """Turn one raw sentence into the padded id array the model is fed.

    Applies the same steps used on the training column: gensim
    ``simple_preprocess`` tokenisation, ``lem`` (stop-word removal plus
    lemmatisation), ``one_hot`` encoding with ``voc_size``, and front-padding
    to ``em_len``.

    Args:
        text: Raw sentence string.

    Returns:
        The array produced by ``pad_sequences`` for this single sentence
        (one row of ``em_len`` ids).
    """
    # Reuse lem() instead of repeating its comprehension here, so the cleaning
    # applied at inference time cannot drift from the one used for training.
    words = lem(simple_preprocess(text))
    one_hot_rep = [one_hot(words, voc_size)]
    return pad_sequences(one_hot_rep, padding='pre', maxlen=em_len)

In [19]:
# Reload the checkpoint written during training (this replaces the in-memory
# model) and score the sample sentence with it.
model = load_model(filepath='nlp_model.h5')
pred = model.predict(x=preprocess_text(test_sent))



In [20]:
# Threshold the single sigmoid score at 0.5 and report the predicted class.
print("has dipressed" if pred > 0.5 else 'not dipressed')

has dipressed


In [21]:
# Show the raw sigmoid score that the verdict in the previous cell was based on.
print(pred)

[[0.99953616]]
