In [79]:
# Importando as bibliotecas
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,LSTM,Embedding
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import string
import emoji
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau

In [80]:
# Carregar o dataset

df=pd.read_csv('sentiment_tweets3.csv')

df.head(10)

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0
5,624,so sleepy. good times tonight though,0
6,701,"@SilkCharm re: #nbn as someone already said, d...",0
7,808,23 or 24ï¿½C possible today. Nice,0
8,1193,nite twitterville workout in the am -ciao,0
9,1324,"@daNanner Night, darlin'! Sweet dreams to you",0


In [81]:
# Verificar o tamanho do dataset
df.shape

(10314, 3)

In [82]:
# Verificar se há valores nulos
df.isna().sum()

Index                        0
message to examine           0
label (depression result)    0
dtype: int64

In [83]:
# Renomear as colunas
df.columns=['Indice','Texto','IndicadorDepressao']
df.head(10)


Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0
5,624,so sleepy. good times tonight though,0
6,701,"@SilkCharm re: #nbn as someone already said, d...",0
7,808,23 or 24ï¿½C possible today. Nice,0
8,1193,nite twitterville workout in the am -ciao,0
9,1324,"@daNanner Night, darlin'! Sweet dreams to you",0


In [84]:
# Verificar o tipo de dados de cada coluna
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10314 entries, 0 to 10313
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Indice              10314 non-null  int64 
 1   Texto               10314 non-null  object
 2   IndicadorDepressao  10314 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 241.9+ KB


In [85]:
# Verificar a quantidade de valores únicos na coluna 'Texto'
df['Texto'].nunique()

10282

In [86]:
# Converter o texto para minúsculas
df['Texto']=df['Texto'].str.lower()
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat need to send 'em to my accountant tomo...,0
4,540,add me on myspace!!! myspace.com/lookthunder,0
5,624,so sleepy. good times tonight though,0
6,701,"@silkcharm re: #nbn as someone already said, d...",0
7,808,23 or 24ï¿½c possible today. nice,0
8,1193,nite twitterville workout in the am -ciao,0
9,1324,"@dananner night, darlin'! sweet dreams to you",0


In [87]:
# Remover URLs
def remove_URL(text):
    return re.sub(r'http\S+|www\S' , '',text)

df['Texto'] = df['Texto'].apply(remove_URL)
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga,0
2,220,@comeagainjen -,0
3,288,@lapcat need to send 'em to my accountant tomo...,0
4,540,add me on myspace!!! myspace.com/lookthunder,0
5,624,so sleepy. good times tonight though,0
6,701,"@silkcharm re: #nbn as someone already said, d...",0
7,808,23 or 24ï¿½c possible today. nice,0
8,1193,nite twitterville workout in the am -ciao,0
9,1324,"@dananner night, darlin'! sweet dreams to you",0


In [88]:
# Remover pontuação
stri=string.punctuation
stri 

def remove_punc(text):
    return text.translate(str.maketrans('','',string.punctuation))

df['Texto']=df['Texto'].apply(remove_punc)
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,just had a real good moment i missssssssss him...,0
1,217,is reading manga,0
2,220,comeagainjen,0
3,288,lapcat need to send em to my accountant tomorr...,0
4,540,add me on myspace myspacecomlookthunder,0
5,624,so sleepy good times tonight though,0
6,701,silkcharm re nbn as someone already said does ...,0
7,808,23 or 24ï¿½c possible today nice,0
8,1193,nite twitterville workout in the am ciao,0
9,1324,dananner night darlin sweet dreams to you,0


In [89]:
#importa o dicionário de abreviações e gírias comuns de chat/mensagens
from chatWords import chat_words

#Substitui abreviações e gírias comuns de chat/mensagens por suas formas completas.
def chat_word(text):
    for a,b in chat_words.items():
        text=text.replace(a,b)
    return text

df['Texto']=df['Texto'].apply(chat_word)

In [90]:
#Remover stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

df['Texto']=df['Texto'].apply(remove_stopwords)
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,real good moment missssssssss much,0
1,217,reading manga,0
2,220,comeagainjen,0
3,288,lapcat need send em accountant tomorrow oddly ...,0
4,540,add myspace myspacecomlookthunder,0
5,624,sleepy good times tonight though,0
6,701,silkcharm nbn someone already said fiber home ...,0
7,808,23 24ï¿½c possible today nice,0
8,1193,nite twitterville workout ciao,0
9,1324,dananner night darlin sweet dreams,0


In [91]:
# Remover emojis
def remove_ej(text):
    return emoji.demojize(text)

df['Texto'] = df['Texto'].apply(remove_ej)
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,real good moment missssssssss much,0
1,217,reading manga,0
2,220,comeagainjen,0
3,288,lapcat need send em accountant tomorrow oddly ...,0
4,540,add myspace myspacecomlookthunder,0
5,624,sleepy good times tonight though,0
6,701,silkcharm nbn someone already said fiber home ...,0
7,808,23 24ï¿½c possible today nice,0
8,1193,nite twitterville workout ciao,0
9,1324,dananner night darlin sweet dreams,0


In [92]:
# Lematização
wordNet=WordNetLemmatizer()
def apply_lemmatization(text):
    words = text.split()
    return ' '.join([wordNet.lemmatize(word,pos='v') for word in words])

df['LematizaçãoTexto'] = df['Texto'].apply(apply_lemmatization)
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao,LematizaçãoTexto
0,106,real good moment missssssssss much,0,real good moment missssssssss much
1,217,reading manga,0,read manga
2,220,comeagainjen,0,comeagainjen
3,288,lapcat need send em accountant tomorrow oddly ...,0,lapcat need send em accountant tomorrow oddly ...
4,540,add myspace myspacecomlookthunder,0,add myspace myspacecomlookthunder
5,624,sleepy good times tonight though,0,sleepy good time tonight though
6,701,silkcharm nbn someone already said fiber home ...,0,silkcharm nbn someone already say fiber home m...
7,808,23 24ï¿½c possible today nice,0,23 24ï¿½c possible today nice
8,1193,nite twitterville workout ciao,0,nite twitterville workout ciao
9,1324,dananner night darlin sweet dreams,0,dananner night darlin sweet dream


In [93]:
# Separação dos dados em treino e teste
x_train,x_test,y_train,y_test=train_test_split(df['LematizaçãoTexto'],df['IndicadorDepressao'],test_size=0.20,random_state=42)

x_train.shape,x_test.shape,y_train.shape,y_test.shape

((8251,), (2063,), (8251,), (2063,))

In [94]:
# Criar o tokenizer
tokenizer = Tokenizer(
    num_words=10000,  # Limitar o tamanho do vocabulário
    oov_token='nothing'
)

# Ajustar apenas com dados de treino
tokenizer.fit_on_texts(x_train)

# Converter textos para sequências
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

tokenizer.document_count

8251

In [95]:
# Fazer padding das sequências
maxlen = max(len(tokens) for tokens in x_train_seq)

padded_train_Seq=pad_sequences(x_train_seq,maxlen=maxlen,padding='post')

padded_test_Seq=pad_sequences(x_test_seq,maxlen=maxlen,padding='post')

print("Maximum sequence length (maxlen):", maxlen)

Maximum sequence length (maxlen): 75


In [96]:
# Slicing
padded_train_Seq[:3]

array([[  17,  436,  968, 1051,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   5, 3080,  479, 1640,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,

In [97]:
padded_test_Seq[:3]

array([[ 159,  379,  129,   51,  224,  159,  509, 3423,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [ 159,  650,  215,   25,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,

In [98]:
y_test.value_counts()

IndicadorDepressao
0    1614
1     449
Name: count, dtype: int64

In [99]:
y_train.value_counts()

IndicadorDepressao
0    6386
1    1865
Name: count, dtype: int64

In [100]:
input_Size = np.max(padded_train_Seq) + 1

input_Size

np.int32(10000)

In [101]:
# Define the model

model = Sequential()



# Use LSTM instead of SimpleRNN for better capturing long-term dependencies

model.add(LSTM(128, input_shape=(75,1), return_sequences=True))  



# Add dropout regularization

model.add(Dropout(0.5))



# Add another LSTM layer

model.add(LSTM(128)) 



# Add dropout regularization

model.add(Dropout(0.5))  



# Add a dense layer with ReLU activation

model.add(Dense(64, activation='relu'))  



# Output layer with sigmoid activation for binary classification

model.add(Dense(1, activation='sigmoid')) 

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['precision', 'recall', 'f1_score'])

  super().__init__(**kwargs)


In [102]:
model.summary()

In [103]:
ES=EarlyStopping(monitor='val_loss',patience=5,restore_best_weights=True)

LR=ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6)

In [104]:
model.fit(padded_train_Seq,y_train,validation_split=0.2, epochs=50,batch_size=32,callbacks=[ES,LR])

Epoch 1/50
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 43ms/step - f1_score: 0.3640 - loss: 0.5330 - precision: 0.6172 - recall: 0.1449 - val_f1_score: 0.3564 - val_loss: 0.4319 - val_precision: 0.9885 - val_recall: 0.2402 - learning_rate: 0.0010
Epoch 2/50
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 49ms/step - f1_score: 0.3625 - loss: 0.4565 - precision: 0.8433 - recall: 0.2340 - val_f1_score: 0.3564 - val_loss: 0.4493 - val_precision: 0.6637 - val_recall: 0.4134 - learning_rate: 0.0010
Epoch 3/50
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 52ms/step - f1_score: 0.3807 - loss: 0.4642 - precision: 0.8184 - recall: 0.2866 - val_f1_score: 0.3564 - val_loss: 0.3831 - val_precision: 0.8323 - val_recall: 0.3743 - learning_rate: 0.0010
Epoch 4/50
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 52ms/step - f1_score: 0.3674 - loss: 0.3618 - precision: 0.7440 - recall: 0.5030 - val_f1_score: 0.3564 - val_lo

<keras.src.callbacks.history.History at 0x19adc9a25a0>

In [105]:
_,precision,recall,_=model.evaluate(padded_test_Seq,y_test)

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - f1_score: 0.3506 - loss: 0.0532 - precision: 0.9804 - recall: 0.9553


In [106]:
precision

0.9930232763290405

In [107]:
recall

0.9510022401809692

In [108]:
from sklearn.metrics import classification_report



predicted_labels = model.predict(padded_test_Seq)

predicted_labels = (predicted_labels > 0.5).astype(int) 





report = classification_report(y_test, predicted_labels, target_names=['Negative', 'Positive'])



print(report)

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step
              precision    recall  f1-score   support

    Negative       0.99      1.00      0.99      1614
    Positive       0.99      0.95      0.97       449

    accuracy                           0.99      2063
   macro avg       0.99      0.97      0.98      2063
weighted avg       0.99      0.99      0.99      2063



In [109]:
def predict_sentiment(new_text):

    new_text = new_text.lower()  

    new_text = remove_URL(new_text)

    new_text= remove_stopwords(new_text)

    new_text= remove_ej(new_text)

    new_text= remove_punc(new_text)

    sequence = tokenizer.texts_to_sequences([new_text])

    padded_sequence = pad_sequences(sequence, maxlen=maxlen, padding='post')

    prediction = model.predict(padded_sequence)

    predicted_label = (prediction > 0.5).astype(int)

    if predicted_label == 1 :
        print('person is depressed')
    else:
        print('person is not depressed')

    return predicted_label, prediction

In [122]:
predict_sentiment("dont mistake a bad day with depression! everyone has 'em!")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
person is depressed


(array([[1]]), array([[0.99995226]], dtype=float32))