In [84]:
# Importando as bibliotecas
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,LSTM,Embedding
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import string
import emoji
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau
from sklearn.utils import resample

In [85]:
# Carregar o dataset

df=pd.read_csv('sentiment_tweets3.csv')

df.head(10)

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0
5,624,so sleepy. good times tonight though,0
6,701,"@SilkCharm re: #nbn as someone already said, d...",0
7,808,23 or 24ï¿½C possible today. Nice,0
8,1193,nite twitterville workout in the am -ciao,0
9,1324,"@daNanner Night, darlin'! Sweet dreams to you",0


In [86]:
# Verificar o tamanho do dataset
df.shape

(10314, 3)

In [87]:
# Verificar se há valores nulos
df.isna().sum()

Index                        0
message to examine           0
label (depression result)    0
dtype: int64

In [88]:
# Renomear as colunas
df.columns=['Indice','Texto','IndicadorDepressao']
df.head(10)


Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0
5,624,so sleepy. good times tonight though,0
6,701,"@SilkCharm re: #nbn as someone already said, d...",0
7,808,23 or 24ï¿½C possible today. Nice,0
8,1193,nite twitterville workout in the am -ciao,0
9,1324,"@daNanner Night, darlin'! Sweet dreams to you",0


In [89]:
# Separar classes
df_majority = df[df.IndicadorDepressao == 0]
df_minority = df[df.IndicadorDepressao == 1]

# Upsample da classe minoritária
df_minority_upsampled = resample(
    df_minority,
    replace=True,  # amostragem com reposição
    n_samples=len(df_majority),  # para balancear
    random_state=42
)

# Concatenar de volta
df = pd.concat([df_majority, df_minority_upsampled])

In [90]:
# Verificar o tipo de dados de cada coluna
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16000 entries, 0 to 9054
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Indice              16000 non-null  int64 
 1   Texto               16000 non-null  object
 2   IndicadorDepressao  16000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 500.0+ KB


In [91]:
# Verificar a quantidade de valores únicos na coluna 'Texto'
df['Texto'].nunique()

10209

In [92]:
# Converter o texto para minúsculas
df['Texto']=df['Texto'].str.lower()
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat need to send 'em to my accountant tomo...,0
4,540,add me on myspace!!! myspace.com/lookthunder,0
5,624,so sleepy. good times tonight though,0
6,701,"@silkcharm re: #nbn as someone already said, d...",0
7,808,23 or 24ï¿½c possible today. nice,0
8,1193,nite twitterville workout in the am -ciao,0
9,1324,"@dananner night, darlin'! sweet dreams to you",0


In [93]:
# Remover URLs
def remove_URL(text):
    return re.sub(r'http\S+|www\S' , '',text)

df['Texto'] = df['Texto'].apply(remove_URL)
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga,0
2,220,@comeagainjen -,0
3,288,@lapcat need to send 'em to my accountant tomo...,0
4,540,add me on myspace!!! myspace.com/lookthunder,0
5,624,so sleepy. good times tonight though,0
6,701,"@silkcharm re: #nbn as someone already said, d...",0
7,808,23 or 24ï¿½c possible today. nice,0
8,1193,nite twitterville workout in the am -ciao,0
9,1324,"@dananner night, darlin'! sweet dreams to you",0


In [94]:
# Remover pontuação
stri=string.punctuation
stri 

def remove_punc(text):
    return text.translate(str.maketrans('','',string.punctuation))

df['Texto']=df['Texto'].apply(remove_punc)
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,just had a real good moment i missssssssss him...,0
1,217,is reading manga,0
2,220,comeagainjen,0
3,288,lapcat need to send em to my accountant tomorr...,0
4,540,add me on myspace myspacecomlookthunder,0
5,624,so sleepy good times tonight though,0
6,701,silkcharm re nbn as someone already said does ...,0
7,808,23 or 24ï¿½c possible today nice,0
8,1193,nite twitterville workout in the am ciao,0
9,1324,dananner night darlin sweet dreams to you,0


In [95]:
#importa o dicionário de abreviações e gírias comuns de chat/mensagens
from chatWords import chat_words

#Substitui abreviações e gírias comuns de chat/mensagens por suas formas completas.
def chat_word(text):
    for a,b in chat_words.items():
        text=text.replace(a,b)
    return text

df['Texto']=df['Texto'].apply(chat_word)

In [96]:
#Remover stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

df['Texto']=df['Texto'].apply(remove_stopwords)
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,real good moment missssssssss much,0
1,217,reading manga,0
2,220,comeagainjen,0
3,288,lapcat need send em accountant tomorrow oddly ...,0
4,540,add myspace myspacecomlookthunder,0
5,624,sleepy good times tonight though,0
6,701,silkcharm nbn someone already said fiber home ...,0
7,808,23 24ï¿½c possible today nice,0
8,1193,nite twitterville workout ciao,0
9,1324,dananner night darlin sweet dreams,0


In [97]:
# Remover emojis
def remove_ej(text):
    return emoji.demojize(text)

df['Texto'] = df['Texto'].apply(remove_ej)
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,real good moment missssssssss much,0
1,217,reading manga,0
2,220,comeagainjen,0
3,288,lapcat need send em accountant tomorrow oddly ...,0
4,540,add myspace myspacecomlookthunder,0
5,624,sleepy good times tonight though,0
6,701,silkcharm nbn someone already said fiber home ...,0
7,808,23 24ï¿½c possible today nice,0
8,1193,nite twitterville workout ciao,0
9,1324,dananner night darlin sweet dreams,0


In [98]:
# Lematização
wordNet=WordNetLemmatizer()
def apply_lemmatization(text):
    words = text.split()
    return ' '.join([wordNet.lemmatize(word,pos='v') for word in words])

df['LematizaçãoTexto'] = df['Texto'].apply(apply_lemmatization)
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao,LematizaçãoTexto
0,106,real good moment missssssssss much,0,real good moment missssssssss much
1,217,reading manga,0,read manga
2,220,comeagainjen,0,comeagainjen
3,288,lapcat need send em accountant tomorrow oddly ...,0,lapcat need send em accountant tomorrow oddly ...
4,540,add myspace myspacecomlookthunder,0,add myspace myspacecomlookthunder
5,624,sleepy good times tonight though,0,sleepy good time tonight though
6,701,silkcharm nbn someone already said fiber home ...,0,silkcharm nbn someone already say fiber home m...
7,808,23 24ï¿½c possible today nice,0,23 24ï¿½c possible today nice
8,1193,nite twitterville workout ciao,0,nite twitterville workout ciao
9,1324,dananner night darlin sweet dreams,0,dananner night darlin sweet dream


In [99]:
# Separação dos dados em treino e teste
x_train,x_test,y_train,y_test=train_test_split(df['LematizaçãoTexto'],df['IndicadorDepressao'],test_size=0.20,random_state=42)

x_train.shape,x_test.shape,y_train.shape,y_test.shape

((12800,), (3200,), (12800,), (3200,))

In [100]:
# Criar o tokenizer
tokenizer = Tokenizer(
    num_words=10000,  # Limitar o tamanho do vocabulário
    oov_token='nothing'
)

# Ajustar apenas com dados de treino
tokenizer.fit_on_texts(x_train)

# Converter textos para sequências
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

tokenizer.document_count

12800

In [101]:
# Fazer padding das sequências
maxlen = max(len(tokens) for tokens in x_train_seq)

padded_train_Seq=pad_sequences(x_train_seq,maxlen=maxlen,padding='post')

padded_test_Seq=pad_sequences(x_test_seq,maxlen=maxlen,padding='post')

print("Maximum sequence length (maxlen):", maxlen)

Maximum sequence length (maxlen): 75


In [102]:
# Slicing
padded_train_Seq[:3]

array([[2580,  168, 6247, 4884, 4884,   63,  163,  251, 1233,  654,  225,
         945,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [3724,    2, 1071,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,

In [103]:
padded_test_Seq[:3]

array([[1163, 7015, 4492,  259, 5633,   72,  481, 7016, 7017,    2,  142,
         665,  300,   52, 5116, 1690,  307, 7018, 7019,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [ 140,   11,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,

In [104]:
y_test.value_counts()

IndicadorDepressao
0    1604
1    1596
Name: count, dtype: int64

In [105]:
y_train.value_counts()

IndicadorDepressao
1    6404
0    6396
Name: count, dtype: int64

In [106]:
input_Size = np.max(padded_train_Seq) + 1

input_Size

np.int32(10000)

In [107]:
# Define the model

model = Sequential()



# Use LSTM instead of SimpleRNN for better capturing long-term dependencies

model.add(LSTM(8, input_shape=(75,1), return_sequences=True))  



# Add dropout regularization

model.add(Dropout(0.8))



# Add another LSTM layer

model.add(LSTM(12)) 



# Add dropout regularization

model.add(Dropout(0.7))  



# Add a dense layer with ReLU activation

model.add(Dense(12, activation='relu'))  



# Output layer with sigmoid activation for binary classification

model.add(Dense(1, activation='sigmoid')) 

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['precision', 'recall', 'f1_score'])

  super().__init__(**kwargs)


In [108]:
model.summary()

In [109]:
ES=EarlyStopping(monitor='val_loss',patience=5,restore_best_weights=True)

LR=ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6)

In [110]:
model.fit(padded_train_Seq,y_train,validation_split=0.2, epochs=50,batch_size=32,callbacks=[ES,LR])

Epoch 1/50


ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("sequential_2_1/Cast:0", shape=(32, 75), dtype=float32). Expected shape (None, 50, 100), but input has incompatible shape (32, 75)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(32, 75), dtype=int32)
  • training=True
  • mask=None

In [None]:
_,precision,recall,_=model.evaluate(padded_test_Seq,y_test)

In [None]:
precision

In [None]:
recall

In [None]:
from sklearn.metrics import classification_report



predicted_labels = model.predict(padded_test_Seq)

predicted_labels = (predicted_labels > 0.5).astype(int) 





report = classification_report(y_test, predicted_labels, target_names=['Negative', 'Positive'])



print(report)

In [82]:
def predict_sentiment(new_text):

    new_text = new_text.lower()  

    new_text = remove_URL(new_text)

    new_text= remove_stopwords(new_text)

    new_text= remove_ej(new_text)

    new_text= remove_punc(new_text)

    sequence = tokenizer.texts_to_sequences([new_text])

    padded_sequence = pad_sequences(sequence, maxlen=maxlen, padding='post')

    prediction = model.predict(padded_sequence)

    predicted_label = (prediction > 0.5).astype(int)

    if predicted_label == 1 :
        print('person is depressed')
    else:
        print('person is not depressed')

    return predicted_label, prediction

In [None]:
predict_sentiment("eu quero viver com você")