In [2]:
import pandas as pd
import numpy as np
import re
import string
import emoji
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler


In [3]:
# Carregar o dataset

df=pd.read_csv('sentiment_tweets3.csv')

df.head(10)

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0
5,624,so sleepy. good times tonight though,0
6,701,"@SilkCharm re: #nbn as someone already said, d...",0
7,808,23 or 24ï¿½C possible today. Nice,0
8,1193,nite twitterville workout in the am -ciao,0
9,1324,"@daNanner Night, darlin'! Sweet dreams to you",0


In [4]:
# Verificar o tamanho do dataset
df.shape

(10314, 3)

In [5]:
# Verificar se há valores nulos
df.isna().sum()

Index                        0
message to examine           0
label (depression result)    0
dtype: int64

In [6]:
# Renomear as colunas
df.columns=['Indice','Texto','IndicadorDepressao']
df.head(10)


Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0
5,624,so sleepy. good times tonight though,0
6,701,"@SilkCharm re: #nbn as someone already said, d...",0
7,808,23 or 24ï¿½C possible today. Nice,0
8,1193,nite twitterville workout in the am -ciao,0
9,1324,"@daNanner Night, darlin'! Sweet dreams to you",0


In [7]:
# Verificar o tipo de dados de cada coluna
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10314 entries, 0 to 10313
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Indice              10314 non-null  int64 
 1   Texto               10314 non-null  object
 2   IndicadorDepressao  10314 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 241.9+ KB


In [8]:
# Verificar a quantidade de valores únicos na coluna 'Texto'
df['Texto'].nunique()

10282

In [9]:
# Converter o texto para minúsculas
df['Texto']=df['Texto'].str.lower()
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat need to send 'em to my accountant tomo...,0
4,540,add me on myspace!!! myspace.com/lookthunder,0
5,624,so sleepy. good times tonight though,0
6,701,"@silkcharm re: #nbn as someone already said, d...",0
7,808,23 or 24ï¿½c possible today. nice,0
8,1193,nite twitterville workout in the am -ciao,0
9,1324,"@dananner night, darlin'! sweet dreams to you",0


In [10]:
# Remover URLs
def remove_URL(text):
    return re.sub(r'http\S+|www\S' , '',text)

df['Texto'] = df['Texto'].apply(remove_URL)
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga,0
2,220,@comeagainjen -,0
3,288,@lapcat need to send 'em to my accountant tomo...,0
4,540,add me on myspace!!! myspace.com/lookthunder,0
5,624,so sleepy. good times tonight though,0
6,701,"@silkcharm re: #nbn as someone already said, d...",0
7,808,23 or 24ï¿½c possible today. nice,0
8,1193,nite twitterville workout in the am -ciao,0
9,1324,"@dananner night, darlin'! sweet dreams to you",0


In [11]:
# Remover pontuação
stri=string.punctuation
stri 

def remove_punc(text):
    return text.translate(str.maketrans('','',string.punctuation))

df['Texto']=df['Texto'].apply(remove_punc)
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,just had a real good moment i missssssssss him...,0
1,217,is reading manga,0
2,220,comeagainjen,0
3,288,lapcat need to send em to my accountant tomorr...,0
4,540,add me on myspace myspacecomlookthunder,0
5,624,so sleepy good times tonight though,0
6,701,silkcharm re nbn as someone already said does ...,0
7,808,23 or 24ï¿½c possible today nice,0
8,1193,nite twitterville workout in the am ciao,0
9,1324,dananner night darlin sweet dreams to you,0


In [12]:
#importa o dicionário de abreviações e gírias comuns de chat/mensagens
from chatWords import chat_words

#Substitui abreviações e gírias comuns de chat/mensagens por suas formas completas.
def chat_word(text):
    for a,b in chat_words.items():
        text=text.replace(a,b)
    return text

df['Texto']=df['Texto'].apply(chat_word)

In [13]:
#Remover stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

df['Texto']=df['Texto'].apply(remove_stopwords)
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,real good moment missssssssss much,0
1,217,reading manga,0
2,220,comeagainjen,0
3,288,lapcat need send em accountant tomorrow oddly ...,0
4,540,add myspace myspacecomlookthunder,0
5,624,sleepy good times tonight though,0
6,701,silkcharm nbn someone already said fiber home ...,0
7,808,23 24ï¿½c possible today nice,0
8,1193,nite twitterville workout ciao,0
9,1324,dananner night darlin sweet dreams,0


In [14]:
# Remover emojis
def remove_ej(text):
    return emoji.demojize(text)

df['Texto'] = df['Texto'].apply(remove_ej)
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao
0,106,real good moment missssssssss much,0
1,217,reading manga,0
2,220,comeagainjen,0
3,288,lapcat need send em accountant tomorrow oddly ...,0
4,540,add myspace myspacecomlookthunder,0
5,624,sleepy good times tonight though,0
6,701,silkcharm nbn someone already said fiber home ...,0
7,808,23 24ï¿½c possible today nice,0
8,1193,nite twitterville workout ciao,0
9,1324,dananner night darlin sweet dreams,0


In [15]:
# Lematização
wordNet=WordNetLemmatizer()
def apply_lemmatization(text):
    words = text.split()
    return ' '.join([wordNet.lemmatize(word,pos='v') for word in words])

df['LematizaçãoTexto'] = df['Texto'].apply(apply_lemmatization)
df.head(10)

Unnamed: 0,Indice,Texto,IndicadorDepressao,LematizaçãoTexto
0,106,real good moment missssssssss much,0,real good moment missssssssss much
1,217,reading manga,0,read manga
2,220,comeagainjen,0,comeagainjen
3,288,lapcat need send em accountant tomorrow oddly ...,0,lapcat need send em accountant tomorrow oddly ...
4,540,add myspace myspacecomlookthunder,0,add myspace myspacecomlookthunder
5,624,sleepy good times tonight though,0,sleepy good time tonight though
6,701,silkcharm nbn someone already said fiber home ...,0,silkcharm nbn someone already say fiber home m...
7,808,23 24ï¿½c possible today nice,0,23 24ï¿½c possible today nice
8,1193,nite twitterville workout ciao,0,nite twitterville workout ciao
9,1324,dananner night darlin sweet dreams,0,dananner night darlin sweet dream


In [16]:
# Etapa 1: Vetorização do texto
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['LematizaçãoTexto'])
Y = df['IndicadorDepressao']

In [17]:
# Etapa 2: Separar dados para treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

In [18]:
# Etapa 3: Treinar o modelo Random Forest
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

In [19]:
# Etapa 4: Avaliar o modelo
y_pred = model.predict(X_test)
print("Relatório de Classificação:")
print(classification_report(y_test, y_pred))

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      2011
           1       0.84      0.97      0.90       568

    accuracy                           0.95      2579
   macro avg       0.91      0.96      0.93      2579
weighted avg       0.96      0.95      0.95      2579



In [20]:
# Etapa 5: Testar com uma nova frase
new_text = ["Today the world feels lighter, as if everything is finally in the right place."]
X_new = vectorizer.transform(new_text)
prediction = model.predict(X_new)
print(f"Frase: {new_text[0]}")
print("Classificação:", "Depressivo" if prediction[0] == 1 else "Não depressivo")

Frase: Today the world feels lighter, as if everything is finally in the right place.
Classificação: Não depressivo
