In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the dataset paths used later in this notebook do not start
# with /content/drive — confirm they resolve in the target environment.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
#Preprocessing function
def preprocessing(data_frame, stop_words=None):
    """Clean the ``Text`` column of ``data_frame`` in place and return the frame.

    Steps, in order: strip URLs, strip e-mail addresses / @mentions, strip
    hashtags, replace punctuation with spaces, lowercase, drop stopwords and
    collapse every run of whitespace (newlines included) to a single space.

    Args:
        data_frame: pandas DataFrame with a string column named ``Text``.
            The column is overwritten; non-string values (e.g. NaN) are kept.
        stop_words: optional iterable of lowercase words to drop. When omitted,
            NLTK's English stopword list is used.
            NOTE(review): that list is believed to contain negations such as
            "not"; confirm dropping them is acceptable for sentiment scoring.

    Returns:
        The same ``data_frame`` object, with ``Text`` cleaned.
    """
    if stop_words is None:
        # Imported locally under a private alias: a later notebook cell rebinds
        # the global name `stopwords` to a plain list, which would break a
        # global `stopwords.words(...)` lookup when this function is re-run.
        from nltk.corpus import stopwords as _nltk_stopwords
        stop_words = _nltk_stopwords.words('english')
    stop_set = set(stop_words)

    text = data_frame["Text"]

    # regex=True is spelled out because pandas >= 2.0 treats the pattern as a
    # literal string by default, which silently disables every rule below.
    # Removing URLs within the tweets
    text = text.str.replace(r'\s*https?://\S+(\s+|$)', ' ', regex=True)
    # Removing emails / mentions, hashtags and punctuation
    text = text.str.replace(r'\S*@\S*\s?', ' ', regex=True)
    text = text.str.replace(r'#\S*\s?', ' ', regex=True)
    text = text.str.replace(r'[^\w\s]+', ' ', regex=True)

    # Lowercase text so stopword matching is case-insensitive
    text = text.str.lower()

    def _drop_stopwords(value):
        # Work on whitespace-separated words (not characters); re-joining with
        # single spaces also removes newlines and leading/trailing blanks.
        if not isinstance(value, str):
            return value
        return " ".join(word for word in value.split() if word not in stop_set)

    data_frame["Text"] = text.apply(_drop_stopwords)

    return data_frame

# Datasets

### Sampling

In [None]:
def sampling(df, key1, key2, pos_label, neg_label, n_pos=3, n_neg=2):
  """Collect the first ``n_pos`` positive and ``n_neg`` negative rows of ``df``.

  Rows are scanned in order; a row is kept when its ``key2`` value equals
  ``pos_label`` (while positives are still needed) or ``neg_label`` (while
  negatives are still needed). Scanning stops once both quotas are met.

  Args:
    df: source DataFrame.
    key1: name of the column to carry over (e.g. the text).
    key2: name of the label column.
    pos_label: label value counted as positive.
    neg_label: label value counted as negative.
    n_pos: number of positive rows to collect (default 3).
    n_neg: number of negative rows to collect (default 2).

  Returns:
    A new DataFrame with columns ``[key1, key2]`` in that order; it has fewer
    rows than requested when ``df`` does not contain enough matches.
  """
  sample = []
  pos = n_pos
  neg = n_neg
  # zip over the columns iterates positionally, so the frame's index does not
  # have to be 0..n-1 (it is not after filtering or shuffling).
  for value, label in zip(df[key1], df[key2]):
    if pos <= 0 and neg <= 0:
      break
    # The two quotas are independent: negatives seen before the last positive
    # must not be skipped.
    if pos > 0 and label == pos_label:
      sample.append([value, label])
      pos -= 1
    elif neg > 0 and label == neg_label:
      sample.append([value, label])
      neg -= 1

  print(sample)
  # A list (not a set) keeps the column order deterministic and matched to the
  # [key1, key2] layout of each collected row.
  df_sample = pd.DataFrame(sample, columns=[key1, key2])
  return df_sample

### Twitter 10 000

In [None]:
# Twitter "10 000" dataset: load it and rename the columns to Text / Label.
Path_Twitter_1 = "/Datasets/Twitter/twitter-suicidal_data_10000.csv"
df = pd.read_csv(Path_Twitter_1).rename(
    columns={'tweet': 'Text', 'intention': 'Label'}
)

In [None]:
# Keep only the suicidal rows (Label == 1) and just the Text / Label columns.
# A boolean mask replaces the per-row Python loop and does not depend on the
# index being 0..n-1; the index is reset because later cells look rows up by
# integer label (e.g. df['Text'][i]).
suicidal_df = df.loc[df['Label'] == 1, ['Text', 'Label']].reset_index(drop=True)

In [None]:
# From here on `df` refers to the suicidal-only subset built above.
df = suicidal_df
df

### Twitter Tendency

In [None]:
# Load the first 17142 rows of the two relevant columns, rename them to the
# notebook-wide Text / Label names, then clean the text.
# NOTE(review): this path has a different root than the other dataset paths
# in the notebook — confirm it is intended.
Path_Twitter_2 = "/MH ML project/Datasets/suicidal-tendency-tweets.csv"
df = preprocessing(
    pd.read_csv(
        Path_Twitter_2,
        encoding='latin-1',
        usecols=['tweet', 'intention'],
        nrows=17142,
    ).rename(columns={'tweet': 'Text', 'intention': 'Label'})
)
df.head()

In [None]:
# Keep only the suicidal rows (Label == 1) and just the Text / Label columns.
# A boolean mask replaces the per-row Python loop and does not depend on the
# index being 0..n-1; the index is reset because later cells look rows up by
# integer label (e.g. df['Text'][i]).
suicidal_df = df.loc[df['Label'] == 1, ['Text', 'Label']].reset_index(drop=True)
df = suicidal_df
df

### Reddit SNS

In [None]:
# Reddit suicide / non-suicide posts: first 20000 rows, text and class only,
# renamed to the notebook-wide Text / Label column names.
Reddit_path = "/Datasets/Reddit_non suicide  suicide/Suicide_Detection.csv"
df = pd.read_csv(
    Reddit_path,
    encoding='latin-1',
    usecols=['text', 'class'],
    nrows=20000,
).rename(columns={'text': 'Text', 'class': 'Label'})

# Encode the string classes as integers; an unexpected class raises KeyError.
label_dict = {'suicide': 1, 'non-suicide': 0}
df['Label'] = df['Label'].apply(lambda raw_label: label_dict[raw_label])

df

In [None]:
# Keep only the suicidal rows (Label == 1) and just the Text / Label columns.
# A boolean mask replaces the per-row Python loop and does not depend on the
# index being 0..n-1; the index is reset because later cells look rows up by
# integer label (e.g. df['Text'][i]).
suicidal_df = df.loc[df['Label'] == 1, ['Text', 'Label']].reset_index(drop=True)
df = suicidal_df
df

# DATA sampling

In [None]:
# The frame in scope uses the renamed columns 'Text' / 'Label' with integer
# labels (1 = suicide, 0 = non-suicide); the previous keys "text" / "label"
# do not exist in it and raised a KeyError.
# NOTE(review): `df` was filtered to Label == 1 in the cell above, so no
# negative rows will be found here — sample from the unfiltered frame if both
# classes are wanted.
df_sample = sampling(df, "Text", "Label", 1, 0)

In [None]:
# Shuffle the sampled rows; the fixed seed makes the order reproducible
# across kernel restarts.
df_sample = df_sample.sample(frac=1, random_state=42)

In [None]:
# Show the shuffled sample.
df_sample

# Analyses

In [None]:
from textblob import TextBlob

## Sentiment analysis 1: polarity and subjectivity

In [None]:
# Uncomment if textblob is not installed (run this before the cell that imports TextBlob):
# !pip install textblob

In [None]:
# TextBlob sentiment per text: one column for polarity, one for subjectivity.
df['polarity'] = df['Text'].map(lambda doc: TextBlob(doc).polarity)
df['subjective'] = df['Text'].map(lambda doc: TextBlob(doc).subjectivity)

In [None]:
# Inspect the frame with the new 'polarity' and 'subjective' columns.
df

In [None]:
# Write the polarity / subjectivity table to the analysis output folder.
save_path = "/Datasets/analysis/"
df.to_csv(f"{save_path}Reddit_SD_polarity_subjective.csv")

## bigrams/trigrams

In [None]:
from nltk.corpus import stopwords
# WARNING: this rebinds the name `stopwords` from the nltk corpus reader to a
# plain list of English stopwords. The vectorizer cells below rely on the list;
# any code that still calls `stopwords.words(...)` after this point will fail
# unless the reader is imported again.
stopwords = stopwords.words('english')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Trigram counts with English stopwords excluded. For bigrams use instead:
# c_vec = CountVectorizer(stop_words=stopwords, ngram_range=(2,2)) ## bigrams
c_vec = CountVectorizer(ngram_range=(3, 3), stop_words=stopwords)
# Document-by-n-gram count matrix.
ngrams = c_vec.fit_transform(df['Text'])

In [None]:
# count frequency of ngrams
import numpy as np

# Total occurrences of each n-gram across all documents. Summing the sparse
# matrix directly avoids `.toarray()`, which materialises a dense
# (documents x n-grams) array and can exhaust memory on a large vocabulary.
count_values = np.asarray(ngrams.sum(axis=0)).ravel()

In [None]:
# list of ngrams
# vocabulary_ maps each n-gram to its column index in the count matrix.
vocab = c_vec.vocabulary_
# (frequency, n-gram) pairs, most frequent first.
ngram_rows = [(count_values[column], term) for term, column in vocab.items()]
ngram_rows.sort(reverse=True)
df_ngram = pd.DataFrame(ngram_rows).rename(
    columns={0: 'frequency', 1: 'bigram/trigram'}
)

In [None]:
# TextBlob sentiment of each n-gram string.
df_ngram['polarity'] = df_ngram['bigram/trigram'].map(
    lambda phrase: TextBlob(phrase).polarity
)
df_ngram['subjective'] = df_ngram['bigram/trigram'].map(
    lambda phrase: TextBlob(phrase).subjectivity
)

In [None]:
# Inspect the n-gram table, now including 'polarity' and 'subjective'.
df_ngram

In [None]:
# Write the n-gram table; switch the file name when running with bigrams.
save_path = "/Datasets/analysis/"
# df_ngram.to_csv(save_path+'Reddit_SD_bigrams.csv')
df_ngram.to_csv(f"{save_path}Reddit_SD_trigrams.csv")

In [None]:
# N-grams whose polarity is strictly positive, in table order. The boolean
# mask replaces the per-row loop and does not rely on a 0..n-1 index.
words = df_ngram.loc[df_ngram['polarity'] > 0, 'bigram/trigram'].tolist()

words

## Topic Detection

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import make_pipeline

# TF-IDF over bigrams and trigrams, factorised into 5 topics with NMF.
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords, ngram_range=(2,3))
# Fixed seed so the extracted topics are the same on every run.
nmf = NMF(n_components=5, random_state=42)
pipe = make_pipeline(tfidf_vectorizer, nmf)
pipe.fit(df['Text'])
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted features of every topic in ``model``.

    Args:
        model: fitted object exposing ``components_``; each row is iterated
            as one topic and must support ``argsort()``.
        feature_names: sequence indexed by column position, giving the
            display string of each feature.
        n_top_words: how many features to list per topic.

    Prints one ``Topic #i: a, b, c`` line per topic, then an empty line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        best_columns = weights.argsort()[::-1][:n_top_words]
        listed = ", ".join(feature_names[column] for column in best_columns)
        print(f"Topic #{topic_idx}: " + listed)
    print()
# `get_feature_names()` was removed from scikit-learn's vectorizers; prefer
# `get_feature_names_out()` and fall back only on versions that predate it.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, feature_names, n_top_words=3)

# Sentiment analysis (VADER)

In [None]:
# Fetch the VADER lexicon resource for the nltk sentiment analyzer used in the following cells.
nltk.download('vader_lexicon')

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
# Smoke test on a single row; the label 1000 must exist in df's index.
sia.polarity_scores(df['Text'][1000])

In [None]:
# One row per text: [text, neg, neu, pos, compound] from the polarity scores.
# Iterating the column directly (instead of df['Text'][i] over range(len))
# works for any index and avoids a label lookup per row.
sia = SentimentIntensityAnalyzer()
semantic_3 = []
for text in df['Text']:
  polarity_score = sia.polarity_scores(text)
  semantic_3.append([
      text,
      polarity_score['neg'],
      polarity_score['neu'],
      polarity_score['pos'],
      polarity_score['compound'],
  ])

In [None]:
# Rebinds `semantic_3` from the list of rows to a DataFrame with named columns.
semantic_3 = pd.DataFrame(semantic_3, columns=['Text', 'neg','neu','pos','compound'])
semantic_3

In [None]:
# Write the per-text neg / neu / pos / compound scores to the analysis folder.
save_path = "/Datasets/analysis/"
semantic_3.to_csv(f"{save_path}Reddit_SD_semantic_3.csv")

# Multi-class emotion analysis

In [None]:
!pip install -q transformers

In [None]:
from transformers import pipeline

# Try the emotion classifier on one post; return_all_scores=True asks for a
# score per label rather than only the top one.
data = df['Text'][3000]
classifier = pipeline(
    "text-classification",
    model='bhadresh-savani/distilbert-base-uncased-emotion',
    return_all_scores=True,
)
prediction = classifier(data)
print(prediction)

In [None]:
# prediction[0] holds the list of per-label score dicts for the input text.
prediction[0]

In [None]:
CHUNK_CHARS = 512  # maximum number of characters sent to the classifier at once
N_EMOTIONS = 6     # scores read per prediction, in the order the classifier returns them

classifier = pipeline("text-classification", model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)


def _chunk_text(text, size=CHUNK_CHARS):
  """Split `text` into consecutive pieces of at most `size` characters."""
  # Stepping by `size` never produces an empty trailing piece. The previous
  # range(int(len / size) + 1) did when len(text) was an exact multiple of
  # `size`, and that empty piece's prediction was averaged into the result.
  pieces = [text[start:start + size] for start in range(0, len(text), size)]
  # An empty text still yields one (empty) piece so it is classified once.
  return pieces or [text]


def _mean_emotion_scores(text):
  """Return the N_EMOTIONS scores for `text`, averaged over its chunks."""
  chunk_scores = []
  for chunk in _chunk_text(text):
    prediction = classifier(chunk)
    chunk_scores.append([prediction[0][i]['score'] for i in range(N_EMOTIONS)])
  # Column-wise mean; with a single chunk this is the chunk's own scores.
  return [sum(column) / len(chunk_scores) for column in zip(*chunk_scores)]


# One row per text: [text, score_0, ..., score_5]. Iterating the column
# directly works for any index (no df['Text'][i] label lookups).
semantic_6 = []
for text in df['Text']:
  semantic_6.append([text] + _mean_emotion_scores(text))

In [None]:
# Rebinds `semantic_6` from the list of rows to a DataFrame with named columns.
# NOTE(review): the column names assume the classifier returns its scores in
# the order sadness, joy, love, anger, fear, surprise — confirm.
semantic_6 = pd.DataFrame(semantic_6, columns=['Text', 'sadness', 'joy', 'love', 'anger', 'fear', 'surprise'])
semantic_6

In [None]:
# Write the per-text emotion scores to the analysis folder.
save_path = "/Datasets/analysis/"
semantic_6.to_csv(f"{save_path}Reddit_SD_semantic_6.csv")