------------------
 ## Suicide Sentiment Analysis Project 
 - Uses TF-IDF for feature extraction
 - Uses several classification models, such as RandomForest, LinearSVC and MultinomialNB
 - Uses preprocessing such as lemmatization and stop-word removal
 - The best result in this notebook is 91%.
----------------

In [3]:
! pip install neattext

In [4]:
import pandas as pd
import numpy as np 
from tabulate import tabulate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import metrics
nltk.download('vader_lexicon')
import neattext.functions as nfx
import warnings
warnings.filterwarnings('ignore')

## Read Suicide_Detection File

In [5]:
# Load the CSV and keep only the first third of its rows.
Suicide = pd.read_csv('../input/suicide-watch/Suicide_Detection.csv')

# Positional slicing yields the same rows as np.array_split(Suicide, 3)[0] (the
# first chunk takes the extra row when the length is not divisible by 3) without
# passing a DataFrame through NumPy, which relies on the deprecated
# DataFrame.swapaxes.
first_chunk_rows = -(-len(Suicide) // 3)  # ceil(len(Suicide) / 3)
Suicide = Suicide.iloc[:first_chunk_rows]

# Discard the 'Unnamed: 0' column (presumably a saved index -- confirm against the CSV).
Suicide = Suicide.drop(columns='Unnamed: 0')

## Preparing For Stopword removal and lemmatization

In [6]:
# Corpora used by the preprocessing step. WordNetLemmatizer reads the WordNet
# corpus and raises LookupError when it is not installed, so fetch it explicitly.
nltk.download('stopwords')
nltk.download('wordnet')
# Some NLTK releases also load the Open Multilingual WordNet alongside WordNet.
nltk.download('omw-1.4')

stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [7]:
# Separate the label column from the remaining (feature) columns.
y = Suicide['class']
X = Suicide.drop(columns=['class'])


# Text Preprocessing

In [8]:
# Patterns stripped from the raw text: e-mail addresses and mail-header markers.
email_regex = r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'
regexes_to_remove = [email_regex, r'Subject:', r'Re:']

# Build the stop-word set once instead of once per token.
stopword_set = set(stopwords)


def clean_text(text):
    """Normalise one document before vectorisation.

    Steps, in order:
      1. remove every pattern in ``regexes_to_remove`` from the raw text -- this
         has to happen first, because the patterns rely on '@', '.', ':' and
         upper-case letters that the later steps destroy;
      2. replace every non-letter character with a space;
      3. lower-case and split on whitespace;
      4. drop English stop words and lemmatize the remaining tokens;
      5. join the tokens back into a single space-separated string.

    Parameters
    ----------
    text : object
        The raw document; converted with ``str()`` so missing values do not fail.

    Returns
    -------
    str
        The cleaned document.
    """
    cleaned = str(text)
    for pattern in regexes_to_remove:
        # Feed each substitution into the next one; a space (not '') keeps the
        # words on either side of a removed match from being glued together.
        cleaned = re.sub(pattern, ' ', cleaned)
    cleaned = re.sub('[^a-zA-Z]', ' ', cleaned)
    tokens = cleaned.lower().split()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stopword_set
    )


# Assign the whole column at once: element-wise chained assignment
# (X['text'][i] = ...) is not guaranteed to write back into X.
X['text'] = X['text'].map(clean_text)


## Data Splitting 

In [9]:
# Hold out 30% of the rows for evaluation. The fixed seed makes the split
# repeatable, so re-running the notebook evaluates on the same test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Feature extraction

- ### TF-IDF

In [10]:
# Two TF-IDF vocabularies, each capped at 10,000 terms:
# unigrams only, and unigrams plus bigrams.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
tfidf_vectorizer_n12 = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)

# Unigram features: vocabulary learned from the training text, reused on the test text.
X_tfidf_train, X_tfidf_test = (
    tfidf_vectorizer.fit_transform(X_train['text']),
    tfidf_vectorizer.transform(X_test['text']),
)

# Unigram + bigram features, built the same way.
X_tfidf_train_n12, X_tfidf_test_n12 = (
    tfidf_vectorizer_n12.fit_transform(X_train['text']),
    tfidf_vectorizer_n12.transform(X_test['text']),
)

 - ### Bag of words

In [11]:
# Raw term counts; the vocabulary is learned from the training text only
# and then applied unchanged to the test text.
vectorizer = CountVectorizer()
X_bow_train, X_bow_test = (
    vectorizer.fit_transform(X_train['text']),
    vectorizer.transform(X_test['text']),
)


- ### VADER


In [12]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
def get_vador_scores(data):
    """Replace the 'text' column of ``data`` with its four VADER polarity scores.

    Every value of ``data['text']`` is converted with ``str()`` and passed to
    ``SentimentIntensityAnalyzer.polarity_scores``. The 'neg', 'neu', 'pos' and
    'compound' entries of the result become the columns 'neg_score',
    'neu_score', 'pos_score' and 'compound'.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'text' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``data``: its columns other than
        'text', followed by the four score columns.
    """
    analyzer = SIA()
    polarity = data['text'].apply(lambda txt: analyzer.polarity_scores(str(txt)))

    # Start from a copy without the raw text, then append one column per score.
    vador_df = data.drop('text', axis=1)
    score_columns = (
        ('neg', 'neg_score'),
        ('neu', 'neu_score'),
        ('pos', 'pos_score'),
        ('compound', 'compound'),
    )
    for score_key, column_name in score_columns:
        vador_df[column_name] = polarity.apply(lambda scores, key=score_key: scores[key])
    return vador_df


In [13]:
# VADER score features for the training split and the test split, in that order.
X_vador_train, X_vador_test = (
    get_vador_scores(split) for split in (X_train, X_test)
)

## Feature Selection

In [14]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel


def select_features(train_matrix, test_matrix, labels):
    """Reduce a feature matrix to the columns an L1-penalised LinearSVC keeps.

    The L1 penalty drives the weights of uninformative features to zero, which
    suits sparse text features; SelectFromModel then keeps the columns whose
    weight is above its threshold.

    A new LinearSVC is fitted on every call. SelectFromModel(prefit=True) holds
    a reference to the estimator it is given, so sharing one estimator between
    several selectors would invalidate the earlier selectors as soon as the
    estimator is refitted on a different feature matrix.

    Parameters
    ----------
    train_matrix : feature matrix used to fit the selector model.
    test_matrix : feature matrix with the same columns as ``train_matrix``.
    labels : training labels aligned with the rows of ``train_matrix``.

    Returns
    -------
    tuple
        ``(selector, reduced_train_matrix, reduced_test_matrix)``.
    """
    selector_model = LinearSVC(C=100, penalty='l1', max_iter=500, dual=False)
    selector_model.fit(train_matrix, labels)
    selector = SelectFromModel(selector_model, prefit=True)
    return selector, selector.transform(train_matrix), selector.transform(test_matrix)


# TF-IDF unigrams
fs, X_selection, X_test_selection = select_features(
    X_tfidf_train, X_tfidf_test, y_train)

# TF-IDF unigrams + bigrams
fs_n12, X_selection_n12, X_test_selection_n12 = select_features(
    X_tfidf_train_n12, X_tfidf_test_n12, y_train)

# Bag of words -- kept under its own name so the n-gram selector is not overwritten.
fs_bow, X_selection_bow, X_test_selection_bow = select_features(
    X_bow_train, X_bow_test, y_train)


### plotting results function 

In [15]:
import matplotlib.pyplot as plt
def plot_results(data):
    """Draw a grouped bar chart comparing the four feature sets on three metrics.

    Parameters
    ----------
    data : sequence of three rows
        One row per metric, in the order precision, recall, F1. Element 0 of
        each row is the row label (not read here); elements 1-4 are the scores
        for TF-IDF, TF-IDF n-grams, bag of words and VADER, in that order.

    The chart has one group of bars per metric and one bar per feature set.
    """
    barWidth = 0.15
    # (legend label, bar colour) for columns 1..4 of every row.
    series = [
        ('tfidf', 'r'),
        ('tfidf_n12', 'b'),
        ('bow', '#2d7f5e'),
        ('vador', '#9a7f5e'),
    ]
    metric_labels = ['precision', 'recall', 'f1-score']

    # Left-most bar position of each metric group.
    group_start = np.arange(len(metric_labels))

    fig, ax = plt.subplots()
    for series_idx, (label, colour) in enumerate(series):
        heights = [data[row][series_idx + 1] for row in range(len(metric_labels))]
        ax.bar(group_start + series_idx * barWidth, heights, color=colour,
               width=barWidth, edgecolor='white', label=label)

    # Centre each tick under its group: the bar centres of a group span
    # (len(series) - 1) * barWidth, so the middle is half of that.
    group_centre = group_start + barWidth * (len(series) - 1) / 2
    ax.set_xticks(group_centre)
    ax.set_xticklabels(metric_labels)
    ax.set_xlabel('group', fontweight='bold')
    ax.set_ylabel('score')

    # Create legend & show graphic
    ax.legend()
    ax.grid()
    plt.show()

##  Using LinearSVC

In [16]:
# A single L1-penalised linear SVM is refitted on each feature representation in
# turn; every prediction is taken before the next fit overwrites the model.
lsvc = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)

y_predict_tfidf = lsvc.fit(X_selection, y_train).predict(X_test_selection)
y_predict_tfidf_n12 = lsvc.fit(X_selection_n12, y_train).predict(X_test_selection_n12)
y_predict_bow = lsvc.fit(X_selection_bow, y_train).predict(X_test_selection_bow)
y_predict_vador = lsvc.fit(X_vador_train, y_train).predict(X_vador_test)

# (precision, recall, f-score, support) for each set of predictions.
(linear_svm_tfidf_results,
 linear_svm_tfidf_n12_results,
 linear_svm_bow_results,
 vador_svm_results) = (
    metrics.precision_recall_fscore_support(y_test, predicted)
    for predicted in (y_predict_tfidf, y_predict_tfidf_n12, y_predict_bow, y_predict_vador)
)

In [17]:
# Row 0 is the header; every later row is [metric name, one score per feature set].
# The trailing [0] reads the first entry of each metric array.
# NOTE(review): confirm which class that first entry corresponds to.
data1 = [['TF-IDF', 'TF-IDF 2-grams ', 'bag of words', 'vador']] + [
    [metric_name] + [
        scores[metric_idx][0]
        for scores in (linear_svm_tfidf_results, linear_svm_tfidf_n12_results,
                       linear_svm_bow_results, vador_svm_results)
    ]
    for metric_idx, metric_name in enumerate(('precision', 'recall', 'F1-score'))
]

In [18]:
# LinearSVC scores as a table; the first row of data1 supplies the column headers.
print(tabulate(data1, tablefmt='fancy_grid', headers='firstrow'))

In [19]:
# Plot the LinearSVC metric rows only (the header row is skipped).
plot_results(data=data1[1:])

## Using RandomForest

In [20]:
# Random forest with depth-limited trees. The fixed seed makes the fitted trees,
# and therefore the reported scores, repeatable across runs.
clf = RandomForestClassifier(max_depth=10, random_state=42)

# The same estimator is refitted on each feature representation in turn; every
# prediction is taken before the next fit overwrites the model.
y_predict_tfidf_2 = clf.fit(X_selection, y_train).predict(X_test_selection)
y_predict_tfidf_n12_2 = clf.fit(X_selection_n12, y_train).predict(X_test_selection_n12)
y_predict_bow_2 = clf.fit(X_selection_bow, y_train).predict(X_test_selection_bow)
y_predict_vador_2 = clf.fit(X_vador_train, y_train).predict(X_vador_test)

In [21]:
# (precision, recall, f-score, support) for each set of random-forest predictions.
(RandomForest_tfidf_results,
 RandomForest_tfidf_n12_results,
 RandomForest_bow_results,
 RandomForest_vador_results) = (
    metrics.precision_recall_fscore_support(y_test, predicted)
    for predicted in (y_predict_tfidf_2, y_predict_tfidf_n12_2,
                      y_predict_bow_2, y_predict_vador_2)
)

In [22]:
# Row 0 is the header; every later row is [metric name, one score per feature set].
# The trailing [0] reads the first entry of each metric array.
# NOTE(review): confirm which class that first entry corresponds to.
data2 = [['TF-IDF', 'TF-IDF 2-grams ', 'bag of words', 'vador']] + [
    [metric_name] + [
        scores[metric_idx][0]
        for scores in (RandomForest_tfidf_results, RandomForest_tfidf_n12_results,
                       RandomForest_bow_results, RandomForest_vador_results)
    ]
    for metric_idx, metric_name in enumerate(('precision', 'recall', 'F1-score'))
]

In [23]:
# Random-forest scores as a table; the first row of data2 supplies the column headers.
print(tabulate(data2, tablefmt='fancy_grid', headers='firstrow'))

In [24]:
# Plot the random-forest metric rows only (the header row is skipped).
plot_results(data=data2[1:])

## Using Ensemble Learning 

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression,  SGDClassifier
from sklearn.svm import SVC

In [26]:
# Candidate base estimators. The stochastic ones are seeded so that the
# ensemble's scores are repeatable across runs.
log_clf = LogisticRegression(solver="lbfgs")
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
# probability=True enables predict_proba, which soft voting requires.
svm_clf = SVC(gamma="scale", probability=True, random_state=42)
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss'; confirm
# against the installed version before fitting this estimator.
sgd = SGDClassifier(alpha=.0001, max_iter=50, loss='log',
                    penalty="elasticnet", n_jobs=-1, random_state=42)
decision_tree_clf = DecisionTreeClassifier(max_depth=3, random_state=42)

# Soft-voting ensemble of logistic regression, SVC and a shallow decision tree.
# rnd_clf and sgd are defined above but are not part of this ensemble.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('svc', svm_clf), ('dt', decision_tree_clf)],
    voting='soft')



In [27]:
# The voting ensemble is refitted on each feature representation in turn; every
# prediction is taken before the next fit overwrites the model.
y_predict_tfidf_4 = voting_clf.fit(X_selection, y_train).predict(X_test_selection)
y_predict_tfidf_n12_4 = voting_clf.fit(X_selection_n12, y_train).predict(X_test_selection_n12)
y_predict_bow_4 = voting_clf.fit(X_selection_bow, y_train).predict(X_test_selection_bow)
y_predict_vador_4 = voting_clf.fit(X_vador_train, y_train).predict(X_vador_test)

In [28]:
# (precision, recall, f-score, support) for each set of voting-ensemble predictions.
(mb_tfidf_results,
 mb_tfidf_n12_results,
 mb_bow_results,
 mb_vador_results) = (
    metrics.precision_recall_fscore_support(y_test, predicted)
    for predicted in (y_predict_tfidf_4, y_predict_tfidf_n12_4,
                      y_predict_bow_4, y_predict_vador_4)
)

In [29]:
# Row 0 is the header; every later row is [metric name, one score per feature set].
# The trailing [0] reads the first entry of each metric array.
# NOTE(review): confirm which class that first entry corresponds to.
data3 = [['TF-IDF', 'TF-IDF 2-grams ', 'bag of words', 'vador']] + [
    [metric_name] + [
        scores[metric_idx][0]
        for scores in (mb_tfidf_results, mb_tfidf_n12_results,
                       mb_bow_results, mb_vador_results)
    ]
    for metric_idx, metric_name in enumerate(('precision', 'recall', 'F1-score'))
]

In [30]:
# Voting-ensemble scores as a table; the first row of data3 supplies the column headers.
print(tabulate(data3, tablefmt='fancy_grid', headers='firstrow'))

In [31]:
# Plot the voting-ensemble metric rows only (the header row is skipped).
plot_results(data=data3[1:])

## Using Bagging Method

In [32]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Bagging ensemble: 500 decision trees, each fitted on a bootstrap sample of 100
# training rows. The fixed seed makes the resampling, and therefore the reported
# scores, repeatable across runs.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1, random_state=42)

# The same ensemble is refitted on each feature representation in turn; every
# prediction is taken before the next fit overwrites the model.
y_pred_5 = bag_clf.fit(X_selection, y_train).predict(X_test_selection)
y_pred_n12_5 = bag_clf.fit(X_selection_n12, y_train).predict(X_test_selection_n12)
y_pred_bow_5 = bag_clf.fit(X_selection_bow, y_train).predict(X_test_selection_bow)
y_pred_vador_5 = bag_clf.fit(X_vador_train, y_train).predict(X_vador_test)

In [33]:
# (precision, recall, f-score, support) for each set of bagging predictions.
(bag_tfidf_results,
 bag_tfidf_n12_results,
 bag_bow_results,
 bag_vador_results) = (
    metrics.precision_recall_fscore_support(y_test, predicted)
    for predicted in (y_pred_5, y_pred_n12_5, y_pred_bow_5, y_pred_vador_5)
)

In [34]:
# Row 0 is the header; every later row is [metric name, one score per feature set].
# The trailing [0] reads the first entry of each metric array.
# NOTE(review): confirm which class that first entry corresponds to.
data4 = [['TF-IDF', 'TF-IDF 2-grams ', 'bag of words', 'vador']] + [
    [metric_name] + [
        scores[metric_idx][0]
        for scores in (bag_tfidf_results, bag_tfidf_n12_results,
                       bag_bow_results, bag_vador_results)
    ]
    for metric_idx, metric_name in enumerate(('precision', 'recall', 'F1-score'))
]

In [35]:
# Bagging scores as a table; the first row of data4 supplies the column headers.
print(tabulate(data4, tablefmt='fancy_grid', headers='firstrow'))

In [36]:
# Plot the bagging metric rows only (the header row is skipped).
plot_results(data=data4[1:])