------------------
 ## Suicide Sentiment Analysis Project 
 - Using TF-IDF As Feature Extraction
 - Using Some Classification models As RandomForest, LinearSVC, MultinomialNB
 - Using Some Preprocessing as Lemmatization, Removing Stop Words
 - Finally,  The best Results in this notebook is 91%.
----------------

In [1]:
import pandas as pd
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import metrics

## Read Suicide_Detection File

In [2]:
Suicide = pd.read_csv('Suicide_Detection.csv')
data_split = np.array_split(Suicide, 20)
Suicide = data_split[0]
Suicide = Suicide.drop('Unnamed: 0',axis=1)

## Preparing For Stopword removal and lemmatization

In [3]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/saied/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
X = Suicide.drop('class', axis=1)
y = Suicide['class']

# Text Pre Proceessing

In [5]:
# To remove emails
email_regex = r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'
regexes_to_remove = [email_regex, r'Subject:', r'Re:']

for i in range(0, len(X)):
    # removing all special charachter
    review = re.sub('[^a-zA-Z]', ' ', str(X['text'][i]))
    # make document as lowerCase
    review = review.lower()
    # splitting the documents into words for ex ['iam', 'omar']
    review = review.split()
    # make limmatization --> (change, changing, changes)---> (change)
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    # join the document agian
    review = ' '.join(review)
    
    # removing mails
    for r in regexes_to_remove:
        X['text'][i] = re.sub(r, '', review)


<a id='another_cell'></a>
### -ttttttttt

[TF-IDF ](#another_cell)

## Splitting Data

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Feature extraction

- ### TF-IDF

In [7]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
tfidf_vectorizer_n12 = TfidfVectorizer(max_features=10000, ngram_range=(1,2))


X_tfidf_train = tfidf_vectorizer.fit_transform(X_train['text'])
X_tfidf_test = tfidf_vectorizer.transform(X_test['text'])

X_tfidf_train_n12= tfidf_vectorizer_n12.fit_transform(X_train['text'])
X_tfidf_test_n12=tfidf_vectorizer_n12.fit_transform(X_test['text'])

In [8]:
print(X_tfidf_train.shape)

(8122, 10000)


## Feature Scaling

---------------
- As we see the no. of features very large so we need to make feature selection and feature scaling
------------

## Feature Selection

In [9]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

# We Can select any model but linearSVC has l1 norm penality which deals with sparse
lsvc = LinearSVC(C=100, penalty='l1', max_iter=500, dual=False)
lsvc.fit(X_tfidf_train, y_train)

# This function select the best features that has high weigh
fs = SelectFromModel(lsvc, prefit=True)
# This function redeuce X to the selected features
X_selection = fs.transform(X_tfidf_train)
X_test_selection = fs.transform(X_tfidf_test)


lsvc.fit(X_tfidf_train_n12, y_train)
fs_n12 = SelectFromModel(lsvc, prefit=True)
X_selection_n12 = fs_n12.transform(X_tfidf_train_n12)
X_test_selection_n12 = fs_n12.transform(X_tfidf_test_n12)




##  Using LinearSVC

In [10]:
lsvc = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
lsvc.fit(X_selection, y_train)
y_predict = lsvc.predict(X_test_selection)

lsvc_n12 = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
lsvc_n12.fit(X_selection_n12,y_train)
y_predict_n12 = lsvc_n12.predict(X_test_selection_n12)
# print(metrics.classification_report(y_test, y_predict_1, target_names=['Suicide', 'Non-Suicide']))
linear_svm_tfidf_results=metrics.precision_recall_fscore_support(y_test, y_predict)
linear_svm_tfidf_n12_results=metrics.precision_recall_fscore_support(y_test, y_predict_n12)



In [11]:
 print(metrics.classification_report(y_test, y_predict, target_names=['Suicide', 'Non-Suicide']))

              precision    recall  f1-score   support

     Suicide       0.87      0.85      0.86      1756
 Non-Suicide       0.85      0.87      0.86      1726

    accuracy                           0.86      3482
   macro avg       0.86      0.86      0.86      3482
weighted avg       0.86      0.86      0.86      3482



In [12]:
linear_svm_tfidf_n12_results

(array([0.51930502, 0.53923767]),
 array([0.76594533, 0.27867903]),
 array([0.61895996, 0.36745607]),
 array([1756, 1726]))

## Using RandomForest

In [13]:
clf = RandomForestClassifier(max_depth=10)
clf.fit(X_selection, y_train)
y_predict_2 = clf.predict(X_test_selection)

clf.fit(X_selection_n12, y_train)
y_predict_n12_2 = clf.predict(X_test_selection_n12)

In [14]:
RandomForest_tfidf_results=metrics.precision_recall_fscore_support(y_test, y_predict)
RandomForest_tfidf_n12_results=metrics.precision_recall_fscore_support(y_test, y_predict_n12)
print(metrics.classification_report(y_test, y_predict_2, target_names=['Suicide', 'Non-Suicide']))

              precision    recall  f1-score   support

     Suicide       0.80      0.89      0.84      1756
 Non-Suicide       0.88      0.77      0.82      1726

    accuracy                           0.83      3482
   macro avg       0.84      0.83      0.83      3482
weighted avg       0.84      0.83      0.83      3482



In [15]:
RandomForest_tfidf_n12_results

(array([0.51930502, 0.53923767]),
 array([0.76594533, 0.27867903]),
 array([0.61895996, 0.36745607]),
 array([1756, 1726]))

## Using Multinomial Naive Bayes

In [19]:
mb = MultinomialNB()
mb.fit(X_selection, y_train)
y_predict_3 = mb.predict(X_test_selection)

mb.fit(X_selection_n12, y_train)
y_predict_n12_3 = mb.predict(X_test_selection_n12)
print(metrics.classification_report(y_test, y_predict_3, target_names=['Suicide', 'Non-Suicide']))

mb_tfidf_results=metrics.precision_recall_fscore_support(y_test, y_predict)
mb_tfidf_n12_results=metrics.precision_recall_fscore_support(y_test, y_predict_n12)

              precision    recall  f1-score   support

     Suicide       0.94      0.81      0.87      1756
 Non-Suicide       0.83      0.95      0.88      1726

    accuracy                           0.88      3482
   macro avg       0.89      0.88      0.88      3482
weighted avg       0.89      0.88      0.88      3482



In [20]:
mb_tfidf_n12_results

(array([0.51930502, 0.53923767]),
 array([0.76594533, 0.27867903]),
 array([0.61895996, 0.36745607]),
 array([1756, 1726]))

## Using Ensamble Learning 

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression,  SGDClassifier
from sklearn.svm import SVC

In [22]:
log_clf = LogisticRegression(solver="lbfgs")
rnd_clf = RandomForestClassifier(n_estimators=100)
svm_clf = SVC(gamma="scale", probability=True)
mb = MultinomialNB()
sgd = SGDClassifier(alpha=.0001, max_iter=50, loss='log',
                                       penalty="elasticnet", n_jobs=-1)

voting_clf = VotingClassifier(
estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf), ('mb', mb),('sgd', sgd)],
voting='soft')



In [23]:
voting_clf.fit(X_selection, y_train)
y_predict_4 = voting_clf.predict(X_test_selection)

voting_clf.fit(X_selection_n12, y_train)
y_predict_n12_4 = voting_clf.predict(X_test_selection_n12)

In [24]:
print(metrics.classification_report(y_test, y_predict_4, target_names=['Suicide', 'Non-Suicide']))
mb_tfidf_results=metrics.precision_recall_fscore_support(y_test, y_predict_4)
mb_tfidf_n12_results=metrics.precision_recall_fscore_support(y_test, y_predict_n12_4)

              precision    recall  f1-score   support

     Suicide       0.91      0.92      0.91      1756
 Non-Suicide       0.92      0.90      0.91      1726

    accuracy                           0.91      3482
   macro avg       0.91      0.91      0.91      3482
weighted avg       0.91      0.91      0.91      3482



In [25]:
mb_tfidf_n12_results

(array([0.50522952, 0.575     ]),
 array([0.99031891, 0.01332561]),
 array([0.6691035 , 0.02604757]),
 array([1756, 1726]))

## Using Bagging Method

In [26]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
bag_clf = BaggingClassifier(
DecisionTreeClassifier(), n_estimators=500,
max_samples=100, bootstrap=True, n_jobs=-1)
bag_clf.fit(X_selection, y_train)
y_pred_5 = bag_clf.predict(X_test_selection)

bag_clf.fit(X_selection_n12, y_train)
y_pred_n12_5 = bag_clf.predict(X_test_selection_n12)
print(metrics.classification_report(y_test, y_pred_5, target_names=['Suicide', 'Non-Suicide']))

              precision    recall  f1-score   support

     Suicide       0.80      0.87      0.83      1756
 Non-Suicide       0.85      0.78      0.82      1726

    accuracy                           0.82      3482
   macro avg       0.83      0.82      0.82      3482
weighted avg       0.83      0.82      0.82      3482



In [28]:
bag_tfidf_results=metrics.precision_recall_fscore_support(y_test, y_pred_5)
bag_tfidf_n12_results=metrics.precision_recall_fscore_support(y_test, y_pred_n12_5)