------------------
 ## Suicide Sentiment Analysis Project 
 - Using TF-IDF As Feature Extraction
 - Using Some Classification models As RandomForest, LinearSVC, MultinomialNB
 - Using Some Preprocessing as Lemmatization, Removing Stop Words
 - Finally The Results are around 87% which is good. 

----------------

In [27]:
import pandas as pd
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import metrics

## Read Suicide_Detection File

In [28]:
Suicide = pd.read_csv('Suicide_Detection.csv')
data_split = np.array_split(Suicide, 20)
Suicide = data_split[0]
Suicide = Suicide.drop('Unnamed: 0',axis=1)

## Preparing For Stopword removal and lemmatization

In [29]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/omar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
X = Suicide.drop('class', axis=1)
y = Suicide['class']

# Text Pre Proceessing

In [31]:
# To remove emails
email_regex = r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'
regexes_to_remove = [email_regex, r'Subject:', r'Re:']

for i in range(0, len(X)):
    # removing all special charachter
    review = re.sub('[^a-zA-Z]', ' ', str(X['text'][i]))
    # make document as lowerCase
    review = review.lower()
    # splitting the documents into words for ex ['iam', 'omar']
    review = review.split()
    # make limmatization --> (change, changing, changes)---> (change)
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    # join the document agian
    review = ' '.join(review)
    
    # removing mails
    for r in regexes_to_remove:
        X['text'][i] = re.sub(r, '', review)


## Splitting Data

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Feature extraction

In [33]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)


X_tfidf_train = tfidf_vectorizer.fit_transform(X_train['text'])
X_tfidf_test = tfidf_vectorizer.transform(X_test['text'])

In [34]:
print(X_tfidf_train.shape)

(8122, 10000)


---------------
- As we see the no. of features very large so we need to make feature selection and feature scaling
------------

## Feature Scaling

In [35]:
scaler = MinMaxScaler()
# function to fit data(calculate the min and max) then transform data to it
X_norm = scaler.fit_transform(X_tfidf_train.toarray())
X_test_norm = scaler.transform(X_tfidf_test.toarray())


## Feature Selection

In [36]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

# We Can select any model but linearSVC has l1 norm penality which deals with sparse
lsvc = LinearSVC(C=100, penalty='l1', max_iter=500, dual=False)
lsvc.fit(X_norm, y_train)

# This function select the best features that has high weigh
fs = SelectFromModel(lsvc, prefit=True)
# This function redeuce X to the selected features
X_selection = fs.transform(X_norm)
X_test_selection = fs.transform(X_test_norm)



##  Using LinearSVC

In [40]:
lsvc = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
lsvc.fit(X_selection, y_train)
y_predict_1 = lsvc.predict(X_test_selection)
print(metrics.classification_report(y_test, y_predict_1, target_names=['Suicide', 'Non-Suicide']))

(3482,)
(3482,)
              precision    recall  f1-score   support

     Suicide       0.86      0.84      0.85      1728
 Non-Suicide       0.85      0.86      0.86      1754

    accuracy                           0.85      3482
   macro avg       0.85      0.85      0.85      3482
weighted avg       0.85      0.85      0.85      3482





## Using RandomForest

In [41]:
clf = RandomForestClassifier(max_depth=10)
clf.fit(X_selection, y_train)

RandomForestClassifier(max_depth=10)

In [43]:
y_predict_2 = clf.predict(X_test_selection)
print(metrics.classification_report(y_test, y_predict_2, target_names=['Suicide', 'Non-Suicide']))

              precision    recall  f1-score   support

     Suicide       0.79      0.91      0.85      1728
 Non-Suicide       0.89      0.77      0.83      1754

    accuracy                           0.84      3482
   macro avg       0.84      0.84      0.84      3482
weighted avg       0.84      0.84      0.84      3482



## Using Multinomial Naive Bayes

In [44]:
mb = MultinomialNB()
mb.fit(X_selection, y_train)
y_predict_3 = clf.predict(X_test_selection)
print(metrics.classification_report(y_test, y_predict_3, target_names=['Suicide', 'Non-Suicide']))

              precision    recall  f1-score   support

     Suicide       0.79      0.91      0.85      1728
 Non-Suicide       0.89      0.77      0.83      1754

    accuracy                           0.84      3482
   macro avg       0.84      0.84      0.84      3482
weighted avg       0.84      0.84      0.84      3482

