In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('combined-selftext.csv')
#df.head()

In [3]:
def str_join(df, sep, *cols):
   ...:     from functools import reduce
   ...:     return reduce(lambda x, y: x.astype(str).str.cat(y.astype(str), sep=sep), 
   ...:                   [df[col] for col in cols])
   ...: 

In [4]:
df['text'] = str_join(df," ", 'title', 'usertext')

In [5]:
import gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
STOPWORDS = STOPWORDS.union(set(['im', 'ive', 'ill', 'wa', 'ha', 'aint', 'thats', 'la', 'le', 'please', 'feel', 'rly', 'u', 'nan', 'emptypost']))

stop = STOPWORDS
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [6]:
del df['title']
del df['usertext']

In [7]:
df['text'].dropna(inplace=True)
# 2. Changing all text to lowercase
df['text_original'] = df['text']
df['text'] = [entry.lower() for entry in df['text']]
# 3. Tokenization-In this each entry in the corpus will be broken into set of words
df['text']= [word_tokenize(entry) for entry in df['text']]
# 4. Remove Stop words, Non-Numeric and perfoming Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

df.head()

Unnamed: 0,y,text,text_original
0,0,"[need, help, hi, know, phrase, situation, try,...",need help hi know phrase situation try life go...
1,1,"[feeling, overwhelmed, hopeless, depressed, pa...",feeling overwhelmed hopeless depressed past co...
2,0,"[matter, anymore, getting, worse, hi, know, de...",matter anymore getting worse hi know devastate...
3,1,"[tired, hearing, bullshit, shit, like, better,...",tired hearing bullshit shit like better purpos...
4,0,"[wish, wish, prettier, wish, like, burden, wis...",wish wish prettier wish like burden wish broke...


In [8]:
for index,entry in enumerate(df['text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    df.loc[index,'text_final'] = str(Final_words)

In [9]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(df['text_final'],df['y'],test_size=0.2,random_state=42)

In [10]:
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.fit_transform(y_test)

In [11]:
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(df['text_final'])
X_train = Tfidf_vect.transform(X_train)
X_test = Tfidf_vect.transform(X_test)

In [13]:
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [14]:
#Multinomial Naive Bayes

MNB = naive_bayes.MultinomialNB()
MNB.fit(X_train, y_train)
predictions_MNB = MNB.predict(X_test)

print(classification_report(y_test, predictions_MNB))
print("MultinomialNB Accuracy Score:",accuracy_score(predictions_MNB, y_test)*100)
print("MultinomialNB Precision Score:",precision_score(predictions_MNB, y_test)*100)
print("MultinomialNB Recall Score:",recall_score(predictions_MNB, y_test)*100)
print("MultinomialNB F1-score Score:",f1_score(predictions_MNB, y_test)*100)

              precision    recall  f1-score   support

           0       0.77      0.48      0.59       198
           1       0.59      0.84      0.69       177

    accuracy                           0.65       375
   macro avg       0.68      0.66      0.64       375
weighted avg       0.69      0.65      0.64       375

MultinomialNB Accuracy Score: 65.06666666666666
MultinomialNB Precision Score: 84.18079096045197
MultinomialNB Recall Score: 59.12698412698413
MultinomialNB F1-score Score: 69.46386946386947


In [15]:
#Support Vector Machine

from sklearn import svm
SVM = svm.SVC()
SVM.fit(X_train, y_train)
predictions_SVM = SVM.predict(X_test)

print(classification_report(y_test, predictions_SVM))
print("SVM Accuracy Score:",accuracy_score(predictions_SVM, y_test)*100)
print("SVM Precision Score:",precision_score(predictions_SVM, y_test)*100)
print("SVM Recall Score:",recall_score(predictions_SVM, y_test)*100)
print("SVM F1-score Score:",f1_score(predictions_SVM, y_test)*100)

              precision    recall  f1-score   support

           0       0.75      0.64      0.69       198
           1       0.65      0.76      0.70       177

    accuracy                           0.69       375
   macro avg       0.70      0.70      0.69       375
weighted avg       0.70      0.69      0.69       375

SVM Accuracy Score: 69.33333333333334
SVM Precision Score: 75.70621468926554
SVM Recall Score: 65.0485436893204
SVM F1-score Score: 69.9738903394256


In [16]:
#Random Forest

from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier()
RF.fit(X_train, y_train)
predictions_RF = RF.predict(X_test)

print(classification_report(y_test, predictions_RF))
print("RF Accuracy Score:",accuracy_score(predictions_RF, y_test)*100)
print("RF Precision Score:",precision_score(predictions_RF, y_test)*100)
print("RF Recall Score:",recall_score(predictions_RF, y_test)*100)
print("RF F1-score Score:",f1_score(predictions_RF, y_test)*100)

              precision    recall  f1-score   support

           0       0.71      0.57      0.63       198
           1       0.60      0.74      0.66       177

    accuracy                           0.65       375
   macro avg       0.66      0.65      0.65       375
weighted avg       0.66      0.65      0.65       375

RF Accuracy Score: 64.8
RF Precision Score: 74.01129943502825
RF Recall Score: 60.36866359447005
RF F1-score Score: 66.49746192893402


In [17]:
#Logistic Regression

from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train, y_train)
predictions_LR = LR.predict(X_test)

print(classification_report(y_test, predictions_LR))
print("Logistic Regression Accuracy Score:",accuracy_score(predictions_LR, y_test)*100)
print("Logistic Regression Precision Score:",precision_score(predictions_LR, y_test)*100)
print("Logistic Regression Recall Score:",recall_score(predictions_LR, y_test)*100)
print("Logistic Regression F1-score Score:",f1_score(predictions_LR, y_test)*100)

              precision    recall  f1-score   support

           0       0.74      0.64      0.68       198
           1       0.65      0.75      0.70       177

    accuracy                           0.69       375
   macro avg       0.69      0.69      0.69       375
weighted avg       0.70      0.69      0.69       375

Logistic Regression Accuracy Score: 69.06666666666666
Logistic Regression Precision Score: 75.14124293785311
Logistic Regression Recall Score: 64.8780487804878
Logistic Regression F1-score Score: 69.63350785340315


In [19]:
#Decision Tree

from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(X_train, y_train)
predictions_DT = DT.predict(X_test)

print(classification_report(y_test, predictions_DT))
print("Decision Tree Accuracy Score:",accuracy_score(predictions_DT, y_test)*100)
print("Decision Tree Precision Score:",precision_score(predictions_DT, y_test)*100)
print("Decision Tree Recall Score:",recall_score(predictions_DT, y_test)*100)
print("Decision Tree F1-Score:",f1_score(predictions_DT, y_test)*100)

              precision    recall  f1-score   support

           0       0.64      0.64      0.64       198
           1       0.60      0.60      0.60       177

    accuracy                           0.62       375
   macro avg       0.62      0.62      0.62       375
weighted avg       0.62      0.62      0.62       375

Decision Tree Accuracy Score: 62.133333333333326
Decision Tree Accuracy Score: 60.451977401129945
Decision Tree Accuracy Score: 59.77653631284916
Decision Tree Accuracy Score: 60.1123595505618


In [18]:
#Light Gradient Boosting

import lightgbm as lgb
LGB = lgb.LGBMClassifier()
LGB.fit(X_train, y_train)
predictions_LGB = LGB.predict(X_test)

print("LGB Accuracy Score:",accuracy_score(predictions_LGB, y_test)*100)
print("LGB Precision Score:",precision_score(predictions_LGB, y_test)*100)
print("LGB Recall Score:",recall_score(predictions_LGB, y_test)*100)
print("LGB F1-score Score:",f1_score(predictions_LGB, y_test)*100)

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


              precision    recall  f1-score   support

           0       0.73      0.63      0.67       198
           1       0.64      0.74      0.69       177

    accuracy                           0.68       375
   macro avg       0.68      0.68      0.68       375
weighted avg       0.69      0.68      0.68       375

LGB Accuracy Score: 68.0
LGB Precision Score: 74.01129943502825
LGB Recall Score: 63.90243902439025
LGB F1-score Score: 68.58638743455498


In [None]:
#!pip install mlxtend

In [22]:
from mlxtend.classifier import StackingClassifier
clf_stack = StackingClassifier(classifiers =[LGB, XGB, RF, SVM], meta_classifier = LR)
model_stack = clf_stack.fit(X_train, y_train) # training of stacked model
pred_stack = model_stack.predict(X_test)	 # predictions on test data using stacked model
acc_stack = accuracy_score(y_test, pred_stack)
precision_stack = precision_score(y_test, pred_stack)
recall_stack = recall_score(y_test, pred_stack)
f1_stack = f1_score(y_test, pred_stack)# evaluating accuracy
print('accuracy score of Stacked model:', acc_stack * 100)
print('precision score of Stacked model:', precision_stack * 100)
print('recall score of Stacked model:', recall_stack * 100)
print('f1 score of Stacked model:', f1_stack * 100)
print(classification_report(y_test,pred_stack))

accuracy score of Stacked model: 68.8
precision score of Stacked model: 64.28571428571429
recall score of Stacked model: 76.27118644067797
f1 score of Stacked model: 69.76744186046511
              precision    recall  f1-score   support

           0       0.75      0.62      0.68       198
           1       0.64      0.76      0.70       177

    accuracy                           0.69       375
   macro avg       0.69      0.69      0.69       375
weighted avg       0.70      0.69      0.69       375

