In [1]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read in y datasets.
# The label files are squeezed from single-column DataFrames to 1-D Series:
# several scikit-learn estimators expect y with shape (n_samples,) and raise a
# DataConversionWarning for column vectors, which the blanket
# warnings.filterwarnings('ignore') in this notebook would otherwise hide.
# squeeze("columns") leaves the frame untouched if a file has several columns.
yTrain = pd.read_csv('yTrain.csv').squeeze("columns")
yTest = pd.read_csv('yTest.csv').squeeze("columns")

# Target names for models (display names used in the classification reports).
# NOTE(review): presumably ordered to match the sorted label values
# (e.g. 0 -> Neutral, 1 -> Depression) -- confirm against the label encoding.
target_names = ['Neutral', 'Depression']

In [3]:
# Multinomial Naive Bayes
def MNB(xTrain, xTest, yTrain, yTest, dataset, target_names):
    """Fit a default MultinomialNB and print its classification report.

    Args:
        xTrain: Training features handed to ``fit``.
        xTest: Test features handed to ``predict``.
        yTrain: Training labels handed to ``fit``.
        yTest: True test labels the predictions are scored against.
        dataset: Name of the feature set; interpolated into the printed header.
        target_names: Class display names forwarded to ``classification_report``.

    Returns:
        None. A header line and the report are printed.
    """
    # scikit-learn's fit() returns the estimator itself, so it can be chained.
    classifier = MultinomialNB().fit(xTrain, yTrain)
    predicted_labels = classifier.predict(xTest)
    print(f"MultinomialNB on {dataset} dataset:")
    report = classification_report(yTest, predicted_labels, target_names=target_names)
    print(report)

# Bernoulli Naive Bayes
def BNB(xTrain, xTest, yTrain, yTest, dataset, target_names):
    """Fit a default BernoulliNB and print its classification report.

    Args:
        xTrain: Training features handed to ``fit``.
        xTest: Test features handed to ``predict``.
        yTrain: Training labels handed to ``fit``.
        yTest: True test labels the predictions are scored against.
        dataset: Name of the feature set; interpolated into the printed header.
        target_names: Class display names forwarded to ``classification_report``.

    Returns:
        None. A header line and the report are printed.
    """
    model = BernoulliNB()
    model.fit(xTrain, yTrain)
    test_predictions = model.predict(xTest)
    print(f"BernoulliNB on {dataset} dataset:")
    print(
        classification_report(
            yTest,
            test_predictions,
            target_names=target_names,
        )
    )

# KNN
def KNN(xTrain, xTest, yTrain, yTest, dataset, target_names):
    """Fit a Euclidean-metric KNeighborsClassifier and print its report.

    Unlike the other model helpers in this notebook, the feature inputs are
    converted with ``.to_numpy()`` before being passed to the estimator, so
    ``xTrain`` and ``xTest`` must provide that method.

    Args:
        xTrain: Training features; converted via ``.to_numpy()`` for ``fit``.
        xTest: Test features; converted via ``.to_numpy()`` for ``predict``.
        yTrain: Training labels handed to ``fit``.
        yTest: True test labels the predictions are scored against.
        dataset: Name of the feature set; interpolated into the printed header.
        target_names: Class display names forwarded to ``classification_report``.

    Returns:
        None. A header line and the report are printed.
    """
    classifier = KNeighborsClassifier(metric='euclidean')
    train_matrix = xTrain.to_numpy()
    classifier.fit(train_matrix, yTrain)
    test_matrix = xTest.to_numpy()
    neighbour_votes = classifier.predict(test_matrix)
    print(f"KNN on {dataset} dataset:")
    report = classification_report(yTest, neighbour_votes, target_names=target_names)
    print(report)

# Perceptron
def PERC(xTrain, xTest, yTrain, yTest, dataset, target_names):
    """Fit a Perceptron (random_state=42) and print its classification report.

    Args:
        xTrain: Training features handed to ``fit``.
        xTest: Test features handed to ``predict``.
        yTrain: Training labels handed to ``fit``.
        yTest: True test labels the predictions are scored against.
        dataset: Name of the feature set; interpolated into the printed header.
        target_names: Class display names forwarded to ``classification_report``.

    Returns:
        None. A header line and the report are printed.
    """
    # Fixed seed keeps the run repeatable; fit() returns the estimator itself.
    linear_model = Perceptron(random_state=42).fit(xTrain, yTrain)
    predicted = linear_model.predict(xTest)
    print(f"Perceptron on {dataset} dataset:")
    summary = classification_report(yTest, predicted, target_names=target_names)
    print(summary)

# Logistic Regression
def LR(xTrain, xTest, yTrain, yTest, dataset, target_names):
    """Fit a LogisticRegression and print its classification report.

    The estimator is built with ``max_iter=1000`` and ``random_state=42``.

    Args:
        xTrain: Training features handed to ``fit``.
        xTest: Test features handed to ``predict``.
        yTrain: Training labels handed to ``fit``.
        yTest: True test labels the predictions are scored against.
        dataset: Name of the feature set; interpolated into the printed header.
        target_names: Class display names forwarded to ``classification_report``.

    Returns:
        None. A header line and the report are printed.
    """
    settings = {"max_iter": 1000, "random_state": 42}
    regressor = LogisticRegression(**settings)
    regressor.fit(xTrain, yTrain)
    predicted_classes = regressor.predict(xTest)
    print(f"Logistic Regression on {dataset} dataset:")
    print(classification_report(yTest, predicted_classes, target_names=target_names))

# Decision Tree
def DT(xTrain, xTest, yTrain, yTest, dataset, target_names,
       max_depth=10, min_samples_leaf=50, random_state=42):
    """Fit a DecisionTreeClassifier and print its classification report.

    The tree hyperparameters were previously hard-coded; they are now keyword
    parameters whose defaults reproduce the original configuration, so
    existing six-argument calls behave exactly as before.

    Args:
        xTrain: Training features handed to ``fit``.
        xTest: Test features handed to ``predict``.
        yTrain: Training labels handed to ``fit``.
        yTest: True test labels the predictions are scored against.
        dataset: Name of the feature set; interpolated into the printed header.
        target_names: Class display names forwarded to ``classification_report``.
        max_depth: Forwarded to ``DecisionTreeClassifier(max_depth=...)``.
            Defaults to 10.
        min_samples_leaf: Forwarded to
            ``DecisionTreeClassifier(min_samples_leaf=...)``. Defaults to 50.
        random_state: Seed forwarded to the classifier. Defaults to 42.

    Returns:
        None. A header line and the report are printed.
    """
    dt = DecisionTreeClassifier(
        random_state=random_state,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
    )
    dt.fit(xTrain, yTrain)
    y_pred_dt = dt.predict(xTest)
    print(f"Decision Tree on {dataset} dataset:")
    print(classification_report(yTest, y_pred_dt, target_names=target_names))

# Random Forest
def RF(xTrain, xTest, yTrain, yTest, dataset, target_names):
    """Fit a RandomForestClassifier (random_state=42) and print its report.

    Args:
        xTrain: Training features handed to ``fit``.
        xTest: Test features handed to ``predict``.
        yTrain: Training labels handed to ``fit``.
        yTest: True test labels the predictions are scored against.
        dataset: Name of the feature set; interpolated into the printed header.
        target_names: Class display names forwarded to ``classification_report``.

    Returns:
        None. A header line and the report are printed.
    """
    # Fixed seed keeps the run repeatable; fit() returns the estimator itself.
    forest = RandomForestClassifier(random_state=42).fit(xTrain, yTrain)
    forest_predictions = forest.predict(xTest)
    print(f"Random Forest on {dataset} dataset:")
    report = classification_report(yTest, forest_predictions, target_names=target_names)
    print(report)

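# TODO: Convolutional Neural Network (CNN) - placeholder, no model defined here yet

# TODO: Recurrent Neural Network (RNN) - placeholder, no model defined here yet

# TODO: Hugging Face (Transformer) - placeholder, no model defined here yet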

In [4]:
# Binary Dataset: load the train/test feature tables from CSV.
binary_df_train = pd.read_csv('binary_df_train.csv')
binary_df_test = pd.read_csv('binary_df_test.csv')

# Other classifiers for this feature set (disabled; uncomment a line to run it):
#MNB(binary_df_train, binary_df_test, yTrain, yTest, 'binary', target_names)
#BNB(binary_df_train, binary_df_test, yTrain, yTest, 'binary', target_names)
#PERC(binary_df_train, binary_df_test, yTrain, yTest, 'binary', target_names)
#LR(binary_df_train, binary_df_test, yTrain, yTest, 'binary', target_names)
# Active run: decision tree on the binary features.
DT(
    xTrain=binary_df_train,
    xTest=binary_df_test,
    yTrain=yTrain,
    yTest=yTest,
    dataset='binary',
    target_names=target_names,
)

In [None]:
# Bag of Words Dataset: load the train/test feature tables from CSV.
bow_df_train = pd.read_csv('bow_df_train.csv')
bow_df_test = pd.read_csv('bow_df_test.csv')

# Other classifiers for this feature set (disabled; uncomment a line to run it):
#MNB(bow_df_train, bow_df_test, yTrain, yTest, 'bow', target_names)
#BNB(bow_df_train, bow_df_test, yTrain, yTest, 'bow', target_names)
#PERC(bow_df_train, bow_df_test, yTrain, yTest, 'bow', target_names)
#LR(bow_df_train, bow_df_test, yTrain, yTest, 'bow', target_names)
# Active run: decision tree on the bag-of-words features.
DT(
    xTrain=bow_df_train,
    xTest=bow_df_test,
    yTrain=yTrain,
    yTest=yTest,
    dataset='bow',
    target_names=target_names,
)

In [None]:
# Tfidf Dataset: load the train/test feature tables from CSV.
tfidf_df_train = pd.read_csv('tfidf_df_train.csv')
tfidf_df_test = pd.read_csv('tfidf_df_test.csv')

# Other classifiers for this feature set (disabled; uncomment a line to run it):
#MNB(tfidf_df_train, tfidf_df_test, yTrain, yTest, 'tfidf', target_names)
#BNB(tfidf_df_train, tfidf_df_test, yTrain, yTest, 'tfidf', target_names)
#PERC(tfidf_df_train, tfidf_df_test, yTrain, yTest, 'tfidf', target_names)
#LR(tfidf_df_train, tfidf_df_test, yTrain, yTest, 'tfidf', target_names)
# Active run: decision tree on the TF-IDF features.
DT(
    xTrain=tfidf_df_train,
    xTest=tfidf_df_test,
    yTrain=yTrain,
    yTest=yTest,
    dataset='tfidf',
    target_names=target_names,
)

In [None]:
# Hash Dataset: load the train/test feature tables from CSV.
hash_df_train = pd.read_csv('hash_df_train.csv')
hash_df_test = pd.read_csv('hash_df_test.csv')

# Other classifiers for this feature set (disabled; uncomment a line to run it).
# NOTE(review): there is no MNB call for this dataset -- presumably because
# hashed features can be negative, which MultinomialNB rejects; confirm.
#BNB(hash_df_train, hash_df_test, yTrain, yTest, 'hashing', target_names)
#PERC(hash_df_train, hash_df_test, yTrain, yTest, 'hashing', target_names)
#LR(hash_df_train, hash_df_test, yTrain, yTest, 'hashing', target_names)
# Active run: decision tree on the hashed features.
DT(
    xTrain=hash_df_train,
    xTest=hash_df_test,
    yTrain=yTrain,
    yTest=yTest,
    dataset='hashing',
    target_names=target_names,
)

In [None]:
# LDA Dataset: load the train/test feature tables from CSV.
lda_df_train = pd.read_csv('lda_df_train.csv')
lda_df_test = pd.read_csv('lda_df_test.csv')

# Other classifiers for this feature set (disabled; uncomment a line to run it):
#MNB(lda_df_train, lda_df_test, yTrain, yTest, 'LDA', target_names)
#BNB(lda_df_train, lda_df_test, yTrain, yTest, 'LDA', target_names)
#KNN(lda_df_train, lda_df_test, yTrain, yTest, 'LDA', target_names)
#PERC(lda_df_train, lda_df_test, yTrain, yTest, 'LDA', target_names)
#LR(lda_df_train, lda_df_test, yTrain, yTest, 'LDA', target_names)
# Active run: decision tree on the LDA features.
DT(
    xTrain=lda_df_train,
    xTest=lda_df_test,
    yTrain=yTrain,
    yTest=yTest,
    dataset='LDA',
    target_names=target_names,
)

In [4]:
# PCA Dataset: load the train/test feature tables from CSV.
pca_df_train = pd.read_csv('pca_df_train.csv')
pca_df_test = pd.read_csv('pca_df_test.csv')

# Other classifiers for this feature set (disabled; uncomment a line to run it).
# NOTE(review): there is no MNB call for this dataset -- presumably because
# PCA components can be negative, which MultinomialNB rejects; confirm.
#BNB(pca_df_train, pca_df_test, yTrain, yTest, 'PCA', target_names)
#KNN(pca_df_train, pca_df_test, yTrain, yTest, 'PCA', target_names)
#PERC(pca_df_train, pca_df_test, yTrain, yTest, 'PCA', target_names)
#LR(pca_df_train, pca_df_test, yTrain, yTest, 'PCA', target_names)
# Active run: decision tree on the PCA features.
DT(
    xTrain=pca_df_train,
    xTest=pca_df_test,
    yTrain=yTrain,
    yTest=yTest,
    dataset='PCA',
    target_names=target_names,
)

Decision Tree on PCA dataset:
              precision    recall  f1-score   support

     Neutral       0.94      0.97      0.96    479841
  Depression       0.94      0.88      0.91    231867

    accuracy                           0.94    711708
   macro avg       0.94      0.93      0.93    711708
weighted avg       0.94      0.94      0.94    711708

