In [None]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, precision_recall_curve, auc
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read in y datasets
# nrows makes pandas stop parsing after the first 100000 data rows instead of
# loading the complete file and truncating it afterwards with .head().
yTrain = pd.read_csv('yTrain.csv', nrows=100000)
yTest = pd.read_csv('yTest.csv', nrows=100000)

# Target names for models (display names forwarded to classification_report)
# NOTE(review): assumes the sorted label values correspond to Neutral first,
# Depression second (e.g. 0/1) -- confirm against the label files.
target_names = ['Neutral', 'Depression']

In [None]:
# Multinomial Naive Bayes
def MNB(xTrain, xTest, yTrain, yTest, dataset, target_names):
    """Fit a MultinomialNB model and print its test-set evaluation.

    Args:
        xTrain: Training features passed to ``fit``.
        xTest: Test features passed to ``predict`` and ``predict_proba``.
        yTrain: Training labels passed to ``fit``.
        yTest: True test labels used for the report and the PR curve.
        dataset: Name of the feature set; only used in the printed header.
        target_names: Class display names forwarded to ``classification_report``.

    Returns:
        None. The classification report and the AUPRC are printed.
    """
    classifier = MultinomialNB().fit(xTrain, yTrain)

    # Hard labels feed the report; the second predict_proba column
    # (positive class) feeds the precision-recall curve.
    predicted_labels = classifier.predict(xTest)
    positive_scores = classifier.predict_proba(xTest)[:, 1]

    print(f"MultinomialNB on {dataset} dataset:")
    print(classification_report(yTest, predicted_labels, target_names=target_names))

    # The curve comes back as (precision, recall, thresholds); the area is
    # integrated with recall on the x-axis and precision on the y-axis.
    pr_curve = precision_recall_curve(yTest, positive_scores)
    pr_area = auc(pr_curve[1], pr_curve[0])
    print(f"AUPRC (Area Under Precision-Recall Curve): {pr_area:.4f}")

# Bernoulli Naive Bayes
def BNB(xTrain, xTest, yTrain, yTest, dataset, target_names):
    """Train a BernoulliNB classifier and print how it scores on the test set.

    Args:
        xTrain: Training features passed to ``fit``.
        xTest: Test features passed to ``predict`` and ``predict_proba``.
        yTrain: Training labels passed to ``fit``.
        yTest: True test labels used for the report and the PR curve.
        dataset: Name of the feature set; only used in the printed header.
        target_names: Class display names forwarded to ``classification_report``.

    Returns:
        None. Output goes to stdout.
    """
    estimator = BernoulliNB()
    estimator.fit(xTrain, yTrain)

    hard_predictions = estimator.predict(xTest)
    # Keep only the second probability column: the positive-class score.
    class_probabilities = estimator.predict_proba(xTest)
    positive_class_scores = class_probabilities[:, 1]

    # Header first, then the per-class precision/recall/F1 table.
    print(f"BernoulliNB on {dataset} dataset:")
    report_text = classification_report(yTest, hard_predictions, target_names=target_names)
    print(report_text)

    # Precision-recall curve and the area beneath it.
    pr_precision, pr_recall, _thresholds = precision_recall_curve(yTest, positive_class_scores)
    area_under_pr = auc(pr_recall, pr_precision)
    print(f"AUPRC (Area Under Precision-Recall Curve): {area_under_pr:.4f}")

# KNN
def KNN(xTrain, xTest, yTrain, yTest, dataset, target_names, k, metric):
    """Fit a k-nearest-neighbours classifier and print its test-set evaluation.

    Args:
        xTrain: Training features; must provide ``to_numpy()`` (e.g. a DataFrame).
        xTest: Test features; must provide ``to_numpy()`` (e.g. a DataFrame).
        yTrain: Training labels passed to ``fit``.
        yTest: True test labels used for the report and the PR curve.
        dataset: Name of the feature set; only used in the printed header.
        target_names: Class display names forwarded to ``classification_report``.
        k: Number of neighbours (``n_neighbors``).
        metric: Distance metric name passed to ``KNeighborsClassifier``.

    Returns:
        None. The classification report and the AUPRC are printed.
    """
    knn = KNeighborsClassifier(metric=metric, n_neighbors=k)
    knn.fit(xTrain.to_numpy(), yTrain)

    # The model is fitted on a plain array, so convert the test features once
    # and give the same array to both predict and predict_proba. Passing the
    # raw DataFrame to predict_proba mixed input types between fit and
    # inference and repeated the conversion work.
    test_matrix = xTest.to_numpy()
    y_pred_knn = knn.predict(test_matrix)
    y_pred_prob_knn = knn.predict_proba(test_matrix)[:, 1]  # Probabilities for the positive class

    # Print classification report
    print(f"KNN on {dataset} dataset:")
    print(classification_report(yTest, y_pred_knn, target_names=target_names))

    # Calculate precision, recall, and thresholds for AUPRC
    precision, recall, _ = precision_recall_curve(yTest, y_pred_prob_knn)

    # Calculate AUPRC (recall on the x-axis, precision on the y-axis)
    auprc = auc(recall, precision)
    print(f"AUPRC (Area Under Precision-Recall Curve): {auprc:.4f}")

# Perceptron
def PERC(xTrain, xTest, yTrain, yTest, dataset, target_names, eta0, max_iter, penalty, tol):
    """Fit a Perceptron and print its classification report on the test set.

    No AUPRC is printed here; only ``predict`` is used.

    Args:
        xTrain: Training features passed to ``fit``.
        xTest: Test features passed to ``predict``.
        yTrain: Training labels passed to ``fit``.
        yTest: True test labels used for the report.
        dataset: Name of the feature set; only used in the printed header.
        target_names: Class display names forwarded to ``classification_report``.
        eta0: Value forwarded as the Perceptron's ``eta0``.
        max_iter: Value forwarded as ``max_iter``.
        penalty: Value forwarded as ``penalty``.
        tol: Value forwarded as ``tol``.

    Returns:
        None. The report is printed.
    """
    hyperparameters = {
        "eta0": eta0,
        "max_iter": max_iter,
        "penalty": penalty,
        "tol": tol,
        "random_state": 42,  # fixed seed so repeated runs match
    }
    model = Perceptron(**hyperparameters).fit(xTrain, yTrain)
    predicted_labels = model.predict(xTest)

    print(f"Perceptron on {dataset} dataset:")
    report_text = classification_report(yTest, predicted_labels, target_names=target_names)
    print(report_text)

# Logistic Regression
def LR(xTrain, xTest, yTrain, yTest, dataset, target_names, penalty, tol, solver, max_iters):
    """Fit a LogisticRegression model and print its test-set evaluation.

    Args:
        xTrain: Training features passed to ``fit``.
        xTest: Test features passed to ``predict`` and ``predict_proba``.
        yTrain: Training labels passed to ``fit``.
        yTest: True test labels used for the report and the PR curve.
        dataset: Name of the feature set; only used in the printed header.
        target_names: Class display names forwarded to ``classification_report``.
        penalty: Value forwarded as ``penalty``.
        tol: Value forwarded as ``tol``.
        solver: Value forwarded as ``solver``.
        max_iters: Value forwarded as the estimator's ``max_iter``.

    Returns:
        None. The classification report and the AUPRC are printed.
    """
    settings = dict(
        penalty=penalty,
        tol=tol,
        solver=solver,
        max_iter=max_iters,  # note the singular keyword on the estimator
        random_state=42,
    )
    logit = LogisticRegression(**settings)
    logit.fit(xTrain, yTrain)

    label_predictions = logit.predict(xTest)
    # Second column of predict_proba: positive-class probability.
    positive_probability = logit.predict_proba(xTest)[:, 1]

    print(f"Logistic Regression on {dataset} dataset:")
    print(classification_report(yTest, label_predictions, target_names=target_names))

    # Area under the precision-recall curve.
    curve_precision, curve_recall, _unused = precision_recall_curve(yTest, positive_probability)
    pr_auc = auc(curve_recall, curve_precision)
    print(f"AUPRC (Area Under Precision-Recall Curve): {pr_auc:.4f}")

# Decision Tree
def DT(xTrain, xTest, yTrain, yTest, dataset, target_names, min_sample_split, min_samples_leaf, max_depth):
    """Fit a DecisionTreeClassifier and print its test-set evaluation.

    Args:
        xTrain: Training features passed to ``fit``.
        xTest: Test features passed to ``predict`` and ``predict_proba``.
        yTrain: Training labels passed to ``fit``.
        yTest: True test labels used for the report and the PR curve.
        dataset: Name of the feature set; only used in the printed header.
        target_names: Class display names forwarded to ``classification_report``.
        min_sample_split: Value forwarded as the tree's ``min_samples_split``.
        min_samples_leaf: Value forwarded as ``min_samples_leaf``.
        max_depth: Value forwarded as ``max_depth``.

    Returns:
        None. The classification report and the AUPRC are printed.
    """
    tree = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_sample_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
    ).fit(xTrain, yTrain)

    tree_labels = tree.predict(xTest)
    tree_scores = tree.predict_proba(xTest)[:, 1]  # positive-class column

    # Report on the hard predictions.
    print(f"Decision Tree on {dataset} dataset:")
    print(classification_report(yTest, tree_labels, target_names=target_names))

    # PR curve on the scores; element 0 is precision, element 1 is recall.
    pr_points = precision_recall_curve(yTest, tree_scores)
    area = auc(pr_points[1], pr_points[0])
    print(f"AUPRC (Area Under Precision-Recall Curve): {area:.4f}")

# Random Forest
def RF(xTrain, xTest, yTrain, yTest, dataset, target_names, max_depth, max_feature, min_samples_leaf, n_estimators):
    """Fit a RandomForestClassifier and print its test-set evaluation.

    Args:
        xTrain: Training features passed to ``fit``.
        xTest: Test features passed to ``predict`` and ``predict_proba``.
        yTrain: Training labels passed to ``fit``.
        yTest: True test labels used for the report and the PR curve.
        dataset: Name of the feature set; only used in the printed header.
        target_names: Class display names forwarded to ``classification_report``.
        max_depth: Value forwarded as ``max_depth``.
        max_feature: Value forwarded as the forest's ``max_features``.
        min_samples_leaf: Value forwarded as ``min_samples_leaf``.
        n_estimators: Value forwarded as ``n_estimators``.

    Returns:
        None. The classification report and the AUPRC are printed.
    """
    forest_options = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "max_features": max_feature,
        "min_samples_leaf": min_samples_leaf,
        "random_state": 42,
    }
    forest = RandomForestClassifier(**forest_options)
    forest.fit(xTrain, yTrain)

    forest_labels = forest.predict(xTest)
    # Positive-class probability is the second predict_proba column.
    forest_scores = forest.predict_proba(xTest)[:, 1]

    print(f"Random Forest on {dataset} dataset:")
    forest_report = classification_report(yTest, forest_labels, target_names=target_names)
    print(forest_report)

    # Precision-recall curve, then the area under it.
    prec, rec, _ignored = precision_recall_curve(yTest, forest_scores)
    pr_area = auc(rec, prec)
    print(f"AUPRC (Area Under Precision-Recall Curve): {pr_area:.4f}")

In [None]:
# Binary Dataset
# nrows stops parsing after the first 100000 data rows instead of loading the
# whole CSV and truncating it afterwards with .head().
binary_df_train = pd.read_csv('binary_df_train.csv', nrows=100000)
binary_df_test = pd.read_csv('binary_df_test.csv', nrows=100000)

# Hyperparameters are passed by keyword so each value is tied to its name.
MNB(binary_df_train, binary_df_test, yTrain, yTest, 'binary', target_names)
BNB(binary_df_train, binary_df_test, yTrain, yTest, 'binary', target_names)
PERC(binary_df_train, binary_df_test, yTrain, yTest, 'binary', target_names,
     eta0=0.01, max_iter=10, penalty=None, tol=0.01)
LR(binary_df_train, binary_df_test, yTrain, yTest, 'binary', target_names,
   penalty=None, tol=0.001, solver='sag', max_iters=30)
DT(binary_df_train, binary_df_test, yTrain, yTest, 'binary', target_names,
   min_sample_split=50, min_samples_leaf=30, max_depth=20)
RF(binary_df_train, binary_df_test, yTrain, yTest, 'binary', target_names,
   max_depth=None, max_feature='sqrt', min_samples_leaf=4, n_estimators=100)

In [None]:
# Bag of Words Dataset
# nrows stops parsing after the first 100000 data rows instead of loading the
# whole CSV and truncating it afterwards with .head().
bow_df_train = pd.read_csv('bow_df_train.csv', nrows=100000)
bow_df_test = pd.read_csv('bow_df_test.csv', nrows=100000)

# Hyperparameters are passed by keyword so each value is tied to its name.
MNB(bow_df_train, bow_df_test, yTrain, yTest, 'bow', target_names)
BNB(bow_df_train, bow_df_test, yTrain, yTest, 'bow', target_names)
PERC(bow_df_train, bow_df_test, yTrain, yTest, 'bow', target_names,
     eta0=1, max_iter=10, penalty=None, tol=0.001)
LR(bow_df_train, bow_df_test, yTrain, yTest, 'bow', target_names,
   penalty='l2', tol=0.001, solver='liblinear', max_iters=10)
DT(bow_df_train, bow_df_test, yTrain, yTest, 'bow', target_names,
   min_sample_split=30, min_samples_leaf=20, max_depth=20)
RF(bow_df_train, bow_df_test, yTrain, yTest, 'bow', target_names,
   max_depth=None, max_feature='log2', min_samples_leaf=2, n_estimators=100)

In [None]:
# Tfidf Dataset
# nrows stops parsing after the first 100000 data rows instead of loading the
# whole CSV and truncating it afterwards with .head().
tfidf_df_train = pd.read_csv('tfidf_df_train.csv', nrows=100000)
tfidf_df_test = pd.read_csv('tfidf_df_test.csv', nrows=100000)

# Hyperparameters are passed by keyword so each value is tied to its name.
MNB(tfidf_df_train, tfidf_df_test, yTrain, yTest, 'tfidf', target_names)
BNB(tfidf_df_train, tfidf_df_test, yTrain, yTest, 'tfidf', target_names)
PERC(tfidf_df_train, tfidf_df_test, yTrain, yTest, 'tfidf', target_names,
     eta0=0.00001, max_iter=10, penalty=None, tol=0.01)
LR(tfidf_df_train, tfidf_df_test, yTrain, yTest, 'tfidf', target_names,
   penalty=None, tol=1e-06, solver='sag', max_iters=10)
DT(tfidf_df_train, tfidf_df_test, yTrain, yTest, 'tfidf', target_names,
   min_sample_split=50, min_samples_leaf=20, max_depth=20)
RF(tfidf_df_train, tfidf_df_test, yTrain, yTest, 'tfidf', target_names,
   max_depth=None, max_feature='sqrt', min_samples_leaf=4, n_estimators=100)

In [None]:
# Hash Dataset
# nrows stops parsing after the first 100000 data rows instead of loading the
# whole CSV and truncating it afterwards with .head().
hash_df_train = pd.read_csv('hash_df_train.csv', nrows=100000)
hash_df_test = pd.read_csv('hash_df_test.csv', nrows=100000)

# NOTE(review): MNB is not run on this dataset -- presumably because the
# hashed features can be negative; confirm.
# Hyperparameters are passed by keyword so each value is tied to its name.
BNB(hash_df_train, hash_df_test, yTrain, yTest, 'hashing', target_names)
PERC(hash_df_train, hash_df_test, yTrain, yTest, 'hashing', target_names,
     eta0=0.01, max_iter=20, penalty='l2', tol=0.000001)
LR(hash_df_train, hash_df_test, yTrain, yTest, 'hashing', target_names,
   penalty=None, tol=0.001, solver='saga', max_iters=10)
DT(hash_df_train, hash_df_test, yTrain, yTest, 'hashing', target_names,
   min_sample_split=30, min_samples_leaf=10, max_depth=20)
RF(hash_df_train, hash_df_test, yTrain, yTest, 'hashing', target_names,
   max_depth=None, max_feature='sqrt', min_samples_leaf=4, n_estimators=100)

In [None]:
# LDA Dataset
# nrows stops parsing after the first 100000 data rows instead of loading the
# whole CSV and truncating it afterwards with .head().
lda_df_train = pd.read_csv('lda_df_train.csv', nrows=100000)
lda_df_test = pd.read_csv('lda_df_test.csv', nrows=100000)

# Hyperparameters are passed by keyword so each value is tied to its name.
MNB(lda_df_train, lda_df_test, yTrain, yTest, 'LDA', target_names)
BNB(lda_df_train, lda_df_test, yTrain, yTest, 'LDA', target_names)
KNN(lda_df_train, lda_df_test, yTrain, yTest, 'LDA', target_names,
    k=7, metric='manhattan')
PERC(lda_df_train, lda_df_test, yTrain, yTest, 'LDA', target_names,
     eta0=0.01, max_iter=10, penalty='l2', tol=0.01)
LR(lda_df_train, lda_df_test, yTrain, yTest, 'LDA', target_names,
   penalty=None, tol=1e-05, solver='sag', max_iters=10)
DT(lda_df_train, lda_df_test, yTrain, yTest, 'LDA', target_names,
   min_sample_split=50, min_samples_leaf=10, max_depth=10)
RF(lda_df_train, lda_df_test, yTrain, yTest, 'LDA', target_names,
   max_depth=5, max_feature=None, min_samples_leaf=4, n_estimators=100)

In [None]:
# PCA Dataset
# nrows stops parsing after the first 100000 data rows instead of loading the
# whole CSV and truncating it afterwards with .head().
pca_df_train = pd.read_csv('pca_df_train.csv', nrows=100000)
pca_df_test = pd.read_csv('pca_df_test.csv', nrows=100000)

# NOTE(review): MNB is not run on this dataset -- presumably because PCA
# components can be negative; confirm.
# Hyperparameters are passed by keyword so each value is tied to its name.
BNB(pca_df_train, pca_df_test, yTrain, yTest, 'PCA', target_names)
KNN(pca_df_train, pca_df_test, yTrain, yTest, 'PCA', target_names,
    k=13, metric='euclidean')
PERC(pca_df_train, pca_df_test, yTrain, yTest, 'PCA', target_names,
     eta0=1, max_iter=10, penalty='elasticnet', tol=0.01)
LR(pca_df_train, pca_df_test, yTrain, yTest, 'PCA', target_names,
   penalty=None, tol=1e-06, solver='sag', max_iters=10)
DT(pca_df_train, pca_df_test, yTrain, yTest, 'PCA', target_names,
   min_sample_split=10, min_samples_leaf=30, max_depth=20)
RF(pca_df_train, pca_df_test, yTrain, yTest, 'PCA', target_names,
   max_depth=None, max_feature='sqrt', min_samples_leaf=1, n_estimators=50)