In [1]:
import gensim
import nltk
import numpy as np
import os
import pandas as pd
import random
from nltk.corpus import stopwords
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

nltk.download("stopwords")
os.chdir("../")
random.seed(42)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/wk3user3/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load data

In [2]:
train_df = pd.read_csv("data/processed/split/counsel-chat-best-answer-train.csv")
train_df.head()

Unnamed: 0,questionID,questionTitle,questionText,topic,answerText,upvotes,views
0,1,My apartment manager won't let me keep an emot...,I have been diagnosed with general anxiety and...,depression,"This can be a difficult situation. Typically,...",2,1026
1,3,Why do I feel like I don't belong anywhere?,There are many people willing to lovingly prov...,depression,I truly understand what you are saying. I want...,1,62
2,4,How can I help my girlfriend?,My girlfriend just quit drinking and she becam...,depression,You're probably not going to like my answer.Yo...,3,824
3,10,How do I stop feeling empty?,I don't know how else to explain it. All I can...,depression,Why do I feel empty?Feelings of emptiness—a la...,3,148
4,12,How can I get my husband to listen to my needs...,"I tried telling my husband I was depressed, an...",depression,"Oh dear.From what you write, your husband does...",1,240


In [3]:
topics = [
    topic 
    for topic, count in train_df.groupby("topic").count()["questionText"].to_dict().items()
    if count > 20
]
topics

['anxiety',
 'depression',
 'family-conflict',
 'intimacy',
 'parenting',
 'relationships',
 'self-esteem']

In [4]:
test_df = pd.read_csv("data/processed/split/counsel-chat-best-answer-test.csv")
test_df.head()

Unnamed: 0,questionID,questionTitle,questionText,topic,answerText,upvotes,views
0,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,It is very common for people to have multiple ...,3,1971
1,2,I feel like my mother doesn't support me,My mother is combative with me when I say I do...,depression,Do you live with your mom and have constant in...,2,187
2,11,Why am I experiencing dfficulty maintaining an...,A few years ago I was making love to my wife w...,depression,When I'm working with men with this type of si...,1,194
3,14,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,depression,"Hello, and thank you for your question and see...",2,5423
4,16,How can I stop treating people badly?,I feel like I'm so alone. I treat people horri...,depression,Give yourself a little more credit for self-ob...,1,1029


### Process data

In [5]:
train_df["text"] = train_df["questionText"].apply(lambda x: list(filter(
    lambda word: word not in stopwords.words("english"), gensim.utils.simple_preprocess(x)
)))
train_df[["questionText", "text", "topic"]].head()

Unnamed: 0,questionText,text,topic
0,I have been diagnosed with general anxiety and...,"[diagnosed, general, anxiety, depression, fami...",depression
1,There are many people willing to lovingly prov...,"[many, people, willing, lovingly, provide, hom...",depression
2,My girlfriend just quit drinking and she becam...,"[girlfriend, quit, drinking, became, really, d...",depression
3,I don't know how else to explain it. All I can...,"[know, else, explain, say, feel, empty, feel, ...",depression
4,"I tried telling my husband I was depressed, an...","[tried, telling, husband, depressed, ignored, ...",depression


In [6]:
test_df["text"] = test_df["questionText"].apply(lambda x: list(filter(
    lambda word: word not in stopwords.words("english"), gensim.utils.simple_preprocess(x)
)))
test_df[["questionText", "text", "topic"]].head()

Unnamed: 0,questionText,text,topic
0,I have so many issues to address. I have a his...,"[many, issues, address, history, sexual, abuse...",depression
1,My mother is combative with me when I say I do...,"[mother, combative, say, want, talk, depressio...",depression
2,A few years ago I was making love to my wife w...,"[years, ago, making, love, wife, known, reason...",depression
3,I'm going through some things with my feelings...,"[going, things, feelings, barely, sleep, nothi...",depression
4,I feel like I'm so alone. I treat people horri...,"[feel, like, alone, treat, people, horribly, b...",depression


### Word to Vec

In [7]:
word2vec = gensim.models.Word2Vec(
    train_df.text, vector_size=100, window=5, min_count=2
)
words = word2vec.wv.index_to_key
print(len(words))
words[:10]

1257


['feel',
 'like',
 'know',
 'get',
 'want',
 'time',
 'years',
 'boyfriend',
 'really',
 'always']

In [8]:
word2vec.wv.most_similar('family')

[('like', 0.9962604641914368),
 ('get', 0.9955736398696899),
 ('people', 0.9952930808067322),
 ('feel', 0.9952247142791748),
 ('always', 0.9951643347740173),
 ('love', 0.9951603412628174),
 ('years', 0.9951544404029846),
 ('would', 0.9950367212295532),
 ('time', 0.995022177696228),
 ('even', 0.9949990510940552)]

In [9]:
sum(train_df.text.apply(len) > 20)

233

In [10]:
train_vec = [np.array(
    [word2vec.wv[word] for word in text if word in words[:]]
) for text in train_df.text]
print([vec.shape for vec in train_vec[:10]])

[(16, 100), (21, 100), (12, 100), (11, 100), (22, 100), (6, 100), (22, 100), (29, 100), (4, 100), (51, 100)]


In [11]:
train_vec = np.array(
    [np.mean(vec, axis=0) for vec in train_vec]
)
print(train_vec.shape)

(445, 100)


In [12]:
test_vec = [np.array(
    [word2vec.wv[word] for word in text if word in words[:]]
) for text in test_df.text]
print([vec.shape for vec in test_vec[:10]])

[(20, 100), (21, 100), (31, 100), (21, 100), (21, 100), (13, 100), (19, 100), (30, 100), (11, 100), (12, 100)]


In [13]:
test_vec = np.array(
    [np.mean(vec, axis=0) for vec in test_vec]
)
print(test_vec.shape)

(112, 100)


### XG Boost

In [14]:
topics = ["others"] + topics
train_labels = np.array([
    topics.index(topic) if topic in topics else 0
    for topic in train_df["topic"] 
])
test_labels = np.array([
    topics.index(topic) if topic in topics else 0
    for topic in test_df["topic"] 
])

In [15]:
xgb_classifier = XGBClassifier(
    objective='multi:softmax',
    random_state=42
)
xgb_classifier.fit(train_vec, train_labels)

In [16]:
test_preds = xgb_classifier.predict(test_vec)
test_acc = np.mean(test_preds == test_labels)
print(f"Accuracy: {test_acc}")

Accuracy: 0.24107142857142858


### NB Classifier

In [21]:
metrics = {}
for t in range(len(topics)):
    train_topics = [1 if topic == t else 0 for topic in train_labels]
    train_weights = [train_topics.count(0) if topic == 1 else train_topics.count(1) for topic in train_topics]
    gnb = GaussianNB()
    gnb.fit(train_vec, train_topics, sample_weight=train_weights)
    test_topics = [1 if topic == t else 0 for topic in test_labels]
    test_preds = gnb.predict(test_vec)
    metrics[t] = {
        "accuracy": accuracy_score(test_topics, test_preds),
        "precision": precision_score(test_topics, test_preds),
        "recall": recall_score(test_topics, test_preds),
        "f1": f1_score(test_topics, test_preds)
    }

In [22]:
pd.DataFrame.from_dict(metrics, orient="index")

Unnamed: 0,accuracy,precision,recall,f1
0,0.5625,0.323529,0.297297,0.309859
1,0.723214,0.16,0.285714,0.205128
2,0.357143,0.168831,0.619048,0.265306
3,0.526786,0.057692,0.428571,0.101695
4,0.410714,0.102941,0.583333,0.175
5,0.642857,0.057143,0.222222,0.090909
6,0.357143,0.092105,0.7,0.162791
7,0.348214,0.026667,1.0,0.051948
