In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('train_yangswei_85.csv')

In [3]:
data.head()

Unnamed: 0,text,label
0,How can i miss work if i work from home questi...,joy
1,How do you deal with being the only person in ...,sadness
2,I began my application to remote jobs yesterda...,joy
3,Anyone else question mine sit in the window si...,joy
4,What household chore did you get done today wh...,anger


In [5]:
label_counts = data['label'].value_counts()
label_counts

label
joy         53409
sadness     16033
anger       13183
fear         4621
surprise     1250
love          656
Name: count, dtype: int64

In [6]:
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import download

In [7]:
# Download necessary NLTK resources
download('punkt')  # For tokenization
download('stopwords')  # For stopwords

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to C:\Users\minhd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\minhd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def preprocess_text(text):
    # lowercase 
    text = text.lower()

    tokens = word_tokenize(text)

    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    # lemmatization (using SpaCy)
    doc = nlp(" ".join(tokens))
    lemmatized_tokens = [token.lemma_ for token in doc]

    # return to string 
    return " ".join(lemmatized_tokens)

In [9]:
data['processed_text'] = data['text'].apply(preprocess_text)

In [10]:
data

Unnamed: 0,text,label,processed_text
0,How can i miss work if i work from home questi...,joy,miss work work home question way would office ...
1,How do you deal with being the only person in ...,sadness,deal person social circle wfh question family ...
2,I began my application to remote jobs yesterda...,joy,begin application remote job yesterday consist...
3,Anyone else question mine sit in the window si...,joy,anyone else question mine sit window sill desk...
4,What household chore did you get done today wh...,anger,household chore get do today wfh exclamation s...
...,...,...,...
89147,"14 Best Remote Job sites, hope this helps con...",joy,well remote job site hope help consider op hav...
89148,If you speak any of the following languages an...,joy,speak follow language english fluently let kno...
89149,What do you listen to while working remote que...,fear,listen work remote question youtube video back...
89150,Finally found a WFH position exclamation excla...,joy,finally find wfh position exclamation exclamat...


In [11]:
test_data = pd.read_csv('test_yangswei_85.csv')

In [12]:
test_data['processed_text'] = test_data['text'].apply(preprocess_text)

In [13]:
test_data

Unnamed: 0,text,label,processed_text
0,RTO is the new war on the middle class don't f...,joy,rto new war middle class fire tie package aver...
1,How do you continue with life outside of work ...,joy,continue life outside work mental exhaustion c...
2,Very desperate for a job would you know a pers...,fear,desperate job would know person come apply job...
3,What time do you start working most days quest...,joy,time start work day question set hour like get...
4,What are good job sites to find LEGIT remote w...,joy,good job site find legit remote work like dece...
...,...,...,...
22284,"Its been 6 months now, all hope lost in additi...",sadness,month hope lose addition research company othe...
22285,Do I clock in office hours even though I don’t...,joy,clock office hour even though enough work ques...
22286,Would you consider moving to a small town in O...,joy,would consider move small town oregon think ex...
22287,I tried to be a WFH employee and failed misera...,sadness,try wfh employee fail miserably tip hat every ...


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

X_train = data['processed_text']
y_train = data['label']

X_test = test_data['processed_text']
y_test = test_data['label']

In [15]:
import time
from sklearn.feature_extraction.text import CountVectorizer  # Import CountVectorizer

start_time = time.time()
# vect = CountVectorizer()  # Convert a collection of text documents to a matrix of token counts.
vect = CountVectorizer(min_df=5)  # Convert a collection of text documents to a matrix of token counts.

print('fit')
vect.fit(X_train)

print('transform') # actual counting
X_train_tok = vect.transform(X_train)

end_time = time.time()
print('done')

# The two steps above can be condensed in a single step that processes train
# data only once.

X_test_tok = vect.transform(X_test)

fit
transform
done


In [16]:
elapsed_time = end_time - start_time
print(f"Time employed: {elapsed_time:.2f} seconds")
len(vect.vocabulary_)

Time employed: 4.25 seconds


9344

In [17]:
vect.vocabulary_

{'miss': 5210,
 'work': 9207,
 'home': 3837,
 'question': 6544,
 'way': 9035,
 'would': 9245,
 'office': 5645,
 'sick': 7435,
 'consider': 1675,
 'go': 3481,
 'sitting': 7488,
 'desk': 2174,
 'day': 2016,
 'due': 2520,
 'exhaustion': 2879,
 'say': 7153,
 'feel': 3026,
 'well': 9071,
 'need': 5424,
 'elaborate': 2617,
 'reason': 6658,
 'deal': 2034,
 'person': 6028,
 'social': 7606,
 'circle': 1364,
 'wfh': 9089,
 'family': 2972,
 'extra': 2929,
 'time': 8341,
 'give': 3456,
 'could': 1796,
 'see': 7250,
 'result': 6923,
 'isolation': 4384,
 'begin': 750,
 'application': 404,
 'remote': 6801,
 'job': 4444,
 'yesterday': 9305,
 'consistently': 1682,
 'apply': 406,
 'comment': 1529,
 'back': 621,
 'hire': 3808,
 'long': 4829,
 'take': 8118,
 'pm': 6170,
 'taxis': 8153,
 'winwin': 9167,
 'anyone': 371,
 'else': 2645,
 'mine': 5177,
 'sit': 7482,
 'window': 9159,
 'lap': 4601,
 'chair': 1260,
 'towards': 8417,
 'front': 3290,
 'half': 3635,
 'try': 8525,
 'walk': 8987,
 'across': 86,
 'keyb

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2  # Import SelectKBest and chi2
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.svm import LinearSVC

opt_pipeline = Pipeline([
    ('sel', SelectKBest(chi2)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', LinearSVC())  # learning algorithm
])

In [20]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform  

budget = 10

r_param_grid = [
    {'sel__k': [200, 1000, 2000, 3000, 4000, 'all'], 'learner__C': loguniform(0.01, 100)},  
]

optr_search = RandomizedSearchCV(opt_pipeline, r_param_grid, cv=5, n_iter=budget, n_jobs=3, verbose=True).fit(X_train_tok, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [21]:
optr_search.best_params_

{'learner__C': 2.6087754300783432, 'sel__k': 3000}

In [23]:
X_test_tok

<22289x9344 sparse matrix of type '<class 'numpy.int64'>'
	with 422187 stored elements in Compressed Sparse Row format>

In [22]:
optr_predictions = optr_search.best_estimator_.predict(X_test_tok)

from sklearn.metrics import classification_report, accuracy_score

print(f"Accuracy: {accuracy_score(y_test, optr_predictions)}")

print("Classification Report:")
print(classification_report(y_test, optr_predictions))

Accuracy: 0.7771995154560546
Classification Report:
              precision    recall  f1-score   support

       anger       0.72      0.51      0.60      3296
        fear       0.67      0.39      0.49      1155
         joy       0.80      0.93      0.86     13353
        love       0.73      0.40      0.51       164
     sadness       0.76      0.63      0.69      4008
    surprise       0.76      0.68      0.72       313

    accuracy                           0.78     22289
   macro avg       0.74      0.59      0.64     22289
weighted avg       0.77      0.78      0.76     22289



## Non-linear SVM

In [25]:
from sklearn.svm import SVC

# create the pipeline with SVM and linear kernel
opt_pipeline_linear = Pipeline([
    ('sel', SelectKBest(chi2)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', SVC(kernel='linear'))  
])

# define the parameter grid for RandomizedSearchCV
r_param_grid_linear = [
    {'sel__k': [2000, 3000], 
     'learner__C': loguniform(0.01, 100),  # C parameter of the SVM
     'learner__gamma': ['scale', 'auto']}  
]

In [26]:
budget = 2  
optr_search_linear= RandomizedSearchCV(opt_pipeline_linear, r_param_grid_linear, cv=2, n_iter=budget, n_jobs=-1, verbose=True)
optr_search_linear.fit(X_train_tok, y_train)

best_model_linear = optr_search_linear.best_estimator_

Fitting 2 folds for each of 2 candidates, totalling 4 fits


In [27]:
optr_search_linear.best_params_

{'learner__C': 0.27976480806881115, 'learner__gamma': 'auto', 'sel__k': 3000}

In [28]:
y_pred_linear = best_model_linear.predict(X_test_tok)
print(f"Accuracy: {accuracy_score(y_test, y_pred_linear)}")

print("Classification Report:")
print(classification_report(y_test, y_pred_linear))

Accuracy: 0.7391538427026785
Classification Report:
              precision    recall  f1-score   support

       anger       0.72      0.37      0.49      3296
        fear       0.68      0.20      0.31      1155
         joy       0.74      0.95      0.83     13353
        love       0.64      0.26      0.37       164
     sadness       0.78      0.53      0.63      4008
    surprise       0.78      0.52      0.62       313

    accuracy                           0.74     22289
   macro avg       0.72      0.47      0.54     22289
weighted avg       0.74      0.74      0.71     22289

