In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('train_t5.csv')

In [3]:
data.head()

Unnamed: 0,text,label
0,Does anyone use their own computer question as...,joy
1,WFH is getting to be...ehhhh. thats not even a...,joy
2,everything is awesome …Are annual employee sat...,joy
3,Remote Workers Beware: US Entrepreneur Warns ...,fear
4,Teams & Slack Users: Please just ask the quest...,sadness


In [4]:
label_counts = data['label'].value_counts()
label_counts

label
joy         43463
anger       29667
fear        12054
sadness     11217
surprise     1008
love          487
Name: count, dtype: int64

In [5]:
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import download

In [6]:
# Download necessary NLTK resources
download('punkt')  # For tokenization
download('stopwords')  # For stopwords

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to C:\Users\minhd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\minhd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def preprocess_text(text):
    # lowercase 
    text = text.lower()

    tokens = word_tokenize(text)

    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    # lemmatization (using SpaCy)
    doc = nlp(" ".join(tokens))
    lemmatized_tokens = [token.lemma_ for token in doc]

    # return to string 
    return " ".join(lemmatized_tokens)

In [8]:
data['processed_text'] = data['text'].apply(preprocess_text)

In [9]:
data

Unnamed: 0,text,label,processed_text
0,Does anyone use their own computer question as...,joy,anyone use computer question remote software e...
1,WFH is getting to be...ehhhh. thats not even a...,joy,wfh get ehhhh that s even downside good part s...
2,everything is awesome …Are annual employee sat...,joy,everything awesome annual employee satisfactio...
3,Remote Workers Beware: US Entrepreneur Warns ...,fear,remote worker beware we entrepreneur warn work...
4,Teams & Slack Users: Please just ask the quest...,sadness,team slack user please ask question stop leave...
...,...,...,...
97891,money20k for 2 days per week in office or sta...,joy,day per week office stay remote question way s...
97892,RTO and Environmemtalism i'm even game for reg...,anger,rto environmemtalism even game regional requir...
97893,What purchase have you made that’s made a big ...,fear,purchase make make big impact wfh environment ...
97894,Why are people obsessed with return to office ...,love,people obsess return office question question ...


In [10]:
test_data = pd.read_csv('test_t5.csv')

In [11]:
test_data['processed_text'] = test_data['text'].apply(preprocess_text)

In [12]:
test_data

Unnamed: 0,text,label,processed_text
0,Winter Blues and WFH question i think the sudd...,anger,winter blue wfh question think sudden shift bu...
1,New Workspace i saw some of your other posts a...,joy,new workspace see post renovation go incredibl...
2,Hard to mentally unwind… i go for a long walk ...,joy,hard mentally go long walk work help shift wor...
3,Would you leave 150k for 90k depends on your e...,joy,would leave depend expense saving
4,There’s no magic formula to get remote work so...,fear,magic formula get remote work worry nothing go...
...,...,...,...
24469,My biggest gripe is people who just message yo...,anger,big gripe people message team say hi question ...
24470,WFH exception denied caregiving that requires ...,anger,wfh exception deny caregiving require home kin...
24471,Would you take money15k less/year for WFH que...,joy,would take wfh question yes grad school put fo...
24472,“Losers question” yeah frey has been a shithea...,anger,loser question yeah frey shithead long time late


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

X_train = data['processed_text']
y_train = data['label']

X_test = test_data['processed_text']
y_test = test_data['label']

In [14]:
import time
from sklearn.feature_extraction.text import CountVectorizer  # Import CountVectorizer

start_time = time.time()
# vect = CountVectorizer()  # Convert a collection of text documents to a matrix of token counts.
vect = CountVectorizer(min_df=5)  # Convert a collection of text documents to a matrix of token counts.

print('fit')
vect.fit(X_train)

print('transform') # actual counting
X_train_tok = vect.transform(X_train)

end_time = time.time()
print('done')

# The two steps above can be condensed in a single step that processes train
# data only once.

X_test_tok = vect.transform(X_test)

fit
transform
done


In [15]:
elapsed_time = end_time - start_time
print(f"Time employed: {elapsed_time:.2f} seconds")
len(vect.vocabulary_)

Time employed: 4.49 seconds


9760

In [16]:
vect.vocabulary_

{'anyone': 395,
 'use': 9190,
 'computer': 1701,
 'question': 6833,
 'remote': 7105,
 'software': 7951,
 'engineer': 2865,
 'absolutely': 23,
 'believe': 803,
 'invest': 4566,
 'ideal': 4214,
 'hardware': 3889,
 'office': 5911,
 'since': 7789,
 'company': 1651,
 'issue': 4612,
 'laptop': 4837,
 'still': 8205,
 'desk': 2306,
 'can': 1214,
 'imagine': 4253,
 'work': 9621,
 'wfh': 9501,
 'get': 3642,
 'ehhhh': 2766,
 'that': 8621,
 'even': 2978,
 'downside': 2596,
 'good': 3706,
 'part': 6174,
 'see': 7574,
 'people': 6263,
 'bad': 671,
 'interact': 4506,
 'do': 2530,
 'not': 5807,
 'care': 1253,
 'everything': 2989,
 'awesome': 632,
 'annual': 370,
 'employee': 2827,
 'satisfaction': 7457,
 'survey': 8401,
 'really': 6938,
 'anonymous': 375,
 'move': 5580,
 'internally': 4523,
 'find': 3258,
 'worker': 9627,
 'beware': 830,
 'we': 9451,
 'entrepreneur': 2899,
 'warn': 9421,
 'philippines': 6331,
 'latin': 4853,
 'america': 323,
 'well': 9483,
 'job': 4671,
 'probably': 6657,
 'willing': 

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2  # Import SelectKBest and chi2
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.svm import LinearSVC

opt_pipeline = Pipeline([
    ('sel', SelectKBest(chi2)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', LinearSVC())  # learning algorithm
])

In [18]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform  

budget = 10

r_param_grid = [
    {'sel__k': [200, 1000, 2000, 3000, 4000, 'all'], 'learner__C': loguniform(0.01, 100)},  
]

optr_search = RandomizedSearchCV(opt_pipeline, r_param_grid, cv=5, n_iter=budget, n_jobs=3, verbose=True).fit(X_train_tok, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [19]:
optr_search.best_params_

{'learner__C': 11.356982605508229, 'sel__k': 4000}

In [20]:
X_test_tok

<24474x9760 sparse matrix of type '<class 'numpy.int64'>'
	with 463526 stored elements in Compressed Sparse Row format>

In [21]:
optr_predictions = optr_search.best_estimator_.predict(X_test_tok)

from sklearn.metrics import classification_report, accuracy_score

print(f"Accuracy: {accuracy_score(y_test, optr_predictions)}")

print("Classification Report:")
print(classification_report(y_test, optr_predictions))

Accuracy: 0.7365367328593609
Classification Report:
              precision    recall  f1-score   support

       anger       0.71      0.71      0.71      7417
        fear       0.66      0.51      0.57      3014
         joy       0.77      0.85      0.81     10865
        love       0.40      0.22      0.28       122
     sadness       0.74      0.66      0.70      2804
    surprise       0.67      0.63      0.65       252

    accuracy                           0.74     24474
   macro avg       0.66      0.60      0.62     24474
weighted avg       0.73      0.74      0.73     24474



## Non-linear SVM

In [22]:
from sklearn.svm import SVC

# create the pipeline with SVM and linear kernel
opt_pipeline_linear = Pipeline([
    ('sel', SelectKBest(chi2)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', SVC(kernel='linear'))  
])

# define the parameter grid for RandomizedSearchCV
r_param_grid_linear = [
    {'sel__k': [2000, 3000], 
     'learner__C': loguniform(0.01, 100),  # C parameter of the SVM
     'learner__gamma': ['scale', 'auto']}  
]

In [23]:
budget = 2  # Number of iterations
optr_search_linear= RandomizedSearchCV(opt_pipeline_linear, r_param_grid_linear, cv=2, n_iter=budget, n_jobs=-1, verbose=True)
optr_search_linear.fit(X_train_tok, y_train)

best_model_linear = optr_search_linear.best_estimator_

Fitting 2 folds for each of 2 candidates, totalling 4 fits


In [24]:
optr_search_linear.best_params_

{'learner__C': 4.4287938947563505, 'learner__gamma': 'scale', 'sel__k': 3000}

In [25]:
y_pred_linear = best_model_linear.predict(X_test_tok)
print(f"Accuracy: {accuracy_score(y_test, y_pred_linear)}")

print("Classification Report:")
print(classification_report(y_test, y_pred_linear))

Accuracy: 0.736414153795865
Classification Report:
              precision    recall  f1-score   support

       anger       0.69      0.72      0.71      7417
        fear       0.65      0.51      0.57      3014
         joy       0.78      0.83      0.81     10865
        love       0.58      0.24      0.34       122
     sadness       0.75      0.68      0.71      2804
    surprise       0.69      0.63      0.66       252

    accuracy                           0.74     24474
   macro avg       0.69      0.60      0.63     24474
weighted avg       0.73      0.74      0.73     24474

