<div style="background-color:white; text-align:center; font-family:Arial, Helvetica, sans-serif; padding:50px;">
  <!-- Tytuł -->
  <div style="color:#993520; font-size:60px; font-weight:bold; margin-bottom:20px;">
    SENTIMENT ANALYSIS
  </div>

  <!-- Podtytuł -->
  <div style="color:#993520; font-size:35px; margin-bottom:40px;">
    Supervised sentiment analysis: feature extraction &amp; selection
  </div>

  <!-- Autor -->
  <div style="color:black; font-size:30px; margin-bottom:10px;">
    Maciej Świtała, PhD
  </div>

  <!-- Data / semestr -->
  <div style="color:black; font-size:30px; margin-bottom:50px;">
    Autumn 2025
  </div>

  <!-- Logo -->
  <div>
    <img src="img/wne-logo-new-en.jpg" alt="WNE Logo" style="max-width:400px; height:auto;">
  </div>
</div>


### 1. Intro

In [1]:
# !pip install pandas numpy matplotlib nltk scikit-learn

In [2]:
import pandas as pd  # for working with data in DataFrames
import numpy as np  # numerical operations and arrays

import matplotlib.pyplot as plt  # data visualization

import pickle  # data loading
import math  # mathematical functions
import ast

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

from collections import Counter  # counting occurrences of elements
from itertools import islice

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer # VADER algorithm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer, mean_squared_error

In [3]:
# Load the example dataset (already initially preprocessed): opinions on an
# individual McDonald's restaurant in the U.S., stored as a tab-separated file.
data = pd.read_csv(
    "data/nichecom-opinions-mcdonalds_cleaned.txt",
    sep="\t",
    encoding="utf-8",
)

In [4]:
# preview the first rows of the loaded dataset
data.head()

Unnamed: 0,Review,Rating,Position,Date,Review_cleaned
0,working for McDonald's is very unique you lear...,Rating 4 out of 5,Senior Employee,2024-09-30,"['working', 'mcdonalds', 'unique', 'learn', 'l..."
1,It have been great so far. The people are real...,Rating 3 out of 5,Entry Level Employee,2024-09-12,"['great', 'far', 'people', 'really', 'kind', '..."
2,The Mcdonalds that I work at is a very good jo...,Rating 3 out of 5,Manager / Director,2024-09-09,"['mcdonalds', 'work', 'good', 'job', 'problem'..."
3,While the workplace environment may not be per...,Rating 2 out of 5,Intern / Student Worker,2024-06-10,"['workplace', 'environment', 'may', 'perfect',..."
4,The overall experience of working at a McDonal...,Rating 3 out of 5,Other,2024-05-31,"['overall', 'experience', 'working', 'mcdonald..."


In [5]:
# (number of rows, number of columns) of the loaded dataset
data.shape

(472, 5)

In [6]:
# Let us simulate a continuous dependent variable resembling sentiment;
# for educational purposes it is the compound score obtained from the VADER algorithm.

# Build the analyzer once. Creating a SentimentIntensityAnalyzer inside vader()
# repeated the same set-up work for every single review passed through .apply().
_VADER_ANALYZER = SentimentIntensityAnalyzer()

def vader(text):
    """Return the VADER compound polarity score of one text.

    Parameters
    ----------
    text : str
        Raw (uncleaned) review text.

    Returns
    -------
    float
        The 'compound' entry of ``polarity_scores(text)``.
    """
    return _VADER_ANALYZER.polarity_scores(text)['compound']

data['y_continuous'] = data['Review'].apply(vader)

# Another option is binary classification: 1 for the highest rating
# ('Rating 5 out of 5') and 0 otherwise; created here for educational purposes.
data['y_binary'] = (data['Rating']=='Rating 5 out of 5').astype(int)

# However, in our example the most natural setting is a multiclass problem whose
# target is the number extracted from the 'Rating N out of 5' string:
data['y_multiclass'] = pd.to_numeric(data['Rating'].map(lambda x: x.split('Rating ')[1].split(' out of ')[0]))

# we will proceed further, preparing features for all three possible target variables

### 2. Data preprocessing

In [7]:
# Build underscore-joined n-grams from a token list for every n in a given range
def generate_ngrams(tokens, ngram_range=(1,3)):
    """Return all n-grams of `tokens` for n from ngram_range[0] to ngram_range[1].

    Parameters
    ----------
    tokens : list of str
        Tokens of one document, in order.
    ngram_range : tuple of int, default=(1, 3)
        Smallest and largest n (both inclusive).

    Returns
    -------
    list of str
        All n-grams ordered by n and then by position; the tokens of each
        n-gram are joined with '_' (e.g. 'good_first_job').
    """
    smallest_n, largest_n = ngram_range[0], ngram_range[1]

    # zip over `size` progressively shifted copies of the token list yields the
    # sliding windows of that length (for bigrams: tokens[0:], tokens[1:])
    return [
        '_'.join(window)
        for size in range(smallest_n, largest_n + 1)
        for window in zip(*(tokens[shift:] for shift in range(size)))
    ]

# data['Review_cleaned'] stores every token list as a string, so parse it back
# into a Python list first and then expand each document with its n-grams
parsed_token_lists = data['Review_cleaned'].apply(ast.literal_eval)
texts_with_ngrams = [
    generate_ngrams(token_list, ngram_range=(1,3))
    for token_list in parsed_token_lists
]

# after this step:
# - texts_with_ngrams[i] contains all unigrams, bigrams, and trigrams for document i
# - each n-gram is a single underscore-joined string (e.g., "okay_work", "good_first_job")

# show the n-grams of the first document
# print(texts_with_ngrams[0])

# for every token, count the number of documents it occurs in
# (np.unique keeps a single copy of each token per document)
freq_dict = Counter()
for ngram_doc in texts_with_ngrams:
    freq_dict.update(np.unique(ngram_doc))
# print('there are', len(freq_dict), 'unique tokens')

### 3. Train-test split

In [8]:
# Plain random 80/20 split for the continuous target: no `stratify` argument is
# passed, so similarity of the two subsets is only checked informally through
# the summary statistics of y printed below.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    texts_with_ngrams,
    data['y_continuous'],
    test_size=0.2,
    random_state=42,
)

# subset sizes
print('n. obs. in train:',len(X_train1))
print('n. obs. in test:',len(X_test1))

# distribution of the target in each subset
print('\ny_train:\n',y_train1.describe(),sep='')
print('\ny_test:\n',y_test1.describe(),sep='')

n. obs. in train: 377
n. obs. in test: 95

y_train:
count    377.000000
mean       0.537701
std        0.535235
min       -0.978200
25%        0.361200
50%        0.811500
75%        0.900100
max        0.987800
Name: y_continuous, dtype: float64

y_test:
count    95.000000
mean      0.499582
std       0.560678
min      -0.920600
25%       0.366400
50%       0.765000
75%       0.892700
max       0.968200
Name: y_continuous, dtype: float64


In [9]:
# Stratified 80/20 split for the binary target: `stratify` keeps the share of
# 1s (approximately) the same in the train and test subsets.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    texts_with_ngrams,
    data['y_binary'],
    test_size=0.2,
    random_state=42,
    stratify=data['y_binary'],
)

# subset sizes
print('n. obs. in train:',len(X_train2))
print('n. obs. in test:',len(X_test2))
print('\n')
# share of positive observations in each subset
print('1s % in y_train:',np.round((y_train2==1).sum()/len(y_train2),4))
print('1s % in y_test:',np.round((y_test2==1).sum()/len(y_test2),4))

n. obs. in train: 377
n. obs. in test: 95


1s % in y_train: 0.2838
1s % in y_test: 0.2842


In [10]:
# Stratified 80/20 split for the multiclass target: `stratify` keeps the class
# proportions (approximately) the same in the train and test subsets.
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    texts_with_ngrams,
    data['y_multiclass'],
    test_size=0.2,
    random_state=42,
    stratify=data['y_multiclass'],
)

# subset sizes
print('n. obs. in train:',len(X_train3))
print('n. obs. in test:',len(X_test3))

# class proportions in each subset
print('\n',np.round(y_train3.value_counts(normalize=True), 4))
print('\n',np.round(y_test3.value_counts(normalize=True), 4))

n. obs. in train: 377
n. obs. in test: 95

 y_multiclass
4    0.3024
3    0.2997
5    0.2838
2    0.0690
1    0.0451
Name: proportion, dtype: float64

 y_multiclass
4    0.3053
3    0.2947
5    0.2842
2    0.0737
1    0.0421
Name: proportion, dtype: float64


### 4. Feature extraction

In [12]:
# TF, DF, IDF and TF-IDF statistics for all tokens of a tokenized corpus

def compute_tfidf(texts):
    """Compute TF, DF, IDF and TF-IDF for a list of tokenized documents.

    Parameters
    ----------
    texts : list of list of str
        One token list per document.

    Returns
    -------
    tf_docs : list of dict
        Per document: token -> count in the document / document length.
    df_dict : dict
        token -> fraction of documents that contain the token.
    idf_dict : dict
        token -> math.log(1 / df).
    tfidf_docs : list of dict
        Per document: token -> tf * idf.
    """
    n_docs = len(texts)

    # term frequency: relative frequency of each token within its document
    tf_docs = []
    for tokens in texts:
        doc_length = len(tokens)
        tf_docs.append({tok: cnt / doc_length for tok, cnt in Counter(tokens).items()})

    # document frequency: every document contributes at most once per token
    docs_containing = Counter()
    for tokens in texts:
        docs_containing.update(set(tokens))
    df_dict = {tok: hits / n_docs for tok, hits in docs_containing.items()}

    # inverse document frequency
    idf_dict = {tok: math.log(1 / share) for tok, share in df_dict.items()}

    # TF-IDF: per-document TF weighted by the corpus-level IDF
    tfidf_docs = [
        {tok: tf_value * idf_dict[tok] for tok, tf_value in tf_doc.items()}
        for tf_doc in tf_docs
    ]

    return tf_docs, df_dict, idf_dict, tfidf_docs

# statistics are computed on the training subsets only (one call per target variable)
tf_docs1, df_dict1, idf_dict1, tfidf_docs1 = compute_tfidf(X_train1)
tf_docs2, df_dict2, idf_dict2, tfidf_docs2 = compute_tfidf(X_train2)
tf_docs3, df_dict3, idf_dict3, tfidf_docs3 = compute_tfidf(X_train3)

# usually we are interested in DF, primarily due to its intuitive interpretation;
# an initial analysis of DF is the first step towards filtering out tokens that are too frequent or too rare

In [13]:
# Filter both subsets using document-frequency statistics of the training subset only.
# NOTE(review): none of the *_filtered lists created here is referenced by the
# later cells of this notebook (the count matrices are built from X_train*/X_test*)
# - confirm whether the filter was meant to feed the next step.
def _filter_by_document_frequency(df_dict, train_docs, test_docs):
    """Keep tokens whose training DF is above 2/len(train_docs) and at most 0.95.

    Returns the filtered DF dict, the set of kept tokens, and the train and
    test documents restricted to those tokens.
    """
    rare_threshold = 2 / len(train_docs)
    kept_df = {tok: share for tok, share in df_dict.items() if rare_threshold < share <= 0.95}
    kept_tokens = set(kept_df.keys())
    train_kept = [[tok for tok in doc if tok in kept_tokens] for doc in train_docs]
    test_kept = [[tok for tok in doc if tok in kept_tokens] for doc in test_docs]
    return kept_df, kept_tokens, train_kept, test_kept

# continuous target
df_dict1_filtered, allowed_keys, X_train1_filtered, X_test1_filtered = \
    _filter_by_document_frequency(df_dict1, X_train1, X_test1)

# binary target
df_dict2_filtered, allowed_keys, X_train2_filtered, X_test2_filtered = \
    _filter_by_document_frequency(df_dict2, X_train2, X_test2)

# multiclass target
df_dict3_filtered, allowed_keys, X_train3_filtered, X_test3_filtered = \
    _filter_by_document_frequency(df_dict3, X_train3, X_test3)

In [14]:
# token -> number of occurrences within one document
def count_words(doc):
    """Return a plain dict mapping each token of `doc` to its count."""
    tally = Counter()
    tally.update(doc)
    return dict(tally)

# Token-count tables (documents x tokens) for every train/test pair.
# NOTE(review): the tables are built from the unfiltered X_train*/X_test* lists,
# not from the DF-filtered *_filtered lists - confirm this is intended.
def _token_count_tables(train_docs, test_docs):
    """Build aligned token-count tables for one train/test pair.

    The test table keeps only tokens present in training and has exactly the
    columns of the training table (tokens absent from a document count as 0).
    Returns the per-document count dicts and DataFrames for both subsets,
    plus the set of training tokens.
    """
    train_rows = [count_words(doc) for doc in train_docs]
    train_table = pd.DataFrame(train_rows).fillna(0).astype(int)

    # test subset: only tokens present in training
    train_tokens = set(train_table.columns)
    test_rows = [
        {token: count for token, count in count_words(doc).items() if token in train_tokens}
        for doc in test_docs
    ]
    test_table = (
        pd.DataFrame(test_rows)
        .reindex(columns=train_table.columns, fill_value=0)
        .fillna(0)
        .astype(int)
    )
    return train_rows, train_table, test_rows, train_tokens, test_table

# continuous target
(token_counts_train_list1, token_counts_train_df1,
 token_counts_test_list1, allowed_tokens, token_counts_test_df1) = _token_count_tables(X_train1, X_test1)

# binary target
(token_counts_train_list2, token_counts_train_df2,
 token_counts_test_list2, allowed_tokens, token_counts_test_df2) = _token_count_tables(X_train2, X_test2)

# multiclass target
(token_counts_train_list3, token_counts_train_df3,
 token_counts_test_list3, allowed_tokens, token_counts_test_df3) = _token_count_tables(X_train3, X_test3)

### 5. Feature selection (hybrid, i.e., mutual information + LASSO)

In [15]:
# Mutual information between every token-count column and the continuous, binary
# and multiclass target respectively; both sides are discretized into 10
# equal-width bins with pd.cut before scoring.
def _binned_mutual_information(count_table, target, bins=10):
    """Return {column: mutual information between the binned column and the binned target}.

    Parameters
    ----------
    count_table : pandas.DataFrame
        Token counts, one column per token.
    target : pandas.Series
        Target values aligned row by row with `count_table`.
    bins : int, default=10
        Number of equal-width bins used for both the feature and the target.
    """
    # The binned target does not depend on the column, so compute it once
    # instead of once per token column.
    target_bins = pd.cut(target, bins=bins, labels=False)
    return {
        column: mutual_info_score(pd.cut(count_table[column], bins=bins, labels=False), target_bins)
        for column in count_table.columns
    }

mutual_information_scores1 = _binned_mutual_information(token_counts_train_df1, y_train1)
mutual_information_scores2 = _binned_mutual_information(token_counts_train_df2, y_train2)
mutual_information_scores3 = _binned_mutual_information(token_counts_train_df3, y_train3)

In [16]:
# tokens with the highest mutual information for the continuous target variable
mis_sorted_by_values1 = {
    token: mutual_information_scores1[token]
    for token in sorted(mutual_information_scores1, key=mutual_information_scores1.get, reverse=True)
}
# display the 25 top-ranked tokens
dict(list(mis_sorted_by_values1.items())[:25])

{'work': 0.06852023599789789,
 'poor': 0.06538583450799125,
 'great': 0.05809834324420729,
 'manager': 0.0520763279497507,
 'working': 0.05174287752679016,
 'mcdonalds': 0.04718376837014386,
 'employee': 0.04351394327788066,
 'hour': 0.041905322853077034,
 'customer': 0.041344900876389666,
 'rude': 0.041293984446295014,
 'bit': 0.04064552968034424,
 'okay_work': 0.03987036461559835,
 'okay': 0.03863280088345439,
 'job': 0.03841647681864241,
 'good': 0.03813675637351259,
 'mcdonalds_hour': 0.03750229360685153,
 'hour_pay': 0.03750229360685153,
 'pay_bit': 0.03750229360685153,
 'bit_poor': 0.03750229360685153,
 'okay_work_mcdonalds': 0.03750229360685153,
 'work_mcdonalds_hour': 0.03750229360685153,
 'mcdonalds_hour_pay': 0.03750229360685153,
 'hour_pay_bit': 0.03750229360685153,
 'pay_bit_poor': 0.03750229360685153,
 'love': 0.03745569928914078}

In [17]:
# tokens with the highest mutual information for the binary target variable
mis_sorted_by_values2 = {
    token: mutual_information_scores2[token]
    for token in sorted(mutual_information_scores2, key=mutual_information_scores2.get, reverse=True)
}
# display the 25 top-ranked tokens
dict(list(mis_sorted_by_values2.items())[:25])

{'great': 0.022113894664759947,
 'awesome': 0.017781019340707898,
 'rude': 0.01734850219884862,
 'working': 0.014684130896618679,
 'everything': 0.013667550138844468,
 'atmosphere': 0.013600605969756407,
 'employed': 0.013506858464152088,
 'family': 0.011584618180137948,
 'poor': 0.010831685916463454,
 'summer': 0.010467119803805713,
 'fast': 0.010413231552141626,
 'hour': 0.010381145232460705,
 'excellent': 0.010129606421450686,
 'mcdonalds_really': 0.010102747103829912,
 'support': 0.010102747103829912,
 'job_flexible': 0.010102747103829912,
 'one_work': 0.010102747103829912,
 'great_work': 0.010102747103829912,
 'make_job': 0.010102747103829912,
 'make': 0.010092032007154375,
 'good_first': 0.00991293022180137,
 'care': 0.009300233690564416,
 'level': 0.008997168611004237,
 'good_first_job': 0.008997168611004233,
 'really': 0.0085860200409387}

In [18]:
# tokens with the highest mutual information for the multiclass target variable
mis_sorted_by_values3 = {
    token: mutual_information_scores3[token]
    for token in sorted(mutual_information_scores3, key=mutual_information_scores3.get, reverse=True)
}
# display the 25 top-ranked tokens
dict(list(mis_sorted_by_values3.items())[:25])

{'employee': 0.0325730559121349,
 'rude': 0.03210950453036741,
 'location': 0.03174701274883544,
 'work': 0.03148743209665761,
 'job': 0.026249420950348837,
 'love': 0.025900442488308154,
 'always': 0.024039298441594064,
 'issue': 0.023288552913919007,
 'family': 0.02275977802203084,
 'team': 0.022436127422461958,
 'get': 0.02219691776184784,
 'manager': 0.021593400894121637,
 'could': 0.021347419754302398,
 'customer': 0.02125308286412182,
 'college': 0.021127843129437725,
 'need': 0.021067981273059862,
 'make': 0.020756891868703595,
 'made': 0.020300429530227218,
 'yelled': 0.02020155139116499,
 'high': 0.01990024073743745,
 'place': 0.019614744079676852,
 'really': 0.01959025227053151,
 'working': 0.019564724990008345,
 'high_school': 0.01942974345556537,
 'many': 0.019343980494375083}

In [25]:
# Custom transformer keeping the first `top_n` features of a predefined ranking
class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Select the columns named by the first `top_n` entries of `feature_order`.

    Parameters
    ----------
    feature_order : iterable
        Column labels, already ordered from most to least important.
    top_n : int, default=25
        Number of leading labels to keep.
    """

    def __init__(self, feature_order, top_n=25):
        self.top_n = top_n
        self.feature_order = feature_order

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X):
        """Return `X` indexed by the first `top_n` labels of `feature_order`."""
        leading_features = [label for label in islice(self.feature_order, self.top_n)]
        return X[leading_features]

# Predefined feature order: tokens ranked by mutual information with the continuous target.
# NOTE(review): this ranking was computed on the whole training subset, i.e.
# outside the cross-validation folds, so the CV score below may be optimistic.
feature_order = list(mis_sorted_by_values1.keys())

# pipeline: feature selection -> scaling (no centering) -> LASSO regression
pipeline = Pipeline([
    ("feature_select", TopFeatureSelector(feature_order=feature_order)),
    ("scale", StandardScaler(with_mean=False)),
    ("reg", Lasso(random_state=42, max_iter=5000))
])

# grid search parameters: number of features + regularization strength (alpha)
param_grid = {
    "feature_select__top_n": [25, 50, 75, 100, 125, 150],
    "reg__alpha": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
}

# k-fold cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Scoring: negative RMSE (GridSearchCV maximizes the score). scikit-learn's
# built-in named scorer replaces the hand-rolled make_scorer(lambda ...) and
# yields the same values.
rmse_scorer = "neg_root_mean_squared_error"

# grid search
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring=rmse_scorer,
    n_jobs=-1
)

# fit the model on the training set
grid.fit(token_counts_train_df1, y_train1)

# print best hyperparameters and CV score (sign flipped back to a positive RMSE)
print("Best params:", grid.best_params_)
print("Best CV RMSE:", np.round(-grid.best_score_,4))

# evaluate on the test set
y_test_pred = grid.best_estimator_.predict(token_counts_test_df1)
test_rmse = np.sqrt(mean_squared_error(y_test1, y_test_pred))
print("Test RMSE:", np.round(test_rmse,4))

# the whole procedure takes ~10 s.
# results are: RMSE (CV) = 0.4294, RMSE (test) = 0.5275

Best params: {'feature_select__top_n': 75, 'reg__alpha': 0.01}
Best CV RMSE: 0.4294
Test RMSE: 0.5275


In [26]:
# best pipeline found by the grid search
best_model = grid.best_estimator_
selector_step = best_model.named_steps['feature_select']

# LASSO regression coefficients, one per selected (scaled) feature
coefficients = best_model.named_steps['reg'].coef_

# importance = absolute value of the coefficient
importance = np.abs(coefficients)

# table pairing every selected feature with its importance
selected_features = selector_step.feature_order[:selector_step.top_n]
df = pd.DataFrame({
    'feature': selected_features,
    'importance': importance
})

In [27]:
# up to 25 features with the largest absolute LASSO coefficient, in descending order
df.sort_values(by='importance', ascending=False).head(25)

Unnamed: 0,feature,importance
1,poor,0.128944
9,rude,0.090439
27,horrible,0.089899
2,great,0.070739
51,enjoy,0.069185
61,mistake,0.068227
24,love,0.067104
72,easy,0.063245
32,friendly,0.056768
14,good,0.052437


In [28]:
# keep only the features whose LASSO coefficient is non-zero
nonzero_features = df[df['importance'] > 0]
kept_columns = nonzero_features['feature']

# train/test design matrices restricted to those features, plus the targets
regression_model_data = {
    "X_train": token_counts_train_df1[kept_columns],
    "X_test": token_counts_test_df1[kept_columns],
    "y_train": y_train1,
    "y_test": y_test1
}

# save to pickle
with open("outputs/regression_model_data.pkl", "wb") as pickle_file:
    pickle.dump(regression_model_data, pickle_file)

In [29]:
# Custom transformer keeping the first `top_n` features of a predefined ranking
class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Select the columns named by the first `top_n` entries of `feature_order`.

    Parameters
    ----------
    feature_order : iterable
        Column labels, already ordered from most to least important.
    top_n : int, default=25
        Number of leading labels to keep.
    """

    def __init__(self, feature_order, top_n=25):
        self.top_n = top_n
        self.feature_order = feature_order

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X):
        """Return `X` indexed by the first `top_n` labels of `feature_order`."""
        leading_features = [label for label in islice(self.feature_order, self.top_n)]
        return X[leading_features]

# Predefined feature order: tokens ranked by mutual information with the binary target.
# NOTE(review): the ranking was computed on the whole training subset, i.e.
# outside the cross-validation folds, so the CV score below may be optimistic.
feature_order = list(mis_sorted_by_values2.keys())

# pipeline: feature selection -> scaling (no centering) -> L1-penalized logistic regression
selector = TopFeatureSelector(feature_order=feature_order)
scaler = StandardScaler(with_mean=False)
classifier = LogisticRegression(penalty="l1", solver="saga", random_state=42, max_iter=5000)
pipeline = Pipeline([
    ("feature_select", selector),
    ("scale", scaler),
    ("clf", classifier),
])

# grid search parameters
param_grid = {
    # number of top-ranked features to keep
    "feature_select__top_n": [25, 50, 75, 100, 125, 150],
    # C is the inverse of the regularization strength (smaller C = stronger L1 penalty)
    "clf__C": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000, 10000, 100000],
}

# stratified k-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search using AUC as the scoring metric
grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, scoring="roc_auc", n_jobs=-1)

# fit the model on the training set
grid.fit(token_counts_train_df2, y_train2)

# print best hyperparameters and CV score
print("Best params:", grid.best_params_)
print("Best CV AUC:", np.round(grid.best_score_,4))

# evaluate on the test set using the predicted probability of class 1
y_test_pred = grid.best_estimator_.predict_proba(token_counts_test_df2)[:, 1]
auc = roc_auc_score(y_test2, y_test_pred)
print("Test AUC:", np.round(auc,4))

# the whole procedure takes ~1.5 min.
# results are: AUC (CV) = 0.8167, AUC (test) = 0.6117

Best params: {'clf__C': 10000, 'feature_select__top_n': 125}
Best CV AUC: 0.8167
Test AUC: 0.6117


In [30]:
# best pipeline found by the grid search
best_model = grid.best_estimator_
selector_step = best_model.named_steps['feature_select']

# coefficients of the L1-penalized binary logistic regression
# (coef_ holds a single row in the binary case, hence [0])
coefficients = best_model.named_steps['clf'].coef_[0]

# importance = absolute value of the coefficient
importance = np.abs(coefficients)

# table pairing every selected feature with its importance
selected_features = selector_step.feature_order[:selector_step.top_n]
df = pd.DataFrame({
    'feature': selected_features,
    'importance': importance
})

In [31]:
# up to 25 features with the largest absolute logistic-regression coefficient, in descending order
df.sort_values(by='importance', ascending=False).head(25)

Unnamed: 0,feature,importance
28,either,1.24578
18,make_job,1.155716
19,make,1.090345
2,rude,1.056951
46,much,1.023651
11,hour,0.992418
74,work_long,0.848031
122,problem,0.83008
44,two,0.829507
5,atmosphere,0.818699


In [32]:
# keep only the features whose coefficient is non-zero
nonzero_features = df[df['importance'] > 0]
kept_columns = nonzero_features['feature']

# train/test design matrices restricted to those features, plus the targets
binary_classification_model_data = {
    "X_train": token_counts_train_df2[kept_columns],
    "X_test": token_counts_test_df2[kept_columns],
    "y_train": y_train2,
    "y_test": y_test2
}

# save to pickle
with open("outputs/binary_classification_model_data.pkl", "wb") as pickle_file:
    pickle.dump(binary_classification_model_data, pickle_file)

In [33]:
# Custom transformer keeping the first `top_n` features of a predefined ranking
class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Select the columns named by the first `top_n` entries of `feature_order`.

    Parameters
    ----------
    feature_order : iterable
        Column labels, already ordered from most to least important.
    top_n : int, default=25
        Number of leading labels to keep.
    """

    def __init__(self, feature_order, top_n=25):
        self.top_n = top_n
        self.feature_order = feature_order

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X):
        """Return `X` indexed by the first `top_n` labels of `feature_order`."""
        leading_features = [label for label in islice(self.feature_order, self.top_n)]
        return X[leading_features]

# Predefined feature order: tokens ranked by mutual information with the multiclass target.
# NOTE(review): the ranking was computed on the whole training subset, i.e.
# outside the cross-validation folds, so the CV score below may be optimistic.
feature_order = list(mis_sorted_by_values3.keys())

# pipeline: feature selection -> scaling (no centering) -> L1-penalized logistic regression (multiclass)
# The explicit multi_class="multinomial" argument was dropped: it is deprecated
# in recent scikit-learn releases, and with the saga solver and more than two
# classes the default setting already fits a multinomial model.
pipeline = Pipeline([
    ("feature_select", TopFeatureSelector(feature_order=feature_order)),
    ("scale", StandardScaler(with_mean=False)),
    ("clf", LogisticRegression(
        penalty="l1",
        solver="saga",
        random_state=42,
        max_iter=5000
    ))
])

# grid search parameters: number of features + inverse regularization strength C
# (smaller C = stronger L1 penalty)
param_grid = {
    "feature_select__top_n": [25, 50, 75, 100, 125, 150],
    "clf__C": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000, 10000, 100000]
}

# stratified k-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search using multi-class ROC AUC: one-vs-rest, unweighted (macro) average
# over classes; the class-frequency-weighted variant is "roc_auc_ovr_weighted"
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring="roc_auc_ovr",
    n_jobs=-1
)

# fit the model on the training set
grid.fit(token_counts_train_df3, y_train3)

# print best hyperparameters and CV score
print("Best params:", grid.best_params_)
print("Best CV ROC AUC (OVR):", np.round(grid.best_score_,4))

# evaluate on the test set (same one-vs-rest, macro-averaged AUC)
y_test_pred_proba = grid.best_estimator_.predict_proba(token_counts_test_df3)
test_auc_ovr = roc_auc_score(y_test3, y_test_pred_proba, multi_class="ovr")
print("Test ROC AUC (OVR):", np.round(test_auc_ovr,4))

# results are: AUC (CV) = 0.7041, AUC (test) = 0.5469



Best params: {'clf__C': 1, 'feature_select__top_n': 75}
Best CV ROC AUC (OVR): 0.7041
Test ROC AUC (OVR): 0.5469


In [34]:
# best pipeline found by the grid search
best_model = grid.best_estimator_
selector_step = best_model.named_steps['feature_select']

# coefficients of the L1-penalized multiclass logistic regression
# (one row per class, one column per selected feature)
coefficients = best_model.named_steps['clf'].coef_

# importance = largest absolute coefficient of a feature across the classes
importance = np.abs(coefficients).max(axis=0)

# table pairing every selected feature with its importance
selected_features = selector_step.feature_order[:selector_step.top_n]
df = pd.DataFrame({
    'feature': selected_features,
    'importance': importance
})

In [35]:
# up to 25 features with the largest importance (max absolute coefficient across classes), in descending order
df.sort_values(by='importance', ascending=False).head(25)

Unnamed: 0,feature,importance
38,workplace,1.050524
68,willing,1.010198
35,get_yelled,0.903026
9,team,0.823881
37,outside,0.751657
51,though,0.745254
46,ever,0.727006
17,made,0.707882
5,love,0.707588
55,along,0.689464


In [36]:
# keep only the features with a non-zero coefficient for at least one class
nonzero_features = df[df['importance'] > 0]
kept_columns = nonzero_features['feature']

# train/test design matrices restricted to those features, plus the targets
multinomial_classification_model_data = {
    "X_train": token_counts_train_df3[kept_columns],
    "X_test": token_counts_test_df3[kept_columns],
    "y_train": y_train3,
    "y_test": y_test3
}

# save to pickle
with open("outputs/multinomial_classification_model_data.pkl", "wb") as pickle_file:
    pickle.dump(multinomial_classification_model_data, pickle_file)