In [1]:
# Dataframes
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Splitting data
from sklearn.model_selection import train_test_split

# Text Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Reports
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
# Import data
df = pd.read_excel('input_og_categories.xlsx')

# Drop nulls
df = df.dropna()

# Clarify independent and dependent variables
x = df['text']
y = df['type']

# Split datasets into training and testing sets
xtrain, xtest, ytrain, ytest = train_test_split(x, y,
                                                test_size=0.2, random_state=0)


In [3]:
# Specify vectorizer to be used
vectorizer = TfidfVectorizer()

In [4]:
# Create vectorized versions of relevant datasets
vector_xtrain = vectorizer.fit_transform(xtrain)
vector_xtest = vectorizer.transform(xtest)

In [5]:
# Create a "labels" list
labels = ['database','deploy','elasticsearch','fastly','redis','resources','sendgrid','upgrade_services','upsize']

In [6]:
from sklearn.linear_model import LogisticRegression

# Define the model for Multinomial Logistic Regression with minimized regularization
mlr = LogisticRegression(multi_class = 'multinomial', penalty = 'l2', C = 1e42)

# Fit the model
mlr.fit(vector_xtrain, ytrain)

# Make predictions
preds = mlr.predict(vector_xtest)

# Generate classification report
print('Multinomial Logistic Regression, RS=0')
print(classification_report(ytest, preds))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Multinomial Logistic Regression, RS=0
                  precision    recall  f1-score   support

        database       0.85      0.82      0.83       525
          deploy       0.96      0.96      0.96       901
   elasticsearch       0.79      0.73      0.76       268
          fastly       0.98      0.99      0.99       552
           redis       0.85      0.88      0.87       173
       resources       0.78      0.82      0.80       379
        sendgrid       0.98      0.94      0.96       137
upgrade_services       0.83      0.86      0.84       355
          upsize       0.96      0.95      0.96       495

        accuracy                           0.90      3785
       macro avg       0.89      0.88      0.89      3785
    weighted avg       0.90      0.90      0.90      3785



In [8]:
# Get feature names from the vectorizer
feature_names = vectorizer.get_feature_names_out()

# Get coefficients from the model
coefficients = mlr.coef_

# Calculate the square sum of coefficients for each feature across all classes
squared_coefficients = np.square(coefficients).sum(axis=0)

# Create a Series for easier sorting and indexing
sorted_features_l2 = pd.Series(squared_coefficients, index=feature_names).sort_values(ascending=False)

# Get info on resulting object
print(sorted_features_l2.info())
sorted_features_l2.head()

<class 'pandas.core.series.Series'>
Index: 7498 entries, deploy to surcharge
Series name: None
Non-Null Count  Dtype  
--------------  -----  
7498 non-null   float64
dtypes: float64(1)
memory usage: 117.2+ KB
None


deploy      19629.111046
database     8863.570090
fastly       7595.302286
search       6151.625940
mail         5862.788412
dtype: float64

In [9]:
# Now, let's save this Series to a CSV file
sorted_features_l2.to_csv('mlr_feature_importance_og_categories.csv', header=True)