In [10]:
# Dataframes
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Splitting data
from sklearn.model_selection import train_test_split

# Text Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Reports
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [11]:
# Load the labelled examples and discard every row that has a missing value
df = pd.read_csv('input_new_categories.csv').dropna()

# Independent variable is the raw text, dependent variable is its category
x, y = df['text'], df['type']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [12]:
# Specify vectorizer to be used: a TF-IDF vectorizer created with no arguments,
# so every option keeps its library default. Nothing is fitted in this cell.
vectorizer = TfidfVectorizer()

In [13]:
# Fit the vectorizer on the training text only and vectorize it in one pass
vector_xtrain = vectorizer.fit_transform(xtrain)

# Vectorize the test text with the already-fitted vectorizer (no re-fitting)
vector_xtest = vectorizer.transform(xtest)

In [14]:
# Category names kept as a plain list, one entry per line
labels = [
    'database',
    'database_admin',
    'deploy',
    'elasticsearch',
    'fastly',
    'redis',
    'resources',
    'sendgrid',
    'upgrade_services',
    'upsize',
]

In [15]:
from sklearn.linear_model import LogisticRegression

# Define the model for Multinomial Logistic Regression with minimized regularization.
# - C is the inverse regularization strength, so a huge value makes the L2
#   penalty negligible.
# - `multi_class` is deliberately not passed: scikit-learn deprecated the
#   parameter in 1.5 (it raises a FutureWarning and is scheduled for removal),
#   and the default lbfgs solver already fits a multinomial (softmax) model
#   whenever the target has more than two classes.
mlr = LogisticRegression(penalty='l2', C=1e42)

# Fit the model on the TF-IDF training matrix
mlr.fit(vector_xtrain, ytrain)

# Make predictions for the held-out test matrix
preds = mlr.predict(vector_xtest)

# Generate classification report (per-class precision / recall / F1 and averages)
print('Multinomial Logistic Regression, RS=0')
print(classification_report(ytest, preds))

Multinomial Logistic Regression, RS=0
                  precision    recall  f1-score   support

        database       0.97      0.97      0.97       353
  database_admin       0.98      0.98      0.98       215
          deploy       0.96      0.99      0.97       606
   elasticsearch       0.99      0.99      0.99       434
          fastly       0.99      1.00      0.99       500
           redis       0.97      0.95      0.96       405
       resources       0.99      0.99      0.99       385
        sendgrid       0.97      0.97      0.97       183
upgrade_services       0.98      0.95      0.96       272
          upsize       1.00      1.00      1.00       432

        accuracy                           0.98      3785
       macro avg       0.98      0.98      0.98      3785
    weighted avg       0.98      0.98      0.98      3785



In [16]:
# Get feature names (the vocabulary terms) from the fitted vectorizer
feature_names = vectorizer.get_feature_names_out()

# Get the learned coefficients from the model
coefficients = mlr.coef_

# Sum the squared coefficients over axis 0, giving one non-negative score per
# feature (the captured output shows one entry per vocabulary term)
squared_coefficients = np.square(coefficients).sum(axis=0)

# Create a Series keyed by feature name, largest score first
sorted_features_l2 = pd.Series(squared_coefficients, index=feature_names).sort_values(ascending=False)

# Get info on resulting object. Series.info() writes its report to stdout and
# returns None, so it is called directly; wrapping it in print() would append
# a stray "None" line to the output.
sorted_features_l2.info()
sorted_features_l2.head()

<class 'pandas.core.series.Series'>
Index: 7498 entries, sql to wig
Series name: None
Non-Null Count  Dtype  
--------------  -----  
7498 non-null   float64
dtypes: float64(1)
memory usage: 117.2+ KB
None


sql         2.650664e+06
fastly      2.313035e+06
upgrade     2.090904e+06
database    1.981088e+06
upsize      1.973084e+06
dtype: float64

In [17]:
# Write the ranked feature scores to disk, keeping a header row in the file
sorted_features_l2.to_csv(
    path_or_buf='mlr_feature_importance_new_categories.csv',
    header=True,
)