# Word Count Test

In [1]:
# Data Loading
import pandas as pd
import numpy as np

# Data Splitting and Resampling
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Reports
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Model
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.neural_network import MLPClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# Classifier under test. Exactly one assignment to `mdl` is active; the
# commented-out lines are alternative estimators (their imports above must be
# re-enabled alongside them).
# mdl = DecisionTreeClassifier()
# mdl = MLPClassifier()
# mdl = LogisticRegression(multi_class = 'multinomial')
# mdl = MultinomialNB()
# mdl = RandomForestClassifier()
# The cells below call mdl.fit() repeatedly on this single instance. The seed
# is fixed to match the random_state = 0 used for the train/test split and the
# samplers, so the solver is not the one unseeded step in the notebook.
mdl = LinearSVC(random_state=0)

# DATA PREP

In [2]:
# Load the labelled dataset, discarding every row that has a null in any column
df = pd.read_csv('input_new_categories.csv').dropna()

# Independent variable is the raw text, dependent variable is the category label
x, y = df['text'], df['type']

# Hold out 20% of the rows for testing (fixed seed keeps the split repeatable)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=0)

# Write the held-out labels to disk without the index column
ytest.to_csv('ytest.csv', index=False)

# TF-IDF vectorizer: fitted on the training text only
vectorizer = TfidfVectorizer()
vector_xtrain = vectorizer.fit_transform(xtrain)
feats_xtr = vectorizer.get_feature_names_out()

# The test text is transformed with the vectorizer already fitted on train, so
# the feature names fetched here come from that same fitted vectorizer
vector_xtest = vectorizer.transform(xtest)
feats_xte = vectorizer.get_feature_names_out()

# Wrap each sparse matrix in a DataFrame labelled by feature name, then
# densify it to plain float64 columns
vector_xtr = (
    pd.DataFrame.sparse.from_spmatrix(vector_xtrain, columns=feats_xtr)
    .sparse.to_dense()
    .astype(np.float64)
)
vector_xte = (
    pd.DataFrame.sparse.from_spmatrix(vector_xtest, columns=feats_xte)
    .sparse.to_dense()
    .astype(np.float64)
)

# WORD COUNTS

In [3]:
# Load the word-importance table; later cells read its 'Word' column from the top.
# NOTE(review): read_excel uses pandas' default NA parsing, so a listed word
# spelled like 'null' or 'NA' would presumably load as NaN - confirm the sheet
# contains no such entries.
word_importance = pd.read_excel('word_importance_lists.xlsx')

# Column labels of the dense training matrix, as a plain list
xtrain_cols = vector_xtr.columns.tolist()

# Column labels of the dense test matrix, as a plain list
xtest_cols = vector_xte.columns.tolist()

## 50 features

In [4]:
# Number of top-ranked words to keep as features
n_features = 50

# A word is usable only if it is a column of both the train and test matrices
shared_cols = set(xtrain_cols) & set(xtest_cols)

# Walk the word list from the top, dropping repeated entries and anything that
# is not a usable column, then keep the first n_features survivors. Filtering
# before truncating means an unusable entry never reduces the feature count.
ranked_words = word_importance['Word'].drop_duplicates()
words = ranked_words[ranked_words.isin(shared_cols)].head(n_features).tolist()

# Sets of the selected words (identical for train and test by construction)
xtrain_set = set(words)
xtest_set = set(words)

# Ordered column lists. Both follow the order of the word list rather than set
# iteration order, so train and test columns always line up and the order is
# the same on every kernel run.
xtrain_list = list(words)
xtest_list = list(words)

# Filter xtrain and xtest to the selected columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [5]:
# Show full vs. filtered column counts for the train and test matrices
print(
    f'vector_xtr columns: {vector_xtr.shape[1]}',
    f'vector_xtrain columns: {vector_xtrain.shape[1]}',
    '',
    f'vector_xte columns: {vector_xte.shape[1]}',
    f'vector_xtest columns: {vector_xtest.shape[1]}',
    sep='\n',
)

vector_xtr columns: 7498
vector_xtrain columns: 50

vector_xte columns: 7498
vector_xtest columns: 50


### NO SAMPLING ADJUSTMENT

In [6]:
%%time
# Fit the model
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 141 ms
Wall time: 182 ms


In [7]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 24 ms


In [8]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='50_ns.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.94      0.85      0.89       353
  database_admin       0.96      0.95      0.96       215
          deploy       0.88      0.97      0.92       606
   elasticsearch       0.96      0.95      0.96       434
          fastly       0.95      0.98      0.97       500
           redis       0.83      0.79      0.81       405
       resources       0.97      0.96      0.96       385
        sendgrid       0.92      0.90      0.91       183
upgrade_services       0.94      0.93      0.94       272
          upsize       0.97      0.99      0.98       432

        accuracy                           0.93      3785
       macro avg       0.93      0.93      0.93      3785
    weighted avg       0.93      0.93      0.93      3785



### OVERSAMPLING

In [9]:
# Randomly oversample the filtered training data (fixed seed); the test set is not resampled
overs = RandomOverSampler(random_state=0)
vector_xtrain_os, ytrain_os = overs.fit_resample(X=vector_xtrain, y=ytrain)

In [10]:
%%time
# Fit the model
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 312 ms
Wall time: 361 ms


In [11]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 15.6 ms
Wall time: 2.99 ms


In [12]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='50_os.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.94      0.86      0.90       353
  database_admin       0.93      0.98      0.95       215
          deploy       0.93      0.95      0.94       606
   elasticsearch       0.96      0.94      0.95       434
          fastly       0.96      0.97      0.97       500
           redis       0.82      0.82      0.82       405
       resources       0.97      0.96      0.96       385
        sendgrid       0.89      0.94      0.91       183
upgrade_services       0.92      0.95      0.94       272
          upsize       0.98      0.98      0.98       432

        accuracy                           0.93      3785
       macro avg       0.93      0.93      0.93      3785
    weighted avg       0.93      0.93      0.93      3785



### UNDERSAMPLING

In [13]:
# Randomly undersample the filtered training data (fixed seed); the test set is not resampled
unders = RandomUnderSampler(random_state=0)
vector_xtrain_us, ytrain_us = unders.fit_resample(X=vector_xtrain, y=ytrain)

In [14]:
%%time
# Fit the model
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 46.9 ms
Wall time: 83.9 ms


In [15]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 1.99 ms


In [16]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='50_us.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.94      0.86      0.90       353
  database_admin       0.93      0.98      0.95       215
          deploy       0.92      0.96      0.94       606
   elasticsearch       0.97      0.94      0.96       434
          fastly       0.97      0.97      0.97       500
           redis       0.83      0.82      0.83       405
       resources       0.97      0.95      0.96       385
        sendgrid       0.90      0.94      0.92       183
upgrade_services       0.92      0.94      0.93       272
          upsize       0.98      0.98      0.98       432

        accuracy                           0.93      3785
       macro avg       0.93      0.93      0.93      3785
    weighted avg       0.93      0.93      0.93      3785



## 100 features

In [17]:
# Number of top-ranked words to keep as features
n_features = 100

# A word is usable only if it is a column of both the train and test matrices
shared_cols = set(xtrain_cols) & set(xtest_cols)

# Walk the word list from the top, dropping repeated entries and anything that
# is not a usable column, then keep the first n_features survivors. Filtering
# before truncating means an unusable entry never reduces the feature count.
ranked_words = word_importance['Word'].drop_duplicates()
words = ranked_words[ranked_words.isin(shared_cols)].head(n_features).tolist()

# Sets of the selected words (identical for train and test by construction)
xtrain_set = set(words)
xtest_set = set(words)

# Ordered column lists. Both follow the order of the word list rather than set
# iteration order, so train and test columns always line up and the order is
# the same on every kernel run.
xtrain_list = list(words)
xtest_list = list(words)

# Filter xtrain and xtest to the selected columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [18]:
# Show full vs. filtered column counts for the train and test matrices
print(
    f'vector_xtr columns: {vector_xtr.shape[1]}',
    f'vector_xtrain columns: {vector_xtrain.shape[1]}',
    '',
    f'vector_xte columns: {vector_xte.shape[1]}',
    f'vector_xtest columns: {vector_xtest.shape[1]}',
    sep='\n',
)

vector_xtr columns: 7498
vector_xtrain columns: 100

vector_xte columns: 7498
vector_xtest columns: 100


### NO SAMPLING ADJUSTMENT

In [19]:
%%time
# Fit the model
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 250 ms
Wall time: 284 ms


In [20]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 2.99 ms


In [21]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='100_ns.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.95      0.90      0.93       353
  database_admin       0.98      0.97      0.97       215
          deploy       0.91      0.98      0.94       606
   elasticsearch       0.98      0.96      0.97       434
          fastly       0.98      1.00      0.99       500
           redis       0.93      0.88      0.90       405
       resources       0.98      0.98      0.98       385
        sendgrid       1.00      0.94      0.97       183
upgrade_services       0.95      0.95      0.95       272
          upsize       0.98      1.00      0.99       432

        accuracy                           0.96      3785
       macro avg       0.96      0.96      0.96      3785
    weighted avg       0.96      0.96      0.96      3785



### OVERSAMPLING

In [22]:
# Randomly oversample the filtered training data (fixed seed); the test set is not resampled
overs = RandomOverSampler(random_state=0)
vector_xtrain_os, ytrain_os = overs.fit_resample(X=vector_xtrain, y=ytrain)

In [23]:
%%time
# Fit the model
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 500 ms
Wall time: 565 ms


In [24]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 4.99 ms


In [25]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='100_os.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.95      0.90      0.92       353
  database_admin       0.95      0.98      0.96       215
          deploy       0.94      0.95      0.95       606
   elasticsearch       0.98      0.95      0.96       434
          fastly       0.99      0.99      0.99       500
           redis       0.90      0.90      0.90       405
       resources       0.98      0.98      0.98       385
        sendgrid       0.95      0.97      0.96       183
upgrade_services       0.94      0.97      0.96       272
          upsize       0.99      1.00      0.99       432

        accuracy                           0.96      3785
       macro avg       0.96      0.96      0.96      3785
    weighted avg       0.96      0.96      0.96      3785



### UNDERSAMPLING

In [26]:
# Randomly undersample the filtered training data (fixed seed); the test set is not resampled
unders = RandomUnderSampler(random_state=0)
vector_xtrain_us, ytrain_us = unders.fit_resample(X=vector_xtrain, y=ytrain)

In [27]:
%%time
# Fit the model
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 93.8 ms
Wall time: 133 ms


In [28]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 2.99 ms


In [29]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='100_us.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.95      0.90      0.92       353
  database_admin       0.95      0.99      0.97       215
          deploy       0.93      0.96      0.95       606
   elasticsearch       0.98      0.95      0.96       434
          fastly       0.99      0.99      0.99       500
           redis       0.89      0.90      0.89       405
       resources       0.99      0.95      0.97       385
        sendgrid       0.95      0.97      0.96       183
upgrade_services       0.93      0.96      0.95       272
          upsize       0.99      0.99      0.99       432

        accuracy                           0.95      3785
       macro avg       0.95      0.96      0.95      3785
    weighted avg       0.96      0.95      0.95      3785



## 200 features

In [30]:
# Number of top-ranked words to keep as features
n_features = 200

# A word is usable only if it is a column of both the train and test matrices
shared_cols = set(xtrain_cols) & set(xtest_cols)

# Walk the word list from the top, dropping repeated entries and anything that
# is not a usable column, then keep the first n_features survivors. Filtering
# before truncating means an unusable entry never reduces the feature count.
ranked_words = word_importance['Word'].drop_duplicates()
words = ranked_words[ranked_words.isin(shared_cols)].head(n_features).tolist()

# Sets of the selected words (identical for train and test by construction)
xtrain_set = set(words)
xtest_set = set(words)

# Ordered column lists. Both follow the order of the word list rather than set
# iteration order, so train and test columns always line up and the order is
# the same on every kernel run.
xtrain_list = list(words)
xtest_list = list(words)

# Filter xtrain and xtest to the selected columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [31]:
# Show full vs. filtered column counts for the train and test matrices
print(
    f'vector_xtr columns: {vector_xtr.shape[1]}',
    f'vector_xtrain columns: {vector_xtrain.shape[1]}',
    '',
    f'vector_xte columns: {vector_xte.shape[1]}',
    f'vector_xtest columns: {vector_xtest.shape[1]}',
    sep='\n',
)

vector_xtr columns: 7498
vector_xtrain columns: 200

vector_xte columns: 7498
vector_xtest columns: 200


### NO SAMPLING ADJUSTMENT

In [32]:
%%time
# Fit the model
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 406 ms
Wall time: 456 ms


In [33]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 15.6 ms
Wall time: 4.99 ms


In [34]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='200_ns.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.96      0.90      0.93       353
  database_admin       0.97      0.95      0.96       215
          deploy       0.90      0.99      0.94       606
   elasticsearch       0.98      0.95      0.97       434
          fastly       0.98      1.00      0.99       500
           redis       0.93      0.89      0.91       405
       resources       0.98      0.98      0.98       385
        sendgrid       0.99      0.93      0.96       183
upgrade_services       0.95      0.94      0.95       272
          upsize       0.98      0.99      0.98       432

        accuracy                           0.96      3785
       macro avg       0.96      0.95      0.96      3785
    weighted avg       0.96      0.96      0.96      3785



### OVERSAMPLING

In [35]:
# Randomly oversample the filtered training data (fixed seed); the test set is not resampled
overs = RandomOverSampler(random_state=0)
vector_xtrain_os, ytrain_os = overs.fit_resample(X=vector_xtrain, y=ytrain)

In [36]:
%%time
# Fit the model
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 594 ms
Wall time: 761 ms


In [37]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 3.99 ms


In [38]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='200_os.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.95      0.91      0.93       353
  database_admin       0.94      0.98      0.96       215
          deploy       0.94      0.97      0.95       606
   elasticsearch       0.99      0.94      0.96       434
          fastly       0.99      0.98      0.99       500
           redis       0.93      0.91      0.92       405
       resources       0.98      0.97      0.98       385
        sendgrid       0.95      0.97      0.96       183
upgrade_services       0.94      0.96      0.95       272
          upsize       0.98      0.99      0.98       432

        accuracy                           0.96      3785
       macro avg       0.96      0.96      0.96      3785
    weighted avg       0.96      0.96      0.96      3785



### UNDERSAMPLING

In [39]:
# Randomly undersample the filtered training data (fixed seed); the test set is not resampled
unders = RandomUnderSampler(random_state=0)
vector_xtrain_us, ytrain_us = unders.fit_resample(X=vector_xtrain, y=ytrain)

In [40]:
%%time
# Fit the model
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 203 ms
Wall time: 210 ms


In [41]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 8.18 ms


In [42]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='200_us.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.95      0.90      0.92       353
  database_admin       0.93      0.99      0.96       215
          deploy       0.93      0.98      0.95       606
   elasticsearch       0.98      0.95      0.96       434
          fastly       0.99      0.98      0.99       500
           redis       0.92      0.90      0.91       405
       resources       0.99      0.95      0.97       385
        sendgrid       0.95      0.97      0.96       183
upgrade_services       0.95      0.95      0.95       272
          upsize       0.98      0.99      0.98       432

        accuracy                           0.96      3785
       macro avg       0.96      0.96      0.96      3785
    weighted avg       0.96      0.96      0.96      3785



## 300 features

In [43]:
# Number of top-ranked words to keep as features
n_features = 300

# A word is usable only if it is a column of both the train and test matrices
shared_cols = set(xtrain_cols) & set(xtest_cols)

# Walk the word list from the top, dropping repeated entries and anything that
# is not a usable column, then keep the first n_features survivors. Filtering
# before truncating yields exactly n_features columns without over-fetching
# from the list (e.g. head(301)) to make up for an unusable entry.
ranked_words = word_importance['Word'].drop_duplicates()
words = ranked_words[ranked_words.isin(shared_cols)].head(n_features).tolist()

# Sets of the selected words (identical for train and test by construction)
xtrain_set = set(words)
xtest_set = set(words)

# Ordered column lists. Both follow the order of the word list rather than set
# iteration order, so train and test columns always line up and the order is
# the same on every kernel run.
xtrain_list = list(words)
xtest_list = list(words)

# Filter xtrain and xtest to the selected columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [44]:
# Show full vs. filtered column counts for the train and test matrices
print(
    f'vector_xtr columns: {vector_xtr.shape[1]}',
    f'vector_xtrain columns: {vector_xtrain.shape[1]}',
    '',
    f'vector_xte columns: {vector_xte.shape[1]}',
    f'vector_xtest columns: {vector_xtest.shape[1]}',
    sep='\n',
)

vector_xtr columns: 7498
vector_xtrain columns: 300

vector_xte columns: 7498
vector_xtest columns: 300


### NO SAMPLING ADJUSTMENT

In [45]:
%%time
# Fit the model
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 375 ms
Wall time: 577 ms


In [46]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 15.6 ms
Wall time: 7.98 ms


In [47]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='300_ns.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.96      0.92      0.94       353
  database_admin       0.97      0.96      0.96       215
          deploy       0.92      0.99      0.95       606
   elasticsearch       0.98      0.96      0.97       434
          fastly       0.98      0.99      0.99       500
           redis       0.96      0.90      0.93       405
       resources       0.98      0.98      0.98       385
        sendgrid       0.99      0.94      0.96       183
upgrade_services       0.95      0.95      0.95       272
          upsize       0.98      1.00      0.99       432

        accuracy                           0.96      3785
       macro avg       0.97      0.96      0.96      3785
    weighted avg       0.96      0.96      0.96      3785



### OVERSAMPLING

In [48]:
# Randomly oversample the filtered training data (fixed seed); the test set is not resampled
overs = RandomOverSampler(random_state=0)
vector_xtrain_os, ytrain_os = overs.fit_resample(X=vector_xtrain, y=ytrain)

In [49]:
%%time
# Fit the model
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 844 ms
Wall time: 986 ms


In [50]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 15.6 ms
Wall time: 6.98 ms


In [51]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='300_os.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.96      0.93      0.94       353
  database_admin       0.93      0.98      0.95       215
          deploy       0.95      0.98      0.96       606
   elasticsearch       0.99      0.94      0.96       434
          fastly       0.99      0.98      0.99       500
           redis       0.95      0.91      0.93       405
       resources       0.98      0.96      0.97       385
        sendgrid       0.95      0.97      0.96       183
upgrade_services       0.93      0.97      0.95       272
          upsize       0.98      0.99      0.99       432

        accuracy                           0.96      3785
       macro avg       0.96      0.96      0.96      3785
    weighted avg       0.96      0.96      0.96      3785



### UNDERSAMPLING

In [52]:
# Randomly undersample the filtered training data (fixed seed); the test set is not resampled
unders = RandomUnderSampler(random_state=0)
vector_xtrain_us, ytrain_us = unders.fit_resample(X=vector_xtrain, y=ytrain)

In [53]:
%%time
# Fit the model
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 234 ms
Wall time: 308 ms


In [54]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 6.98 ms


In [55]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='300_us.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.95      0.90      0.93       353
  database_admin       0.92      0.99      0.95       215
          deploy       0.93      0.98      0.96       606
   elasticsearch       0.98      0.94      0.96       434
          fastly       0.99      0.97      0.98       500
           redis       0.93      0.91      0.92       405
       resources       0.99      0.96      0.97       385
        sendgrid       0.93      0.97      0.95       183
upgrade_services       0.94      0.94      0.94       272
          upsize       0.98      0.99      0.99       432

        accuracy                           0.96      3785
       macro avg       0.95      0.96      0.95      3785
    weighted avg       0.96      0.96      0.96      3785



## 400 features

In [56]:
# Number of top-ranked words to keep as features
n_features = 400

# A word is usable only if it is a column of both the train and test matrices
shared_cols = set(xtrain_cols) & set(xtest_cols)

# Walk the word list from the top, dropping repeated entries and anything that
# is not a usable column, then keep the first n_features survivors. Filtering
# before truncating yields exactly n_features columns without over-fetching
# from the list (e.g. head(401)) to make up for an unusable entry.
ranked_words = word_importance['Word'].drop_duplicates()
words = ranked_words[ranked_words.isin(shared_cols)].head(n_features).tolist()

# Sets of the selected words (identical for train and test by construction)
xtrain_set = set(words)
xtest_set = set(words)

# Ordered column lists. Both follow the order of the word list rather than set
# iteration order, so train and test columns always line up and the order is
# the same on every kernel run.
xtrain_list = list(words)
xtest_list = list(words)

# Filter xtrain and xtest to the selected columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [57]:
# Show full vs. filtered column counts for the train and test matrices
print(
    f'vector_xtr columns: {vector_xtr.shape[1]}',
    f'vector_xtrain columns: {vector_xtrain.shape[1]}',
    '',
    f'vector_xte columns: {vector_xte.shape[1]}',
    f'vector_xtest columns: {vector_xtest.shape[1]}',
    sep='\n',
)

vector_xtr columns: 7498
vector_xtrain columns: 400

vector_xte columns: 7498
vector_xtest columns: 400


### NO SAMPLING ADJUSTMENT

In [58]:
%%time
# Fit the model
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 500 ms
Wall time: 719 ms


In [59]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 15.6 ms
Wall time: 7.98 ms


In [60]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='400_ns.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.96      0.93      0.94       353
  database_admin       0.97      0.94      0.96       215
          deploy       0.91      0.99      0.95       606
   elasticsearch       0.98      0.95      0.97       434
          fastly       0.98      0.99      0.99       500
           redis       0.95      0.89      0.92       405
       resources       0.98      0.98      0.98       385
        sendgrid       0.98      0.96      0.97       183
upgrade_services       0.95      0.94      0.95       272
          upsize       0.98      1.00      0.99       432

        accuracy                           0.96      3785
       macro avg       0.97      0.96      0.96      3785
    weighted avg       0.96      0.96      0.96      3785



### OVERSAMPLING

In [61]:
# Randomly oversample the filtered training data (fixed seed); the test set is not resampled
overs = RandomOverSampler(random_state=0)
vector_xtrain_os, ytrain_os = overs.fit_resample(X=vector_xtrain, y=ytrain)

In [62]:
%%time
# Fit the model
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 906 ms
Wall time: 1.13 s


In [63]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 8.98 ms


In [64]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='400_os.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.95      0.93      0.94       353
  database_admin       0.92      0.97      0.95       215
          deploy       0.94      0.98      0.96       606
   elasticsearch       0.99      0.93      0.96       434
          fastly       0.99      0.99      0.99       500
           redis       0.94      0.92      0.93       405
       resources       0.98      0.96      0.97       385
        sendgrid       0.96      0.97      0.96       183
upgrade_services       0.95      0.97      0.96       272
          upsize       0.98      0.99      0.99       432

        accuracy                           0.96      3785
       macro avg       0.96      0.96      0.96      3785
    weighted avg       0.96      0.96      0.96      3785



### UNDERSAMPLING

In [65]:
# Randomly undersample the filtered training data (fixed seed); the test set is not resampled
unders = RandomUnderSampler(random_state=0)
vector_xtrain_us, ytrain_us = unders.fit_resample(X=vector_xtrain, y=ytrain)

In [66]:
%%time
# Fit the model
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 172 ms
Wall time: 344 ms


In [67]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 7.98 ms


In [68]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='400_us.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.95      0.91      0.93       353
  database_admin       0.91      0.97      0.94       215
          deploy       0.93      0.97      0.95       606
   elasticsearch       0.98      0.94      0.96       434
          fastly       0.99      0.98      0.98       500
           redis       0.93      0.89      0.91       405
       resources       0.98      0.96      0.97       385
        sendgrid       0.94      0.97      0.95       183
upgrade_services       0.95      0.94      0.95       272
          upsize       0.98      0.99      0.99       432

        accuracy                           0.95      3785
       macro avg       0.95      0.95      0.95      3785
    weighted avg       0.95      0.95      0.95      3785



## 500 features

In [69]:
# Number of top-ranked words to keep as features
n_features = 500

# A word is usable only if it is a column of both the train and test matrices
shared_cols = set(xtrain_cols) & set(xtest_cols)

# Walk the word list from the top, dropping repeated entries and anything that
# is not a usable column, then keep the first n_features survivors. Filtering
# before truncating yields exactly n_features columns without over-fetching
# from the list (e.g. head(501)) to make up for an unusable entry.
ranked_words = word_importance['Word'].drop_duplicates()
words = ranked_words[ranked_words.isin(shared_cols)].head(n_features).tolist()

# Sets of the selected words (identical for train and test by construction)
xtrain_set = set(words)
xtest_set = set(words)

# Ordered column lists. Both follow the order of the word list rather than set
# iteration order, so train and test columns always line up and the order is
# the same on every kernel run.
xtrain_list = list(words)
xtest_list = list(words)

# Filter xtrain and xtest to the selected columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [70]:
# Show full vs. filtered column counts for the train and test matrices
print(
    f'vector_xtr columns: {vector_xtr.shape[1]}',
    f'vector_xtrain columns: {vector_xtrain.shape[1]}',
    '',
    f'vector_xte columns: {vector_xte.shape[1]}',
    f'vector_xtest columns: {vector_xtest.shape[1]}',
    sep='\n',
)

vector_xtr columns: 7498
vector_xtrain columns: 500

vector_xte columns: 7498
vector_xtest columns: 500


### NO SAMPLING ADJUSTMENT

In [71]:
%%time
# Fit the model
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 531 ms
Wall time: 760 ms


In [72]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 9.97 ms


In [73]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='500_ns.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.96      0.92      0.94       353
  database_admin       0.96      0.94      0.95       215
          deploy       0.92      0.99      0.95       606
   elasticsearch       0.98      0.96      0.97       434
          fastly       0.98      0.99      0.99       500
           redis       0.94      0.90      0.92       405
       resources       0.98      0.97      0.98       385
        sendgrid       0.99      0.95      0.97       183
upgrade_services       0.96      0.95      0.96       272
          upsize       0.98      1.00      0.99       432

        accuracy                           0.96      3785
       macro avg       0.97      0.96      0.96      3785
    weighted avg       0.96      0.96      0.96      3785



### OVERSAMPLING

In [74]:
# Randomly oversample the filtered training data (fixed seed); the test set is not resampled
overs = RandomOverSampler(random_state=0)
vector_xtrain_os, ytrain_os = overs.fit_resample(X=vector_xtrain, y=ytrain)

In [75]:
%%time
# Fit the model
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 953 ms
Wall time: 1.21 s


In [76]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 11.8 ms


In [77]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='500_os.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.93      0.93      0.93       353
  database_admin       0.93      0.97      0.95       215
          deploy       0.94      0.97      0.96       606
   elasticsearch       0.98      0.94      0.96       434
          fastly       0.99      0.99      0.99       500
           redis       0.94      0.91      0.92       405
       resources       0.98      0.96      0.97       385
        sendgrid       0.96      0.96      0.96       183
upgrade_services       0.96      0.96      0.96       272
          upsize       0.97      1.00      0.98       432

        accuracy                           0.96      3785
       macro avg       0.96      0.96      0.96      3785
    weighted avg       0.96      0.96      0.96      3785



### UNDERSAMPLING

In [78]:
# Randomly undersample the filtered training data (fixed seed); the test set is not resampled
unders = RandomUnderSampler(random_state=0)
vector_xtrain_us, ytrain_us = unders.fit_resample(X=vector_xtrain, y=ytrain)

In [79]:
%%time
# Fit the model
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 297 ms
Wall time: 382 ms


In [80]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 10.2 ms


In [81]:
# Wrap the predictions in a DataFrame and write them to CSV without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='500_us.csv', index=False)

# Print the classification report of the predictions against the held-out labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.94      0.90      0.92       353
  database_admin       0.92      0.98      0.95       215
          deploy       0.93      0.98      0.95       606
   elasticsearch       0.98      0.95      0.96       434
          fastly       0.99      0.98      0.98       500
           redis       0.92      0.89      0.90       405
       resources       0.98      0.96      0.97       385
        sendgrid       0.94      0.96      0.95       183
upgrade_services       0.96      0.94      0.95       272
          upsize       0.97      1.00      0.98       432

        accuracy                           0.95      3785
       macro avg       0.95      0.95      0.95      3785
    weighted avg       0.95      0.95      0.95      3785



## 600 features

In [82]:
# Number of top-ranked words to keep as features
n_features = 600

# A word is usable only if it is a column of both the train and test matrices
shared_cols = set(xtrain_cols) & set(xtest_cols)

# Walk the word list from the top, dropping repeated entries and anything that
# is not a usable column, then keep the first n_features survivors. Filtering
# before truncating yields exactly n_features columns without over-fetching
# from the list (e.g. head(601)) to make up for an unusable entry.
ranked_words = word_importance['Word'].drop_duplicates()
words = ranked_words[ranked_words.isin(shared_cols)].head(n_features).tolist()

# Sets of the selected words (identical for train and test by construction)
xtrain_set = set(words)
xtest_set = set(words)

# Ordered column lists. Both follow the order of the word list rather than set
# iteration order, so train and test columns always line up and the order is
# the same on every kernel run.
xtrain_list = list(words)
xtest_list = list(words)

# Filter xtrain and xtest to the selected columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [83]:
# Training matrix: full vocabulary width, then width after the top-word filter
print(f'vector_xtr columns: {vector_xtr.shape[1]}')
print(f'vector_xtrain columns: {vector_xtrain.shape[1]}', end='\n\n')
# Test matrix: same comparison
print(f'vector_xte columns: {vector_xte.shape[1]}')
print(f'vector_xtest columns: {vector_xtest.shape[1]}')

vector_xtr columns: 7498
vector_xtrain columns: 600

vector_xte columns: 7498
vector_xtest columns: 600


### NO SAMPLING ADJUSTMENT

In [84]:
%%time
# Baseline run for the 600-column subset: train the shared `mdl` estimator on
# the filtered training matrix with the original, unresampled labels.
# %%time has to stay on the first line so the fit is timed.
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 547 ms
Wall time: 785 ms


In [85]:
%%time
# Predict labels for the 600-column test matrix with the model fitted in the
# previous cell; `preds` is reassigned by every experiment in this notebook.
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 10.6 ms


In [86]:
# Wrap this run's predictions in a DataFrame and write them out without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='600_ns.csv', index=False)

# Per-class precision / recall / F1 against the held-out test labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.95      0.92      0.93       353
  database_admin       0.95      0.93      0.94       215
          deploy       0.92      0.99      0.95       606
   elasticsearch       0.98      0.96      0.97       434
          fastly       0.98      0.99      0.99       500
           redis       0.94      0.90      0.92       405
       resources       0.98      0.97      0.98       385
        sendgrid       0.98      0.94      0.96       183
upgrade_services       0.96      0.94      0.95       272
          upsize       0.98      1.00      0.99       432

        accuracy                           0.96      3785
       macro avg       0.96      0.95      0.96      3785
    weighted avg       0.96      0.96      0.96      3785



### OVERSAMPLING

In [87]:
# Create OS sets: random over-sampling of the 600-column training set (seeded)
overs = RandomOverSampler(random_state=0)
vector_xtrain_os, ytrain_os = overs.fit_resample(X=vector_xtrain, y=ytrain)

In [88]:
%%time
# Train the shared `mdl` estimator on the over-sampled version of the
# 600-column training set built in the previous cell.
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 1.02 s
Wall time: 1.29 s


In [89]:
%%time
# Predict labels for the 600-column test matrix with the model trained on the
# over-sampled data. The test matrix is used as-is (only training data is resampled).
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 11 ms


In [90]:
# Wrap this run's predictions in a DataFrame and write them out without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='600_os.csv', index=False)

# Per-class precision / recall / F1 against the held-out test labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.95      0.92      0.93       353
  database_admin       0.93      0.96      0.95       215
          deploy       0.94      0.98      0.96       606
   elasticsearch       0.98      0.94      0.96       434
          fastly       0.99      0.99      0.99       500
           redis       0.93      0.92      0.93       405
       resources       0.97      0.96      0.97       385
        sendgrid       0.96      0.96      0.96       183
upgrade_services       0.96      0.95      0.96       272
          upsize       0.97      1.00      0.98       432

        accuracy                           0.96      3785
       macro avg       0.96      0.96      0.96      3785
    weighted avg       0.96      0.96      0.96      3785



### UNDERSAMPLING

In [91]:
# Create US sets: random under-sampling of the 600-column training set (seeded)
unders = RandomUnderSampler(random_state=0)
vector_xtrain_us, ytrain_us = unders.fit_resample(X=vector_xtrain, y=ytrain)

In [92]:
%%time
# Train the shared `mdl` estimator on the under-sampled version of the
# 600-column training set built in the previous cell.
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 281 ms
Wall time: 399 ms


In [93]:
%%time
# Predict labels for the 600-column test matrix with the model trained on the
# under-sampled data. The test matrix is used as-is (only training data is resampled).
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 12 ms


In [94]:
# Wrap this run's predictions in a DataFrame and write them out without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='600_us.csv', index=False)

# Per-class precision / recall / F1 against the held-out test labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.95      0.90      0.93       353
  database_admin       0.91      0.97      0.94       215
          deploy       0.93      0.98      0.96       606
   elasticsearch       0.98      0.94      0.96       434
          fastly       0.99      0.98      0.99       500
           redis       0.92      0.90      0.91       405
       resources       0.97      0.96      0.96       385
        sendgrid       0.94      0.96      0.95       183
upgrade_services       0.95      0.94      0.95       272
          upsize       0.97      0.99      0.98       432

        accuracy                           0.96      3785
       macro avg       0.95      0.95      0.95      3785
    weighted avg       0.96      0.96      0.96      3785



## 700 features

In [95]:
# Create list of top x words, in the order they appear in word_importance
# NOTE(review): head(701) produces the 700 columns reported by the next cell, so
# one of these entries is presumably not a vectorizer column - confirm.
words = list(word_importance['Word'].head(701))

# Words that are present as columns in the train / test matrices
xtrain_set = set(words) & set(xtrain_cols)
xtest_set = set(words) & set(xtest_cols)

# Build ONE ordered column list and use it for both matrices.
# list(set) was used here before: iterating a set of strings gives an order that
# changes between kernel restarts (string hash randomisation), so the column
# order fed to the model was not reproducible, and train and test columns were
# chosen independently of each other. dict.fromkeys() drops duplicates while
# keeping the order of `words`.
xtrain_list = [word for word in dict.fromkeys(words)
               if word in xtrain_set and word in xtest_set]

# Same columns, same order, for the test matrix
xtest_list = list(xtrain_list)

# Filter xtrain and xtest to relevant columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [96]:
# Training matrix: full vocabulary width, then width after the top-word filter
print(f'vector_xtr columns: {vector_xtr.shape[1]}')
print(f'vector_xtrain columns: {vector_xtrain.shape[1]}', end='\n\n')
# Test matrix: same comparison
print(f'vector_xte columns: {vector_xte.shape[1]}')
print(f'vector_xtest columns: {vector_xtest.shape[1]}')

vector_xtr columns: 7498
vector_xtrain columns: 700

vector_xte columns: 7498
vector_xtest columns: 700


### NO SAMPLING ADJUSTMENT

In [97]:
%%time
# Baseline run for the 700-column subset: train the shared `mdl` estimator on
# the filtered training matrix with the original, unresampled labels.
# %%time has to stay on the first line so the fit is timed.
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 625 ms
Wall time: 837 ms


In [98]:
%%time
# Predict labels for the 700-column test matrix with the model fitted in the
# previous cell; `preds` is reassigned by every experiment in this notebook.
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 11.6 ms


In [99]:
# Wrap this run's predictions in a DataFrame and write them out without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='700_ns.csv', index=False)

# Per-class precision / recall / F1 against the held-out test labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.94      0.92      0.93       353
  database_admin       0.96      0.93      0.95       215
          deploy       0.92      0.99      0.95       606
   elasticsearch       0.98      0.95      0.97       434
          fastly       0.98      0.99      0.99       500
           redis       0.94      0.91      0.92       405
       resources       0.98      0.97      0.98       385
        sendgrid       0.98      0.92      0.95       183
upgrade_services       0.96      0.94      0.95       272
          upsize       0.98      1.00      0.99       432

        accuracy                           0.96      3785
       macro avg       0.96      0.95      0.96      3785
    weighted avg       0.96      0.96      0.96      3785



### OVERSAMPLING

In [100]:
# Create OS sets: random over-sampling of the 700-column training set (seeded)
overs = RandomOverSampler(random_state=0)
vector_xtrain_os, ytrain_os = overs.fit_resample(X=vector_xtrain, y=ytrain)

In [101]:
%%time
# Train the shared `mdl` estimator on the over-sampled version of the
# 700-column training set built in the previous cell.
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 1.14 s
Wall time: 1.38 s


In [102]:
%%time
# Predict labels for the 700-column test matrix with the model trained on the
# over-sampled data. The test matrix is used as-is (only training data is resampled).
preds = mdl.predict(vector_xtest)

CPU times: total: 15.6 ms
Wall time: 13.2 ms


In [103]:
# Wrap this run's predictions in a DataFrame and write them out without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='700_os.csv', index=False)

# Per-class precision / recall / F1 against the held-out test labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.94      0.91      0.92       353
  database_admin       0.95      0.96      0.96       215
          deploy       0.94      0.97      0.96       606
   elasticsearch       0.98      0.94      0.96       434
          fastly       0.99      0.98      0.99       500
           redis       0.92      0.92      0.92       405
       resources       0.98      0.97      0.97       385
        sendgrid       0.95      0.97      0.96       183
upgrade_services       0.95      0.95      0.95       272
          upsize       0.98      1.00      0.99       432

        accuracy                           0.96      3785
       macro avg       0.96      0.96      0.96      3785
    weighted avg       0.96      0.96      0.96      3785



### UNDERSAMPLING

In [104]:
# Create US sets: random under-sampling of the 700-column training set (seeded)
unders = RandomUnderSampler(random_state=0)
vector_xtrain_us, ytrain_us = unders.fit_resample(X=vector_xtrain, y=ytrain)

In [105]:
%%time
# Train the shared `mdl` estimator on the under-sampled version of the
# 700-column training set built in the previous cell.
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 297 ms
Wall time: 423 ms


In [106]:
%%time
# Predict labels for the 700-column test matrix with the model trained on the
# under-sampled data. The test matrix is used as-is (only training data is resampled).
preds = mdl.predict(vector_xtest)

CPU times: total: 15.6 ms
Wall time: 18.9 ms


In [107]:
# Wrap this run's predictions in a DataFrame and write them out without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='700_us.csv', index=False)

# Per-class precision / recall / F1 against the held-out test labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.94      0.90      0.92       353
  database_admin       0.92      0.98      0.95       215
          deploy       0.93      0.98      0.95       606
   elasticsearch       0.98      0.94      0.96       434
          fastly       0.99      0.98      0.98       500
           redis       0.91      0.89      0.90       405
       resources       0.97      0.95      0.96       385
        sendgrid       0.94      0.96      0.95       183
upgrade_services       0.94      0.94      0.94       272
          upsize       0.97      1.00      0.98       432

        accuracy                           0.95      3785
       macro avg       0.95      0.95      0.95      3785
    weighted avg       0.95      0.95      0.95      3785



## 800 features

In [108]:
# Create list of top x words, in the order they appear in word_importance
# NOTE(review): head(801) produces the 800 columns reported by the next cell, so
# one of these entries is presumably not a vectorizer column - confirm.
words = list(word_importance['Word'].head(801))

# Words that are present as columns in the train / test matrices
xtrain_set = set(words) & set(xtrain_cols)
xtest_set = set(words) & set(xtest_cols)

# Build ONE ordered column list and use it for both matrices.
# list(set) was used here before: iterating a set of strings gives an order that
# changes between kernel restarts (string hash randomisation), so the column
# order fed to the model was not reproducible, and train and test columns were
# chosen independently of each other. dict.fromkeys() drops duplicates while
# keeping the order of `words`.
xtrain_list = [word for word in dict.fromkeys(words)
               if word in xtrain_set and word in xtest_set]

# Same columns, same order, for the test matrix
xtest_list = list(xtrain_list)

# Filter xtrain and xtest to relevant columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [109]:
# Training matrix: full vocabulary width, then width after the top-word filter
print(f'vector_xtr columns: {vector_xtr.shape[1]}')
print(f'vector_xtrain columns: {vector_xtrain.shape[1]}', end='\n\n')
# Test matrix: same comparison
print(f'vector_xte columns: {vector_xte.shape[1]}')
print(f'vector_xtest columns: {vector_xtest.shape[1]}')

vector_xtr columns: 7498
vector_xtrain columns: 800

vector_xte columns: 7498
vector_xtest columns: 800


### NO SAMPLING ADJUSTMENT

In [110]:
%%time
# Baseline run for the 800-column subset: train the shared `mdl` estimator on
# the filtered training matrix with the original, unresampled labels.
# %%time has to stay on the first line so the fit is timed.
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 750 ms
Wall time: 899 ms


In [111]:
%%time
# Predict labels for the 800-column test matrix with the model fitted in the
# previous cell; `preds` is reassigned by every experiment in this notebook.
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 18.4 ms


In [112]:
# Wrap this run's predictions in a DataFrame and write them out without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='800_ns.csv', index=False)

# Per-class precision / recall / F1 against the held-out test labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.95      0.93      0.94       353
  database_admin       0.95      0.93      0.94       215
          deploy       0.92      0.99      0.95       606
   elasticsearch       0.97      0.95      0.96       434
          fastly       0.98      0.99      0.98       500
           redis       0.94      0.90      0.92       405
       resources       0.98      0.97      0.97       385
        sendgrid       0.98      0.92      0.95       183
upgrade_services       0.97      0.94      0.96       272
          upsize       0.98      1.00      0.99       432

        accuracy                           0.96      3785
       macro avg       0.96      0.95      0.96      3785
    weighted avg       0.96      0.96      0.96      3785



### OVERSAMPLING

In [113]:
# Create OS sets: random over-sampling of the 800-column training set (seeded)
overs = RandomOverSampler(random_state=0)
vector_xtrain_os, ytrain_os = overs.fit_resample(X=vector_xtrain, y=ytrain)

In [114]:
%%time
# Train the shared `mdl` estimator on the over-sampled version of the
# 800-column training set built in the previous cell.
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 1.23 s
Wall time: 1.5 s


In [115]:
%%time
# Predict labels for the 800-column test matrix with the model trained on the
# over-sampled data. The test matrix is used as-is (only training data is resampled).
preds = mdl.predict(vector_xtest)

CPU times: total: 15.6 ms
Wall time: 17.9 ms


In [116]:
# Wrap this run's predictions in a DataFrame and write them out without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='800_os.csv', index=False)

# Per-class precision / recall / F1 against the held-out test labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.94      0.92      0.93       353
  database_admin       0.93      0.96      0.94       215
          deploy       0.94      0.97      0.95       606
   elasticsearch       0.97      0.94      0.95       434
          fastly       0.99      0.99      0.99       500
           redis       0.94      0.91      0.92       405
       resources       0.97      0.96      0.97       385
        sendgrid       0.95      0.96      0.95       183
upgrade_services       0.95      0.95      0.95       272
          upsize       0.98      1.00      0.99       432

        accuracy                           0.96      3785
       macro avg       0.96      0.95      0.95      3785
    weighted avg       0.96      0.96      0.96      3785



### UNDERSAMPLING

In [117]:
# Create US sets: random under-sampling of the 800-column training set (seeded)
unders = RandomUnderSampler(random_state=0)
vector_xtrain_us, ytrain_us = unders.fit_resample(X=vector_xtrain, y=ytrain)

In [118]:
%%time
# Train the shared `mdl` estimator on the under-sampled version of the
# 800-column training set built in the previous cell.
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 344 ms
Wall time: 427 ms


In [119]:
%%time
# Predict labels for the 800-column test matrix with the model trained on the
# under-sampled data. The test matrix is used as-is (only training data is resampled).
preds = mdl.predict(vector_xtest)

CPU times: total: 15.6 ms
Wall time: 15.2 ms


In [120]:
# Wrap this run's predictions in a DataFrame and write them out without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='800_us.csv', index=False)

# Per-class precision / recall / F1 against the held-out test labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.94      0.89      0.92       353
  database_admin       0.92      0.97      0.94       215
          deploy       0.93      0.98      0.95       606
   elasticsearch       0.98      0.93      0.95       434
          fastly       0.99      0.98      0.98       500
           redis       0.91      0.89      0.90       405
       resources       0.97      0.95      0.96       385
        sendgrid       0.95      0.96      0.95       183
upgrade_services       0.95      0.94      0.94       272
          upsize       0.97      1.00      0.98       432

        accuracy                           0.95      3785
       macro avg       0.95      0.95      0.95      3785
    weighted avg       0.95      0.95      0.95      3785



## 900 features

In [121]:
# Create list of top x words, in the order they appear in word_importance
# NOTE(review): head(901) produces the 900 columns reported by the next cell, so
# one of these entries is presumably not a vectorizer column - confirm.
words = list(word_importance['Word'].head(901))

# Words that are present as columns in the train / test matrices
xtrain_set = set(words) & set(xtrain_cols)
xtest_set = set(words) & set(xtest_cols)

# Build ONE ordered column list and use it for both matrices.
# list(set) was used here before: iterating a set of strings gives an order that
# changes between kernel restarts (string hash randomisation), so the column
# order fed to the model was not reproducible, and train and test columns were
# chosen independently of each other. dict.fromkeys() drops duplicates while
# keeping the order of `words`.
xtrain_list = [word for word in dict.fromkeys(words)
               if word in xtrain_set and word in xtest_set]

# Same columns, same order, for the test matrix
xtest_list = list(xtrain_list)

# Filter xtrain and xtest to relevant columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [122]:
# Training matrix: full vocabulary width, then width after the top-word filter
print(f'vector_xtr columns: {vector_xtr.shape[1]}')
print(f'vector_xtrain columns: {vector_xtrain.shape[1]}', end='\n\n')
# Test matrix: same comparison
print(f'vector_xte columns: {vector_xte.shape[1]}')
print(f'vector_xtest columns: {vector_xtest.shape[1]}')

vector_xtr columns: 7498
vector_xtrain columns: 900

vector_xte columns: 7498
vector_xtest columns: 900


### NO SAMPLING ADJUSTMENT

In [123]:
%%time
# Baseline run for the 900-column subset: train the shared `mdl` estimator on
# the filtered training matrix with the original, unresampled labels.
# %%time has to stay on the first line so the fit is timed.
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 766 ms
Wall time: 867 ms


In [124]:
%%time
# Predict labels for the 900-column test matrix with the model fitted in the
# previous cell; `preds` is reassigned by every experiment in this notebook.
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 15.2 ms


In [125]:
# Wrap this run's predictions in a DataFrame and write them out without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='900_ns.csv', index=False)

# Per-class precision / recall / F1 against the held-out test labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.94      0.92      0.93       353
  database_admin       0.95      0.93      0.94       215
          deploy       0.91      0.99      0.95       606
   elasticsearch       0.97      0.95      0.96       434
          fastly       0.98      0.99      0.98       500
           redis       0.94      0.90      0.92       405
       resources       0.98      0.96      0.97       385
        sendgrid       0.97      0.93      0.95       183
upgrade_services       0.97      0.94      0.96       272
          upsize       0.98      1.00      0.99       432

        accuracy                           0.96      3785
       macro avg       0.96      0.95      0.95      3785
    weighted avg       0.96      0.96      0.96      3785



### OVERSAMPLING

In [126]:
# Create OS sets: random over-sampling of the 900-column training set (seeded)
overs = RandomOverSampler(random_state=0)
vector_xtrain_os, ytrain_os = overs.fit_resample(X=vector_xtrain, y=ytrain)

In [127]:
%%time
# Train the shared `mdl` estimator on the over-sampled version of the
# 900-column training set built in the previous cell.
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 1.38 s
Wall time: 1.67 s


In [128]:
%%time
# Predict labels for the 900-column test matrix with the model trained on the
# over-sampled data. The test matrix is used as-is (only training data is resampled).
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 18 ms


In [129]:
# Wrap this run's predictions in a DataFrame and write them out without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='900_os.csv', index=False)

# Per-class precision / recall / F1 against the held-out test labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.94      0.90      0.92       353
  database_admin       0.94      0.96      0.95       215
          deploy       0.93      0.97      0.95       606
   elasticsearch       0.97      0.94      0.96       434
          fastly       0.99      0.99      0.99       500
           redis       0.93      0.91      0.92       405
       resources       0.97      0.97      0.97       385
        sendgrid       0.95      0.96      0.95       183
upgrade_services       0.95      0.94      0.95       272
          upsize       0.98      1.00      0.99       432

        accuracy                           0.96      3785
       macro avg       0.95      0.95      0.95      3785
    weighted avg       0.96      0.96      0.96      3785



### UNDERSAMPLING

In [130]:
# Create US sets: random under-sampling of the 900-column training set (seeded)
unders = RandomUnderSampler(random_state=0)
vector_xtrain_us, ytrain_us = unders.fit_resample(X=vector_xtrain, y=ytrain)

In [131]:
%%time
# Train the shared `mdl` estimator on the under-sampled version of the
# 900-column training set built in the previous cell.
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 312 ms
Wall time: 460 ms


In [132]:
%%time
# Predict labels for the 900-column test matrix with the model trained on the
# under-sampled data. The test matrix is used as-is (only training data is resampled).
preds = mdl.predict(vector_xtest)

CPU times: total: 0 ns
Wall time: 23.9 ms


In [133]:
# Wrap this run's predictions in a DataFrame and write them out without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='900_us.csv', index=False)

# Per-class precision / recall / F1 against the held-out test labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.94      0.89      0.92       353
  database_admin       0.92      0.97      0.95       215
          deploy       0.92      0.97      0.95       606
   elasticsearch       0.98      0.93      0.95       434
          fastly       0.99      0.98      0.98       500
           redis       0.91      0.89      0.90       405
       resources       0.96      0.95      0.96       385
        sendgrid       0.95      0.96      0.95       183
upgrade_services       0.94      0.94      0.94       272
          upsize       0.97      1.00      0.98       432

        accuracy                           0.95      3785
       macro avg       0.95      0.95      0.95      3785
    weighted avg       0.95      0.95      0.95      3785



## 1000 features

In [134]:
# Create list of top x words, in the order they appear in word_importance
# NOTE(review): head(1001) produces the 1000 columns reported by the next cell,
# so one of these entries is presumably not a vectorizer column - confirm.
words = list(word_importance['Word'].head(1001))

# Words that are present as columns in the train / test matrices
xtrain_set = set(words) & set(xtrain_cols)
xtest_set = set(words) & set(xtest_cols)

# Build ONE ordered column list and use it for both matrices.
# list(set) was used here before: iterating a set of strings gives an order that
# changes between kernel restarts (string hash randomisation), so the column
# order fed to the model was not reproducible, and train and test columns were
# chosen independently of each other. dict.fromkeys() drops duplicates while
# keeping the order of `words`.
xtrain_list = [word for word in dict.fromkeys(words)
               if word in xtrain_set and word in xtest_set]

# Same columns, same order, for the test matrix
xtest_list = list(xtrain_list)

# Filter xtrain and xtest to relevant columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [135]:
# Training matrix: full vocabulary width, then width after the top-word filter
print(f'vector_xtr columns: {vector_xtr.shape[1]}')
print(f'vector_xtrain columns: {vector_xtrain.shape[1]}', end='\n\n')
# Test matrix: same comparison
print(f'vector_xte columns: {vector_xte.shape[1]}')
print(f'vector_xtest columns: {vector_xtest.shape[1]}')

vector_xtr columns: 7498
vector_xtrain columns: 1000

vector_xte columns: 7498
vector_xtest columns: 1000


### NO SAMPLING ADJUSTMENT

In [136]:
%%time
# Baseline run for the 1000-column subset: train the shared `mdl` estimator on
# the filtered training matrix with the original, unresampled labels.
# %%time has to stay on the first line so the fit is timed.
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 812 ms
Wall time: 871 ms


In [137]:
%%time
# Predict labels for the 1000-column test matrix with the model fitted in the
# previous cell; `preds` is reassigned by every experiment in this notebook.
preds = mdl.predict(vector_xtest)

CPU times: total: 15.6 ms
Wall time: 18.6 ms


In [138]:
# Wrap this run's predictions in a DataFrame and write them out without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='1000_ns.csv', index=False)

# Per-class precision / recall / F1 against the held-out test labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.95      0.92      0.93       353
  database_admin       0.96      0.94      0.95       215
          deploy       0.91      0.99      0.95       606
   elasticsearch       0.97      0.95      0.96       434
          fastly       0.98      0.99      0.98       500
           redis       0.94      0.89      0.91       405
       resources       0.98      0.96      0.97       385
        sendgrid       0.97      0.93      0.95       183
upgrade_services       0.97      0.94      0.96       272
          upsize       0.98      1.00      0.99       432

        accuracy                           0.96      3785
       macro avg       0.96      0.95      0.96      3785
    weighted avg       0.96      0.96      0.96      3785



### OVERSAMPLING

In [139]:
# Create OS sets: random over-sampling of the 1000-column training set (seeded)
overs = RandomOverSampler(random_state=0)
vector_xtrain_os, ytrain_os = overs.fit_resample(X=vector_xtrain, y=ytrain)

In [140]:
%%time
# Train the shared `mdl` estimator on the over-sampled version of the
# 1000-column training set built in the previous cell.
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 1.41 s
Wall time: 1.78 s


In [141]:
%%time
# Predict labels for the 1000-column test matrix with the model trained on the
# over-sampled data. The test matrix is used as-is (only training data is resampled).
preds = mdl.predict(vector_xtest)

CPU times: total: 15.6 ms
Wall time: 46.1 ms


In [142]:
# Wrap this run's predictions in a DataFrame and write them out without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='1000_os.csv', index=False)

# Per-class precision / recall / F1 against the held-out test labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.94      0.90      0.92       353
  database_admin       0.93      0.96      0.95       215
          deploy       0.93      0.98      0.95       606
   elasticsearch       0.97      0.94      0.95       434
          fastly       0.98      0.99      0.98       500
           redis       0.93      0.91      0.92       405
       resources       0.97      0.96      0.97       385
        sendgrid       0.95      0.96      0.95       183
upgrade_services       0.95      0.94      0.95       272
          upsize       0.98      1.00      0.99       432

        accuracy                           0.96      3785
       macro avg       0.95      0.95      0.95      3785
    weighted avg       0.96      0.96      0.96      3785



### UNDERSAMPLING

In [143]:
# Create US sets: random under-sampling of the 1000-column training set (seeded)
unders = RandomUnderSampler(random_state=0)
vector_xtrain_us, ytrain_us = unders.fit_resample(X=vector_xtrain, y=ytrain)

In [144]:
%%time
# Train the shared `mdl` estimator on the under-sampled version of the
# 1000-column training set built in the previous cell.
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 359 ms
Wall time: 592 ms


In [145]:
%%time
# Predict labels for the 1000-column test matrix with the model trained on the
# under-sampled data. The test matrix is used as-is (only training data is resampled).
preds = mdl.predict(vector_xtest)

CPU times: total: 15.6 ms
Wall time: 23.8 ms


In [146]:
# Wrap this run's predictions in a DataFrame and write them out without the index
preds2 = pd.DataFrame(data=preds)
preds2.to_csv(path_or_buf='1000_us.csv', index=False)

# Per-class precision / recall / F1 against the held-out test labels
print(classification_report(y_true=ytest, y_pred=preds))

                  precision    recall  f1-score   support

        database       0.94      0.89      0.91       353
  database_admin       0.91      0.97      0.94       215
          deploy       0.92      0.97      0.95       606
   elasticsearch       0.98      0.93      0.95       434
          fastly       0.99      0.98      0.98       500
           redis       0.91      0.89      0.90       405
       resources       0.96      0.95      0.96       385
        sendgrid       0.96      0.96      0.96       183
upgrade_services       0.94      0.94      0.94       272
          upsize       0.97      1.00      0.99       432

        accuracy                           0.95      3785
       macro avg       0.95      0.95      0.95      3785
    weighted avg       0.95      0.95      0.95      3785

