# Word Count Test

In [147]:
# Data Loading
import pandas as pd
import numpy as np

# Data Splitting and Resampling
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Reports
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Model
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.neural_network import MLPClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import LinearSVC

# Get valid model
# mdl = DecisionTreeClassifier()
# mdl = MLPClassifier()
# mdl = LogisticRegression(multi_class = 'multinomial')
# mdl = MultinomialNB()
mdl = RandomForestClassifier()
# mdl = LinearSVC()

# DATA PREP

In [148]:
# Read dataset
df = pd.read_excel('input_og_categories.xlsx')

# Drop any rows with null values
df = df.dropna()

# Clarify independent and dependent variables
x = df['text']
y = df['type']

# Split datasets into training and testing sets
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.2, random_state = 0)

# Export ytest
ytest.to_csv('ytest.csv', index=False)

# Define vectorizer
vectorizer = TfidfVectorizer()

# Vectorize xtrain
vector_xtrain = vectorizer.fit_transform(xtrain)

# Store xtrain feature names
feats_xtr = vectorizer.get_feature_names_out()

# Vectorize xtest
vector_xtest = vectorizer.transform(xtest)

# Store xtest feature names
feats_xte = vectorizer.get_feature_names_out()

# Create dataframes of xtrain and xtest
vector_xtr = pd.DataFrame.sparse.from_spmatrix(vector_xtrain, columns = feats_xtr)
vector_xte = pd.DataFrame.sparse.from_spmatrix(vector_xtest, columns = feats_xte)

# Convert xtrain and xtest to dense values
vector_xtr = vector_xtr.sparse.to_dense().astype(np.float64)
vector_xte = vector_xte.sparse.to_dense().astype(np.float64)

# WORD COUNTS

In [149]:
# Retrieve word importance list
word_importance = pd.read_excel('word_importance_lists.xlsx')

# Create list of xtrain columns
xtrain_cols = list(vector_xtr.columns)

# Create list of xtest columns
xtest_cols = list(vector_xte.columns)

## 50 features

In [150]:
# Create list of top x words
words = list(word_importance['word'].head(51))

# Create set of xtrain words
xtrain_set = set(words) & set(xtrain_cols)

# Convert set to list
xtrain_list = list(xtrain_set)

# Create set of xtest columns
xtest_set = set(words) & set(xtest_cols)

# Convert set to list
xtest_list = list(xtest_set)

# Filter xtrain and xtest to relevant columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [151]:
print(f'vector_xtr columns: {len(vector_xtr.columns)}')
print(f'vector_xtrain columns: {len(vector_xtrain.columns)}')
print()
print(f'vector_xte columns: {len(vector_xte.columns)}')
print(f'vector_xtest columns: {len(vector_xtest.columns)}')

vector_xtr columns: 7498
vector_xtrain columns: 50

vector_xte columns: 7498
vector_xtest columns: 50


### NO SAMPLING ADJUSTMENT

In [152]:
%%time
# Fit the model
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 7.05 s
Wall time: 9.07 s


In [153]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 46.9 ms
Wall time: 95 ms


In [154]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('50_ns.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.87      0.88      0.87       525
          deploy       0.95      0.97      0.96       901
   elasticsearch       0.84      0.86      0.85       268
          fastly       0.98      0.99      0.98       552
           redis       0.80      0.76      0.78       173
       resources       0.88      0.83      0.86       379
        sendgrid       0.96      0.96      0.96       137
upgrade_services       0.89      0.89      0.89       355
          upsize       0.96      0.94      0.95       495

        accuracy                           0.92      3785
       macro avg       0.90      0.90      0.90      3785
    weighted avg       0.92      0.92      0.92      3785



### OVERSAMPLING

In [155]:
# Create OS sets
overs = RandomOverSampler(random_state = 0)
vector_xtrain_os, ytrain_os = overs.fit_resample(vector_xtrain, ytrain)

In [156]:
%%time
# Fit the model
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 12.6 s
Wall time: 15.1 s


In [157]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 62.5 ms
Wall time: 126 ms


In [158]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('50_os.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.88      0.89      0.88       525
          deploy       0.96      0.96      0.96       901
   elasticsearch       0.84      0.89      0.86       268
          fastly       0.98      0.99      0.98       552
           redis       0.75      0.77      0.76       173
       resources       0.88      0.85      0.86       379
        sendgrid       0.96      0.95      0.96       137
upgrade_services       0.90      0.88      0.89       355
          upsize       0.96      0.94      0.95       495

        accuracy                           0.92      3785
       macro avg       0.90      0.90      0.90      3785
    weighted avg       0.92      0.92      0.92      3785



### UNDERSAMPLING

In [159]:
# Create OS sets
unders = RandomUnderSampler(random_state = 0)
vector_xtrain_us, ytrain_us = unders.fit_resample(vector_xtrain, ytrain)

In [160]:
%%time
# Fit the model
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 2.17 s
Wall time: 2.52 s


In [161]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 62.5 ms
Wall time: 83.8 ms


In [162]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('50_us.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.91      0.79      0.85       525
          deploy       0.96      0.92      0.94       901
   elasticsearch       0.79      0.88      0.83       268
          fastly       0.97      0.98      0.98       552
           redis       0.60      0.86      0.70       173
       resources       0.82      0.86      0.84       379
        sendgrid       0.92      0.96      0.94       137
upgrade_services       0.88      0.86      0.87       355
          upsize       0.97      0.93      0.95       495

        accuracy                           0.90      3785
       macro avg       0.87      0.89      0.88      3785
    weighted avg       0.90      0.90      0.90      3785



## 100 features

In [163]:
# Create list of top x words
words = list(word_importance['word'].head(101))

# Create set of xtrain words
xtrain_set = set(words) & set(xtrain_cols)

# Convert set to list
xtrain_list = list(xtrain_set)

# Create set of xtest columns
xtest_set = set(words) & set(xtest_cols)

# Convert set to list
xtest_list = list(xtest_set)

# Filter xtrain and xtest to relevant columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [164]:
print(f'vector_xtr columns: {len(vector_xtr.columns)}')
print(f'vector_xtrain columns: {len(vector_xtrain.columns)}')
print()
print(f'vector_xte columns: {len(vector_xte.columns)}')
print(f'vector_xtest columns: {len(vector_xtest.columns)}')

vector_xtr columns: 7498
vector_xtrain columns: 100

vector_xte columns: 7498
vector_xtest columns: 100


### NO SAMPLING ADJUSTMENT

In [165]:
%%time
# Fit the model
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 10.5 s
Wall time: 12.2 s


In [166]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 62.5 ms
Wall time: 107 ms


In [167]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('100_ns.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.87      0.89      0.88       525
          deploy       0.95      0.98      0.96       901
   elasticsearch       0.86      0.85      0.86       268
          fastly       0.97      0.99      0.98       552
           redis       0.83      0.76      0.79       173
       resources       0.88      0.85      0.86       379
        sendgrid       0.98      0.96      0.97       137
upgrade_services       0.88      0.88      0.88       355
          upsize       0.97      0.94      0.95       495

        accuracy                           0.92      3785
       macro avg       0.91      0.90      0.90      3785
    weighted avg       0.92      0.92      0.92      3785



### OVERSAMPLING

In [168]:
# Create OS sets
overs = RandomOverSampler(random_state = 0)
vector_xtrain_os, ytrain_os = overs.fit_resample(vector_xtrain, ytrain)

In [169]:
%%time
# Fit the model
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 18.8 s
Wall time: 21 s


In [170]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 93.8 ms
Wall time: 117 ms


In [171]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('100_os.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.88      0.87      0.88       525
          deploy       0.95      0.97      0.96       901
   elasticsearch       0.85      0.85      0.85       268
          fastly       0.97      0.99      0.98       552
           redis       0.82      0.82      0.82       173
       resources       0.85      0.84      0.85       379
        sendgrid       0.98      0.98      0.98       137
upgrade_services       0.87      0.88      0.88       355
          upsize       0.98      0.93      0.96       495

        accuracy                           0.92      3785
       macro avg       0.90      0.90      0.90      3785
    weighted avg       0.92      0.92      0.92      3785



### UNDERSAMPLING

In [172]:
# Create OS sets
unders = RandomUnderSampler(random_state = 0)
vector_xtrain_us, ytrain_us = unders.fit_resample(vector_xtrain, ytrain)

In [173]:
%%time
# Fit the model
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 3.06 s
Wall time: 3.33 s


In [174]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 15.6 ms
Wall time: 93.3 ms


In [175]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('100_us.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.91      0.79      0.85       525
          deploy       0.96      0.93      0.94       901
   elasticsearch       0.79      0.87      0.83       268
          fastly       0.96      0.99      0.97       552
           redis       0.63      0.88      0.74       173
       resources       0.81      0.88      0.84       379
        sendgrid       0.96      0.97      0.96       137
upgrade_services       0.87      0.86      0.87       355
          upsize       0.98      0.93      0.95       495

        accuracy                           0.90      3785
       macro avg       0.88      0.90      0.88      3785
    weighted avg       0.91      0.90      0.90      3785



## 200 features

In [176]:
# Create list of top x words
words = list(word_importance['word'].head(201))

# Create set of xtrain words
xtrain_set = set(words) & set(xtrain_cols)

# Convert set to list
xtrain_list = list(xtrain_set)

# Create set of xtest columns
xtest_set = set(words) & set(xtest_cols)

# Convert set to list
xtest_list = list(xtest_set)

# Filter xtrain and xtest to relevant columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [177]:
print(f'vector_xtr columns: {len(vector_xtr.columns)}')
print(f'vector_xtrain columns: {len(vector_xtrain.columns)}')
print()
print(f'vector_xte columns: {len(vector_xte.columns)}')
print(f'vector_xtest columns: {len(vector_xtest.columns)}')

vector_xtr columns: 7498
vector_xtrain columns: 200

vector_xte columns: 7498
vector_xtest columns: 200


### NO SAMPLING ADJUSTMENT

In [178]:
%%time
# Fit the model
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 13.2 s
Wall time: 15.8 s


In [179]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 78.1 ms
Wall time: 119 ms


In [180]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('200_ns.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.89      0.90      0.89       525
          deploy       0.95      0.98      0.97       901
   elasticsearch       0.86      0.84      0.85       268
          fastly       0.97      0.99      0.98       552
           redis       0.88      0.79      0.83       173
       resources       0.89      0.86      0.87       379
        sendgrid       0.98      0.95      0.96       137
upgrade_services       0.87      0.89      0.88       355
          upsize       0.97      0.94      0.95       495

        accuracy                           0.92      3785
       macro avg       0.92      0.90      0.91      3785
    weighted avg       0.92      0.92      0.92      3785



### OVERSAMPLING

In [181]:
# Create OS sets
overs = RandomOverSampler(random_state = 0)
vector_xtrain_os, ytrain_os = overs.fit_resample(vector_xtrain, ytrain)

In [182]:
%%time
# Fit the model
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 24.5 s
Wall time: 27 s


In [183]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 46.9 ms
Wall time: 122 ms


In [184]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('200_os.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.87      0.88      0.87       525
          deploy       0.95      0.97      0.96       901
   elasticsearch       0.85      0.84      0.84       268
          fastly       0.97      0.99      0.98       552
           redis       0.85      0.83      0.84       173
       resources       0.86      0.85      0.86       379
        sendgrid       0.97      0.97      0.97       137
upgrade_services       0.88      0.88      0.88       355
          upsize       0.97      0.93      0.95       495

        accuracy                           0.92      3785
       macro avg       0.91      0.90      0.91      3785
    weighted avg       0.92      0.92      0.92      3785



### UNDERSAMPLING

In [185]:
# Create OS sets
unders = RandomUnderSampler(random_state = 0)
vector_xtrain_us, ytrain_us = unders.fit_resample(vector_xtrain, ytrain)

In [186]:
%%time
# Fit the model
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 3.55 s
Wall time: 4.19 s


In [187]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 62.5 ms
Wall time: 105 ms


In [188]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('200_us.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.90      0.79      0.84       525
          deploy       0.96      0.93      0.95       901
   elasticsearch       0.78      0.88      0.83       268
          fastly       0.97      0.98      0.97       552
           redis       0.71      0.91      0.80       173
       resources       0.81      0.87      0.84       379
        sendgrid       0.94      0.98      0.96       137
upgrade_services       0.87      0.86      0.87       355
          upsize       0.98      0.92      0.95       495

        accuracy                           0.90      3785
       macro avg       0.88      0.90      0.89      3785
    weighted avg       0.91      0.90      0.90      3785



## 300 features

In [189]:
# Create list of top x words
words = list(word_importance['word'].head(301))

# Create set of xtrain words
xtrain_set = set(words) & set(xtrain_cols)

# Convert set to list
xtrain_list = list(xtrain_set)

# Create set of xtest columns
xtest_set = set(words) & set(xtest_cols)

# Convert set to list
xtest_list = list(xtest_set)

# Filter xtrain and xtest to relevant columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [190]:
print(f'vector_xtr columns: {len(vector_xtr.columns)}')
print(f'vector_xtrain columns: {len(vector_xtrain.columns)}')
print()
print(f'vector_xte columns: {len(vector_xte.columns)}')
print(f'vector_xtest columns: {len(vector_xtest.columns)}')

vector_xtr columns: 7498
vector_xtrain columns: 300

vector_xte columns: 7498
vector_xtest columns: 300


### NO SAMPLING ADJUSTMENT

In [191]:
%%time
# Fit the model
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 14.5 s
Wall time: 17.6 s


In [192]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 93.8 ms
Wall time: 128 ms


In [193]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('300_ns.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.88      0.87      0.88       525
          deploy       0.94      0.98      0.96       901
   elasticsearch       0.86      0.82      0.84       268
          fastly       0.97      0.99      0.98       552
           redis       0.88      0.77      0.82       173
       resources       0.86      0.85      0.86       379
        sendgrid       0.98      0.96      0.97       137
upgrade_services       0.87      0.89      0.88       355
          upsize       0.97      0.94      0.96       495

        accuracy                           0.92      3785
       macro avg       0.91      0.90      0.90      3785
    weighted avg       0.92      0.92      0.92      3785



### OVERSAMPLING

In [194]:
# Create OS sets
overs = RandomOverSampler(random_state = 0)
vector_xtrain_os, ytrain_os = overs.fit_resample(vector_xtrain, ytrain)

In [195]:
%%time
# Fit the model
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 25.7 s
Wall time: 30.1 s


In [196]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 78.1 ms
Wall time: 137 ms


In [197]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('300_os.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.88      0.88      0.88       525
          deploy       0.95      0.97      0.96       901
   elasticsearch       0.86      0.85      0.85       268
          fastly       0.97      0.99      0.98       552
           redis       0.86      0.84      0.85       173
       resources       0.87      0.86      0.87       379
        sendgrid       0.96      0.96      0.96       137
upgrade_services       0.88      0.90      0.89       355
          upsize       0.97      0.93      0.95       495

        accuracy                           0.92      3785
       macro avg       0.91      0.91      0.91      3785
    weighted avg       0.92      0.92      0.92      3785



### UNDERSAMPLING

In [198]:
# Create OS sets
unders = RandomUnderSampler(random_state = 0)
vector_xtrain_us, ytrain_us = unders.fit_resample(vector_xtrain, ytrain)

In [199]:
%%time
# Fit the model
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 3.75 s
Wall time: 4.53 s


In [200]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 31.2 ms
Wall time: 112 ms


In [201]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('300_us.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.90      0.79      0.84       525
          deploy       0.96      0.93      0.95       901
   elasticsearch       0.79      0.85      0.82       268
          fastly       0.96      0.98      0.97       552
           redis       0.69      0.91      0.79       173
       resources       0.80      0.88      0.84       379
        sendgrid       0.93      0.97      0.95       137
upgrade_services       0.86      0.85      0.86       355
          upsize       0.97      0.92      0.94       495

        accuracy                           0.90      3785
       macro avg       0.88      0.90      0.88      3785
    weighted avg       0.90      0.90      0.90      3785



## 400 features

In [202]:
# Create list of top x words
words = list(word_importance['word'].head(401))

# Create set of xtrain words
xtrain_set = set(words) & set(xtrain_cols)

# Convert set to list
xtrain_list = list(xtrain_set)

# Create set of xtest columns
xtest_set = set(words) & set(xtest_cols)

# Convert set to list
xtest_list = list(xtest_set)

# Filter xtrain and xtest to relevant columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [203]:
print(f'vector_xtr columns: {len(vector_xtr.columns)}')
print(f'vector_xtrain columns: {len(vector_xtrain.columns)}')
print()
print(f'vector_xte columns: {len(vector_xte.columns)}')
print(f'vector_xtest columns: {len(vector_xtest.columns)}')

vector_xtr columns: 7498
vector_xtrain columns: 400

vector_xte columns: 7498
vector_xtest columns: 400


### NO SAMPLING ADJUSTMENT

In [204]:
%%time
# Fit the model
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 16 s
Wall time: 18.3 s


In [205]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 31.2 ms
Wall time: 145 ms


In [206]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('400_ns.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.86      0.89      0.88       525
          deploy       0.94      0.98      0.96       901
   elasticsearch       0.87      0.82      0.84       268
          fastly       0.97      0.99      0.98       552
           redis       0.92      0.77      0.84       173
       resources       0.86      0.86      0.86       379
        sendgrid       0.98      0.96      0.97       137
upgrade_services       0.87      0.88      0.87       355
          upsize       0.97      0.94      0.95       495

        accuracy                           0.92      3785
       macro avg       0.92      0.90      0.91      3785
    weighted avg       0.92      0.92      0.92      3785



### OVERSAMPLING

In [207]:
# Create OS sets
overs = RandomOverSampler(random_state = 0)
vector_xtrain_os, ytrain_os = overs.fit_resample(vector_xtrain, ytrain)

In [208]:
%%time
# Fit the model
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 29 s
Wall time: 31.5 s


In [209]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 125 ms
Wall time: 178 ms


In [210]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('400_os.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.88      0.89      0.88       525
          deploy       0.94      0.97      0.96       901
   elasticsearch       0.87      0.84      0.86       268
          fastly       0.97      0.99      0.98       552
           redis       0.90      0.81      0.85       173
       resources       0.87      0.86      0.87       379
        sendgrid       0.97      0.97      0.97       137
upgrade_services       0.88      0.89      0.89       355
          upsize       0.97      0.94      0.96       495

        accuracy                           0.92      3785
       macro avg       0.92      0.91      0.91      3785
    weighted avg       0.92      0.92      0.92      3785



### UNDERSAMPLING

In [211]:
# Create OS sets
unders = RandomUnderSampler(random_state = 0)
vector_xtrain_us, ytrain_us = unders.fit_resample(vector_xtrain, ytrain)

In [212]:
%%time
# Fit the model
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 4.3 s
Wall time: 4.85 s


In [213]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 46.9 ms
Wall time: 113 ms


In [214]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('400_us.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.89      0.77      0.83       525
          deploy       0.96      0.93      0.95       901
   elasticsearch       0.80      0.85      0.83       268
          fastly       0.96      0.98      0.97       552
           redis       0.69      0.90      0.78       173
       resources       0.80      0.88      0.84       379
        sendgrid       0.92      0.97      0.94       137
upgrade_services       0.86      0.86      0.86       355
          upsize       0.98      0.92      0.95       495

        accuracy                           0.90      3785
       macro avg       0.87      0.90      0.88      3785
    weighted avg       0.90      0.90      0.90      3785



## 500 features

In [215]:
# Create list of top x words
words = list(word_importance['word'].head(501))

# Create set of xtrain words
xtrain_set = set(words) & set(xtrain_cols)

# Convert set to list
xtrain_list = list(xtrain_set)

# Create set of xtest columns
xtest_set = set(words) & set(xtest_cols)

# Convert set to list
xtest_list = list(xtest_set)

# Filter xtrain and xtest to relevant columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [216]:
print(f'vector_xtr columns: {len(vector_xtr.columns)}')
print(f'vector_xtrain columns: {len(vector_xtrain.columns)}')
print()
print(f'vector_xte columns: {len(vector_xte.columns)}')
print(f'vector_xtest columns: {len(vector_xtest.columns)}')

vector_xtr columns: 7498
vector_xtrain columns: 500

vector_xte columns: 7498
vector_xtest columns: 500


### NO SAMPLING ADJUSTMENT

In [217]:
%%time
# Fit the model
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 16.2 s
Wall time: 18.3 s


In [218]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 62.5 ms
Wall time: 149 ms


In [219]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('500_ns.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.89      0.89      0.89       525
          deploy       0.94      0.98      0.96       901
   elasticsearch       0.87      0.82      0.84       268
          fastly       0.97      0.99      0.98       552
           redis       0.92      0.77      0.84       173
       resources       0.87      0.87      0.87       379
        sendgrid       0.98      0.96      0.97       137
upgrade_services       0.87      0.89      0.88       355
          upsize       0.97      0.94      0.95       495

        accuracy                           0.92      3785
       macro avg       0.92      0.90      0.91      3785
    weighted avg       0.92      0.92      0.92      3785



### OVERSAMPLING

In [220]:
# Create OS sets
overs = RandomOverSampler(random_state = 0)
vector_xtrain_os, ytrain_os = overs.fit_resample(vector_xtrain, ytrain)

In [221]:
%%time
# Fit the model
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 28.9 s
Wall time: 33.2 s


In [222]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 109 ms
Wall time: 159 ms


In [223]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('500_os.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.88      0.87      0.87       525
          deploy       0.95      0.97      0.96       901
   elasticsearch       0.85      0.83      0.84       268
          fastly       0.97      0.98      0.98       552
           redis       0.88      0.82      0.85       173
       resources       0.85      0.87      0.86       379
        sendgrid       0.96      0.96      0.96       137
upgrade_services       0.86      0.88      0.87       355
          upsize       0.97      0.94      0.95       495

        accuracy                           0.92      3785
       macro avg       0.91      0.90      0.90      3785
    weighted avg       0.92      0.92      0.92      3785



### UNDERSAMPLING

In [224]:
# Create OS sets
unders = RandomUnderSampler(random_state = 0)
vector_xtrain_us, ytrain_us = unders.fit_resample(vector_xtrain, ytrain)

In [225]:
%%time
# Fit the model
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 4.11 s
Wall time: 4.84 s


In [226]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 62.5 ms
Wall time: 120 ms


In [227]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('500_us.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.91      0.79      0.84       525
          deploy       0.96      0.93      0.94       901
   elasticsearch       0.79      0.86      0.82       268
          fastly       0.96      0.98      0.97       552
           redis       0.72      0.90      0.80       173
       resources       0.79      0.89      0.84       379
        sendgrid       0.93      0.97      0.95       137
upgrade_services       0.86      0.85      0.85       355
          upsize       0.98      0.92      0.95       495

        accuracy                           0.90      3785
       macro avg       0.88      0.90      0.88      3785
    weighted avg       0.90      0.90      0.90      3785



## 600 features

In [228]:
# Create list of top x words
words = list(word_importance['word'].head(602))

# Create set of xtrain words
xtrain_set = set(words) & set(xtrain_cols)

# Convert set to list
xtrain_list = list(xtrain_set)

# Create set of xtest columns
xtest_set = set(words) & set(xtest_cols)

# Convert set to list
xtest_list = list(xtest_set)

# Filter xtrain and xtest to relevant columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [229]:
print(f'vector_xtr columns: {len(vector_xtr.columns)}')
print(f'vector_xtrain columns: {len(vector_xtrain.columns)}')
print()
print(f'vector_xte columns: {len(vector_xte.columns)}')
print(f'vector_xtest columns: {len(vector_xtest.columns)}')

vector_xtr columns: 7498
vector_xtrain columns: 600

vector_xte columns: 7498
vector_xtest columns: 600


### NO SAMPLING ADJUSTMENT

In [230]:
%%time
# Fit the model
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 16.6 s
Wall time: 18.6 s


In [231]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 93.8 ms
Wall time: 143 ms


In [232]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('600_ns.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.86      0.88      0.87       525
          deploy       0.94      0.97      0.96       901
   elasticsearch       0.85      0.84      0.85       268
          fastly       0.97      0.99      0.98       552
           redis       0.92      0.76      0.83       173
       resources       0.86      0.86      0.86       379
        sendgrid       0.98      0.93      0.96       137
upgrade_services       0.87      0.88      0.88       355
          upsize       0.97      0.93      0.95       495

        accuracy                           0.92      3785
       macro avg       0.91      0.89      0.90      3785
    weighted avg       0.92      0.92      0.92      3785



### OVERSAMPLING

In [233]:
# Create OS sets
overs = RandomOverSampler(random_state = 0)
vector_xtrain_os, ytrain_os = overs.fit_resample(vector_xtrain, ytrain)

In [234]:
%%time
# Fit the model
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 29.8 s
Wall time: 34 s


In [235]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 109 ms
Wall time: 220 ms


In [236]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('600_os.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.88      0.86      0.87       525
          deploy       0.95      0.96      0.95       901
   elasticsearch       0.83      0.83      0.83       268
          fastly       0.96      0.98      0.97       552
           redis       0.89      0.81      0.85       173
       resources       0.85      0.87      0.86       379
        sendgrid       0.96      0.96      0.96       137
upgrade_services       0.86      0.88      0.87       355
          upsize       0.97      0.94      0.95       495

        accuracy                           0.91      3785
       macro avg       0.90      0.90      0.90      3785
    weighted avg       0.91      0.91      0.91      3785



### UNDERSAMPLING

In [237]:
# Create OS sets
unders = RandomUnderSampler(random_state = 0)
vector_xtrain_us, ytrain_us = unders.fit_resample(vector_xtrain, ytrain)

In [238]:
%%time
# Fit the model
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 4.41 s
Wall time: 4.95 s


In [239]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 93.8 ms
Wall time: 132 ms


In [240]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('600_us.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.90      0.78      0.84       525
          deploy       0.96      0.92      0.94       901
   elasticsearch       0.77      0.86      0.81       268
          fastly       0.95      0.98      0.97       552
           redis       0.72      0.90      0.80       173
       resources       0.79      0.88      0.83       379
        sendgrid       0.91      0.98      0.94       137
upgrade_services       0.87      0.85      0.86       355
          upsize       0.98      0.92      0.95       495

        accuracy                           0.90      3785
       macro avg       0.87      0.90      0.88      3785
    weighted avg       0.90      0.90      0.90      3785



## 700 features

In [241]:
# Create list of top x words
words = list(word_importance['word'].head(703))

# Create set of xtrain words
xtrain_set = set(words) & set(xtrain_cols)

# Convert set to list
xtrain_list = list(xtrain_set)

# Create set of xtest columns
xtest_set = set(words) & set(xtest_cols)

# Convert set to list
xtest_list = list(xtest_set)

# Filter xtrain and xtest to relevant columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [242]:
print(f'vector_xtr columns: {len(vector_xtr.columns)}')
print(f'vector_xtrain columns: {len(vector_xtrain.columns)}')
print()
print(f'vector_xte columns: {len(vector_xte.columns)}')
print(f'vector_xtest columns: {len(vector_xtest.columns)}')

vector_xtr columns: 7498
vector_xtrain columns: 700

vector_xte columns: 7498
vector_xtest columns: 700


### NO SAMPLING ADJUSTMENT

In [243]:
%%time
# Fit the model
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 17.1 s
Wall time: 19.1 s


In [244]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 62.5 ms
Wall time: 151 ms


In [245]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('700_ns.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.87      0.88      0.87       525
          deploy       0.93      0.98      0.95       901
   elasticsearch       0.85      0.82      0.83       268
          fastly       0.97      0.99      0.98       552
           redis       0.91      0.77      0.83       173
       resources       0.86      0.86      0.86       379
        sendgrid       0.98      0.94      0.96       137
upgrade_services       0.88      0.88      0.88       355
          upsize       0.97      0.93      0.95       495

        accuracy                           0.92      3785
       macro avg       0.91      0.89      0.90      3785
    weighted avg       0.92      0.92      0.91      3785



### OVERSAMPLING

In [246]:
# Create OS sets
overs = RandomOverSampler(random_state = 0)
vector_xtrain_os, ytrain_os = overs.fit_resample(vector_xtrain, ytrain)

In [247]:
%%time
# Fit the model
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 29.9 s
Wall time: 34.7 s


In [248]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 93.8 ms
Wall time: 181 ms


In [249]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('700_os.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.89      0.86      0.87       525
          deploy       0.95      0.97      0.96       901
   elasticsearch       0.85      0.82      0.83       268
          fastly       0.97      0.99      0.98       552
           redis       0.88      0.83      0.85       173
       resources       0.84      0.87      0.86       379
        sendgrid       0.98      0.96      0.97       137
upgrade_services       0.86      0.89      0.88       355
          upsize       0.97      0.93      0.95       495

        accuracy                           0.92      3785
       macro avg       0.91      0.90      0.91      3785
    weighted avg       0.92      0.92      0.92      3785



### UNDERSAMPLING

In [250]:
# Create OS sets
unders = RandomUnderSampler(random_state = 0)
vector_xtrain_us, ytrain_us = unders.fit_resample(vector_xtrain, ytrain)

In [251]:
%%time
# Fit the model
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 4.56 s
Wall time: 4.93 s


In [252]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 31.2 ms
Wall time: 124 ms


In [253]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('700_us.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.91      0.79      0.84       525
          deploy       0.96      0.92      0.94       901
   elasticsearch       0.77      0.84      0.81       268
          fastly       0.95      0.97      0.96       552
           redis       0.71      0.88      0.79       173
       resources       0.79      0.89      0.84       379
        sendgrid       0.93      0.98      0.95       137
upgrade_services       0.85      0.86      0.85       355
          upsize       0.97      0.92      0.94       495

        accuracy                           0.89      3785
       macro avg       0.87      0.89      0.88      3785
    weighted avg       0.90      0.89      0.90      3785



## 800 features

In [254]:
# Create list of top x words
words = list(word_importance['word'].head(803))

# Create set of xtrain words
xtrain_set = set(words) & set(xtrain_cols)

# Convert set to list
xtrain_list = list(xtrain_set)

# Create set of xtest columns
xtest_set = set(words) & set(xtest_cols)

# Convert set to list
xtest_list = list(xtest_set)

# Filter xtrain and xtest to relevant columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [255]:
print(f'vector_xtr columns: {len(vector_xtr.columns)}')
print(f'vector_xtrain columns: {len(vector_xtrain.columns)}')
print()
print(f'vector_xte columns: {len(vector_xte.columns)}')
print(f'vector_xtest columns: {len(vector_xtest.columns)}')

vector_xtr columns: 7498
vector_xtrain columns: 800

vector_xte columns: 7498
vector_xtest columns: 800


### NO SAMPLING ADJUSTMENT

In [256]:
%%time
# Fit the model
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 17.2 s
Wall time: 18.9 s


In [257]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 62.5 ms
Wall time: 152 ms


In [258]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('800_ns.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.87      0.87      0.87       525
          deploy       0.93      0.98      0.95       901
   elasticsearch       0.85      0.81      0.83       268
          fastly       0.96      0.99      0.98       552
           redis       0.92      0.72      0.81       173
       resources       0.86      0.85      0.85       379
        sendgrid       0.99      0.94      0.97       137
upgrade_services       0.85      0.88      0.87       355
          upsize       0.96      0.94      0.95       495

        accuracy                           0.91      3785
       macro avg       0.91      0.89      0.90      3785
    weighted avg       0.91      0.91      0.91      3785



### OVERSAMPLING

In [259]:
# Create OS sets
overs = RandomOverSampler(random_state = 0)
vector_xtrain_os, ytrain_os = overs.fit_resample(vector_xtrain, ytrain)

In [260]:
%%time
# Fit the model
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 30.2 s
Wall time: 33.9 s


In [261]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 93.8 ms
Wall time: 196 ms


In [262]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('800_os.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.88      0.87      0.88       525
          deploy       0.94      0.96      0.95       901
   elasticsearch       0.81      0.82      0.82       268
          fastly       0.97      0.98      0.97       552
           redis       0.89      0.84      0.86       173
       resources       0.85      0.87      0.86       379
        sendgrid       0.95      0.95      0.95       137
upgrade_services       0.85      0.88      0.87       355
          upsize       0.98      0.93      0.95       495

        accuracy                           0.91      3785
       macro avg       0.90      0.90      0.90      3785
    weighted avg       0.91      0.91      0.91      3785



### UNDERSAMPLING

In [263]:
# Create OS sets
unders = RandomUnderSampler(random_state = 0)
vector_xtrain_us, ytrain_us = unders.fit_resample(vector_xtrain, ytrain)

In [264]:
%%time
# Fit the model
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 4.34 s
Wall time: 5.03 s


In [265]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 93.8 ms
Wall time: 130 ms


In [266]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('800_us.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.90      0.79      0.84       525
          deploy       0.96      0.92      0.94       901
   elasticsearch       0.77      0.85      0.81       268
          fastly       0.96      0.97      0.97       552
           redis       0.71      0.91      0.80       173
       resources       0.79      0.87      0.83       379
        sendgrid       0.93      0.97      0.95       137
upgrade_services       0.85      0.86      0.85       355
          upsize       0.98      0.92      0.95       495

        accuracy                           0.90      3785
       macro avg       0.87      0.90      0.88      3785
    weighted avg       0.90      0.90      0.90      3785



## 900 features

In [267]:
# Create list of top x words
words = list(word_importance['word'].head(903))

# Create set of xtrain words
xtrain_set = set(words) & set(xtrain_cols)

# Convert set to list
xtrain_list = list(xtrain_set)

# Create set of xtest columns
xtest_set = set(words) & set(xtest_cols)

# Convert set to list
xtest_list = list(xtest_set)

# Filter xtrain and xtest to relevant columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [268]:
print(f'vector_xtr columns: {len(vector_xtr.columns)}')
print(f'vector_xtrain columns: {len(vector_xtrain.columns)}')
print()
print(f'vector_xte columns: {len(vector_xte.columns)}')
print(f'vector_xtest columns: {len(vector_xtest.columns)}')

vector_xtr columns: 7498
vector_xtrain columns: 900

vector_xte columns: 7498
vector_xtest columns: 900


### NO SAMPLING ADJUSTMENT

In [269]:
%%time
# Fit the model
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 15 s
Wall time: 24.2 s


In [270]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 31.2 ms
Wall time: 190 ms


In [271]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('900_ns.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.87      0.89      0.88       525
          deploy       0.93      0.97      0.95       901
   elasticsearch       0.84      0.79      0.81       268
          fastly       0.97      0.99      0.98       552
           redis       0.91      0.71      0.79       173
       resources       0.87      0.86      0.86       379
        sendgrid       0.99      0.93      0.96       137
upgrade_services       0.85      0.87      0.86       355
          upsize       0.96      0.93      0.95       495

        accuracy                           0.91      3785
       macro avg       0.91      0.88      0.89      3785
    weighted avg       0.91      0.91      0.91      3785



### OVERSAMPLING

In [272]:
# Create OS sets
overs = RandomOverSampler(random_state = 0)
vector_xtrain_os, ytrain_os = overs.fit_resample(vector_xtrain, ytrain)

In [273]:
%%time
# Fit the model
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 26.2 s
Wall time: 38.9 s


In [274]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 78.1 ms
Wall time: 246 ms


In [275]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('900_os.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.88      0.86      0.87       525
          deploy       0.95      0.96      0.95       901
   elasticsearch       0.81      0.81      0.81       268
          fastly       0.96      0.98      0.97       552
           redis       0.89      0.83      0.86       173
       resources       0.83      0.88      0.85       379
        sendgrid       0.97      0.96      0.97       137
upgrade_services       0.85      0.87      0.86       355
          upsize       0.98      0.94      0.96       495

        accuracy                           0.91      3785
       macro avg       0.90      0.90      0.90      3785
    weighted avg       0.91      0.91      0.91      3785



### UNDERSAMPLING

In [276]:
# Create OS sets
unders = RandomUnderSampler(random_state = 0)
vector_xtrain_us, ytrain_us = unders.fit_resample(vector_xtrain, ytrain)

In [277]:
%%time
# Fit the model
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 3.56 s
Wall time: 5.64 s


In [278]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 93.8 ms
Wall time: 134 ms


In [279]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('900_us.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.91      0.77      0.83       525
          deploy       0.96      0.91      0.94       901
   elasticsearch       0.75      0.84      0.79       268
          fastly       0.95      0.98      0.96       552
           redis       0.70      0.91      0.79       173
       resources       0.79      0.89      0.83       379
        sendgrid       0.93      0.97      0.95       137
upgrade_services       0.83      0.84      0.84       355
          upsize       0.98      0.92      0.95       495

        accuracy                           0.89      3785
       macro avg       0.87      0.89      0.88      3785
    weighted avg       0.90      0.89      0.89      3785



## 1000 features

In [280]:
# Create list of top x words
words = list(word_importance['word'].head(1003))

# Create set of xtrain words
xtrain_set = set(words) & set(xtrain_cols)

# Convert set to list
xtrain_list = list(xtrain_set)

# Create set of xtest columns
xtest_set = set(words) & set(xtest_cols)

# Convert set to list
xtest_list = list(xtest_set)

# Filter xtrain and xtest to relevant columns
vector_xtrain = vector_xtr[xtrain_list]
vector_xtest = vector_xte[xtest_list]

In [281]:
print(f'vector_xtr columns: {len(vector_xtr.columns)}')
print(f'vector_xtrain columns: {len(vector_xtrain.columns)}')
print()
print(f'vector_xte columns: {len(vector_xte.columns)}')
print(f'vector_xtest columns: {len(vector_xtest.columns)}')

vector_xtr columns: 7498
vector_xtrain columns: 1000

vector_xte columns: 7498
vector_xtest columns: 1000


### NO SAMPLING ADJUSTMENT

In [282]:
%%time
# Fit the model
mdl.fit(vector_xtrain, ytrain)

CPU times: total: 14.5 s
Wall time: 19.6 s


In [283]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 46.9 ms
Wall time: 169 ms


In [284]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('1000_ns.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.88      0.85      0.87       525
          deploy       0.93      0.98      0.95       901
   elasticsearch       0.83      0.81      0.82       268
          fastly       0.96      0.99      0.98       552
           redis       0.91      0.78      0.84       173
       resources       0.85      0.87      0.86       379
        sendgrid       0.99      0.93      0.96       137
upgrade_services       0.86      0.87      0.86       355
          upsize       0.97      0.93      0.95       495

        accuracy                           0.91      3785
       macro avg       0.91      0.89      0.90      3785
    weighted avg       0.91      0.91      0.91      3785



### OVERSAMPLING

In [285]:
# Create OS sets
overs = RandomOverSampler(random_state = 0)
vector_xtrain_os, ytrain_os = overs.fit_resample(vector_xtrain, ytrain)

In [286]:
%%time
# Fit the model
mdl.fit(vector_xtrain_os, ytrain_os)

CPU times: total: 29 s
Wall time: 35.5 s


In [287]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 46.9 ms
Wall time: 206 ms


In [288]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('1000_os.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.89      0.87      0.88       525
          deploy       0.94      0.97      0.95       901
   elasticsearch       0.83      0.85      0.84       268
          fastly       0.97      0.98      0.97       552
           redis       0.87      0.83      0.85       173
       resources       0.85      0.87      0.86       379
        sendgrid       0.98      0.95      0.96       137
upgrade_services       0.87      0.88      0.87       355
          upsize       0.98      0.94      0.96       495

        accuracy                           0.92      3785
       macro avg       0.91      0.90      0.90      3785
    weighted avg       0.92      0.92      0.92      3785



### UNDERSAMPLING

In [289]:
# Create OS sets
unders = RandomUnderSampler(random_state = 0)
vector_xtrain_us, ytrain_us = unders.fit_resample(vector_xtrain, ytrain)

In [290]:
%%time
# Fit the model
mdl.fit(vector_xtrain_us, ytrain_us)

CPU times: total: 4.38 s
Wall time: 5.17 s


In [291]:
%%time
# Make predictions
preds = mdl.predict(vector_xtest)

CPU times: total: 78.1 ms
Wall time: 133 ms


In [292]:
# Store and export predictions
preds2 = pd.DataFrame(preds)
preds2.to_csv('1000_us.csv', index=False)

# Generate classification report
print(classification_report(ytest, preds))

                  precision    recall  f1-score   support

        database       0.89      0.78      0.83       525
          deploy       0.96      0.91      0.93       901
   elasticsearch       0.78      0.87      0.82       268
          fastly       0.95      0.98      0.97       552
           redis       0.71      0.90      0.79       173
       resources       0.78      0.88      0.83       379
        sendgrid       0.93      0.98      0.95       137
upgrade_services       0.86      0.87      0.87       355
          upsize       0.98      0.91      0.94       495

        accuracy                           0.89      3785
       macro avg       0.87      0.90      0.88      3785
    weighted avg       0.90      0.89      0.89      3785

