In [39]:
import pandas as pd
from sklearn import *
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from statistics import mean
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


# Part 1: Build a classification model using text data


# NOTE(review): bare expression — this default CountVectorizer is never bound to
# a name, so nothing later can use it (as the last expression of a cell, a
# notebook would only echo its repr). The vectorizer that is actually fitted on
# the training text is constructed separately further down. Looks like leftover
# scratch; confirm before removing.
CountVectorizer()


In [58]:
import aimodelshare as ai

# Fetch the "clickbait" quick-start bundle and unpack its seven components.
# X_train / X_test are later passed to a text vectorizer and the *_labels
# values are used as classification targets.
# NOTE(review): example_data, lstm_model and lstm_model2 are not used by any
# cell visible in this notebook — their contents are presumed from the names.
(
    X_train, X_test,
    y_train_labels, y_test_labels,
    example_data, lstm_model, lstm_model2,
) = ai.import_quickstart_data("clickbait")



Data downloaded successfully.

Preparing downloaded files for use...

Success! Your Quick Start materials have been downloaded. 
You are now ready to run the tutorial.


In [59]:
# Bag-of-words features: learn the vocabulary from the training text and
# encode that text in a single pass.
vec = CountVectorizer()
X_train = vec.fit_transform(X_train)

# The test text is encoded with the vocabulary learned from the training text;
# the vectorizer is never refitted on test data.
# NOTE: X_train / X_test are rebound from the raw documents to the vectorizer
# output here, so this cell cannot be re-run without reloading the data first.
X_test = vec.transform(X_test)


In [13]:
# Tune the logistic-regression parameter C with 5-fold cross-validation on the
# training features. No `scoring` is passed, so GridSearchCV falls back to the
# estimator's own `score` method.
param_grid = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100],
}
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=2000),
    param_grid=param_grid,
    cv=5,
)
grid.fit(X_train, y_train_labels)

# Report the mean cross-validation score of the winning candidate and its C.
print(f"Best cross-validation score: {grid.best_score_:.2f}")
print("Best parameters: ", grid.best_params_)


Best cross-validation score: 0.97
Best parameters:  {'C': 10}


In [42]:
# Logistic regression on the count features. C=10 matches the best value
# printed by the grid-search cell; random_state is pinned for repeatability.
log = LogisticRegression(C=10, max_iter=1000, random_state=35)
log.fit(X_train, y_train_labels)

# 10-fold cross-validated ROC AUC on the training features. cross_val_score
# fits clones of `log`, so the model fitted above stays as it is for the
# prediction cell that follows.
# NOTE(review): the vectorizer was fitted on the whole training set before the
# folds are split, so held-out folds are not fully unseen by feature extraction.
logcv = cross_val_score(
    estimator=log,
    X=X_train,
    y=y_train_labels,
    scoring="roc_auc",
    cv=10,
    n_jobs=-1,  # run the folds on all available cores
    error_score="raise",  # surface fit failures instead of recording a score
)
logcv.mean()


0.996085396897193

In [40]:
# Accuracy of the fitted model on the same data it was trained on.
print("Training Accuracy :", log.score(X=X_train, y=y_train_labels))

# Macro-averaged F1 on the held-out test set: per-class F1 scores are averaged
# without weighting by class frequency.
y_pred = log.predict(X_test)
print("Test set F1 score :", f1_score(y_true=y_test_labels, y_pred=y_pred, average="macro"))


Training Accuracy : 0.9998398654870091
Test set F1 score : 0.9735322772135928


TF-IDF


In [43]:
import aimodelshare as ai

# Reload the "clickbait" quick-start bundle. X_train / X_test were rebound to
# vectorized matrices earlier in the notebook, so the raw text has to be
# fetched again before a different vectorizer can be fitted on it.
# NOTE(review): example_data, lstm_model and lstm_model2 are not used by any
# cell visible in this notebook.
(
    X_train, X_test,
    y_train_labels, y_test_labels,
    example_data, lstm_model, lstm_model2,
) = ai.import_quickstart_data("clickbait")



Data downloaded successfully.

Preparing downloaded files for use...

Success! Your Quick Start materials have been downloaded. 
You are now ready to run the tutorial.


In [44]:
# TF-IDF features. min_df=5 drops terms that occur in fewer than five training
# documents; norm=None leaves the rows un-normalised.
tfidf = TfidfVectorizer(min_df=5, norm=None)
X_train = tfidf.fit_transform(X_train)

# Encode the test text with the vocabulary and IDF weights learned from the
# training text only.
# NOTE: X_train / X_test are rebound from raw documents to matrices here, so
# this cell cannot be re-run without reloading the data first.
X_test = tfidf.transform(X_test)


In [46]:
# Logistic regression on the TF-IDF features.
# NOTE(review): C=10 is the value the grid search printed for the plain count
# features; no cell visible here re-tunes it for TF-IDF.
log = LogisticRegression(C=10, max_iter=1000, random_state=35)
log.fit(X_train, y_train_labels)

# 10-fold cross-validated ROC AUC on the training features. cross_val_score
# fits clones of `log`, so the model fitted above is kept for prediction.
# NOTE(review): vocabulary and IDF weights were learned from the whole training
# set before the folds are split, so held-out folds are not fully unseen.
logcv = cross_val_score(
    estimator=log,
    X=X_train,
    y=y_train_labels,
    scoring="roc_auc",
    cv=10,
    n_jobs=-1,  # run the folds on all available cores
    error_score="raise",  # surface fit failures instead of recording a score
)
logcv.mean()


0.9927761120826164

In [48]:
# Accuracy of the TF-IDF model on the same data it was trained on.
print("Training Accuracy :", log.score(X=X_train, y=y_train_labels))

# Macro-averaged F1 on the held-out test set: per-class F1 scores are averaged
# without weighting by class frequency.
y_pred = log.predict(X_test)
print("Test set F1 score :", f1_score(y_true=y_test_labels, y_pred=y_pred, average="macro"))


Training Accuracy : 0.9999599663717522
Test set F1 score : 0.9642478669084689


Bigram

In [54]:
import aimodelshare as ai

# Reload the "clickbait" quick-start bundle. X_train / X_test were rebound to
# vectorized matrices by the preceding section, so the raw text has to be
# fetched again before the bigram vectorizer can be fitted on it.
# NOTE(review): example_data, lstm_model and lstm_model2 are not used by any
# cell visible in this notebook.
(
    X_train, X_test,
    y_train_labels, y_test_labels,
    example_data, lstm_model, lstm_model2,
) = ai.import_quickstart_data("clickbait")



Data downloaded successfully.

Preparing downloaded files for use...

Success! Your Quick Start materials have been downloaded. 
You are now ready to run the tutorial.


In [55]:
# Bigram-only count features: ngram_range=(2, 2) keeps word pairs and excludes
# single words. Vocabulary is learned from, and applied to, the training text
# in a single pass.
vec = CountVectorizer(ngram_range=(2, 2))
X_train = vec.fit_transform(X_train)

# The test text is encoded with the bigram vocabulary learned from the training
# text; the vectorizer is never refitted on test data.
# NOTE: X_train / X_test are rebound from raw documents to matrices here, so
# this cell cannot be re-run without reloading the data first.
X_test = vec.transform(X_test)


In [56]:
# Logistic regression on the bigram count features.
# NOTE(review): C=10 is the value the grid search printed for the unigram count
# features; no cell visible here re-tunes it for bigrams.
log = LogisticRegression(C=10, max_iter=1000, random_state=35)
log.fit(X_train, y_train_labels)

# 10-fold cross-validated ROC AUC on the training features. cross_val_score
# fits clones of `log`, so the model fitted above is kept for prediction.
# NOTE(review): the bigram vocabulary was learned from the whole training set
# before the folds are split, so held-out folds are not fully unseen.
logcv = cross_val_score(
    estimator=log,
    X=X_train,
    y=y_train_labels,
    scoring="roc_auc",
    cv=10,
    n_jobs=-1,  # run the folds on all available cores
    error_score="raise",  # surface fit failures instead of recording a score
)
logcv.mean()


0.9870803681050493

In [57]:
# Accuracy of the bigram model on the same data it was trained on.
print("Training Accuracy :", log.score(X=X_train, y=y_train_labels))

# Macro-averaged F1 on the held-out test set: per-class F1 scores are averaged
# without weighting by class frequency.
y_pred = log.predict(X_test)
print("Test set F1 score :", f1_score(y_true=y_test_labels, y_pred=y_pred, average="macro"))


Training Accuracy : 1.0
Test set F1 score : 0.9265121984228963
