In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix, accuracy_score, classification_report
from sklearn.pipeline import Pipeline, make_pipeline
import matplotlib.pyplot as plt
import util
import time

ModuleNotFoundError: No module named 'util'

In [None]:
# Dataset location (relative to the working directory) and the file names used
# for the train split, the test split and the prediction output.
folder_path = 'datasets/kaggle_comp/split_files/'
train_file, test_file = 'train.csv', 'test.csv'
save_file = 'test_pred.csv'

# util.load_dataset is a project-local helper; its result is unpacked as a
# (features, labels) pair for each split.
train_X, train_Y = util.load_dataset(folder_path, train_file)
test_X, test_Y = util.load_dataset(folder_path, test_file)

In [None]:
# Baseline text classifier: TF-IDF features feeding a logistic regression.
# The step names 'tfidf' and 'log' are looked up via named_steps and used as
# the 'log__' prefix of the search parameters, so they must not be renamed.
model = Pipeline(steps=[
    # 'english' selects the same built-in list as text.ENGLISH_STOP_WORDS.
    # Passing the string instead of the frozenset keeps the cell working on
    # newer scikit-learn releases, whose parameter validation only accepts
    # 'english', a list, or None for stop_words.
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('log', LogisticRegression()),
])
model.fit(train_X, train_Y)

# Predictions on the held-out test split, reused by the evaluation cells.
pred_Y = model.predict(test_X)

In [None]:
# Confusion matrix of the baseline pipeline on the test split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 (replacement: ConfusionMatrixDisplay.from_estimator).
disp = plot_confusion_matrix(
    estimator=model,
    X=test_X,
    y_true=test_Y,
    cmap=plt.cm.Blues,
)

In [None]:
# Baseline test accuracy as a percentage rounded to two decimals.
baseline_accuracy_pct = round(accuracy_score(test_Y, pred_Y) * 100, 2)
print("Model Accuracy: " + str(baseline_accuracy_pct) + "%")

In [1]:
# Text report of per-class metrics for the baseline predictions.
# Requires the import cell to have been run first in this kernel.
print(classification_report(y_true=test_Y, y_pred=pred_Y))

NameError: name 'classification_report' is not defined

In [None]:
# Print the (up to) 100 vocabulary terms with the largest weights in the first
# row of the fitted logistic-regression coefficient matrix.
vocab = model.named_steps['tfidf'].vocabulary_  # maps term -> feature column index
coef = list(model.named_steps['log'].coef_[0])

# Rank the column indices by weight, largest first. The sort is stable, so ties
# keep the lower index first. Sorting indices leaves `coef` untouched; the
# previous approach popped the maximum on each pass, which shifted every later
# position and made all indices after the first point at the wrong column.
# Slicing also copes with vocabularies smaller than 100 terms.
top_features = sorted(range(len(coef)), key=coef.__getitem__, reverse=True)[:100]

# Invert the vocabulary once instead of scanning it for every printed feature.
index_to_term = {index: term for term, index in vocab.items()}

for f in top_features:
    term = index_to_term[f]
    # Same output shape as before: a one-entry {term: column_index} dict per line.
    print({term: vocab[term]})

In [None]:
# Candidate values for the logistic-regression hyper-parameters.
dual = [False]
penalty = ['l2']
tol = [1e-3, 1e-4, 1e-5]
max_iter = [80, 90, 100]

# Keys follow the '<step>__<parameter>' convention so the search can route each
# value to the pipeline step named 'log'.
param_grid = dict(
    log__dual=dual,
    log__penalty=penalty,
    log__tol=tol,
    log__max_iter=max_iter,
)

In [None]:
# Randomized hyper-parameter search over param_grid for the baseline pipeline.
# NOTE(review): the grid defined in this notebook has 1 * 1 * 3 * 3 = 9
# combinations, fewer than n_iter=10 - confirm the intended search size.
grid = RandomizedSearchCV(model, param_grid, n_jobs=3, n_iter=10)

start_time = time.time()
grid_result = grid.fit(train_X, train_Y)
elapsed_seconds = time.time() - start_time

# time.time() differences are in seconds; the label previously claimed 'ms'.
print("Execution time: " + str(elapsed_seconds) + ' s')
print("Best parameter (CV score=%0.3f):" % grid.best_score_)
print(grid.best_params_)

In [None]:

# Evaluate the best pipeline found by the search on the test split.
best_model = grid.best_estimator_
optimized_pred_Y = best_model.predict(test_X)

# Draw the confusion matrix for that pipeline and write the figure to disk.
disp = plot_confusion_matrix(
    estimator=best_model,
    X=test_X,
    y_true=test_Y,
    cmap=plt.cm.Blues,
)
plt.savefig("log_reg_optimized_confusion.png")


In [None]:
# Test accuracy of the tuned pipeline as a percentage rounded to two decimals.
optimized_accuracy_pct = round(accuracy_score(test_Y, optimized_pred_Y) * 100, 2)
print("Model Accuracy: " + str(optimized_accuracy_pct) + "%")

In [None]:
# Text report of per-class metrics for the tuned pipeline's predictions.
print(classification_report(y_true=test_Y, y_pred=optimized_pred_Y))

# N-gram features (unigrams + bigrams)

In [None]:
# (min_n, max_n) passed as TfidfVectorizer's ngram_range below: unigrams and bigrams.
ngram = (1, 2)

In [None]:
# N-gram variant of the classifier: the same TF-IDF + logistic-regression
# pipeline, with the vectorizer's ngram_range taken from `ngram`.
# This rebinds `model` and `pred_Y`, replacing any earlier values.
model = Pipeline(steps=[
    # 'english' selects the same built-in list as text.ENGLISH_STOP_WORDS.
    # Passing the string instead of the frozenset keeps the cell working on
    # newer scikit-learn releases, whose parameter validation only accepts
    # 'english', a list, or None for stop_words.
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=ngram)),
    ('log', LogisticRegression()),
])
model.fit(train_X, train_Y)

# Predictions of the untuned n-gram pipeline on the test split.
pred_Y = model.predict(test_X)

In [None]:
# Randomized hyper-parameter search over param_grid for the n-gram pipeline.
# This rebinds `grid`, so grid.best_estimator_ now refers to the n-gram search.
# NOTE(review): the grid defined in this notebook has 1 * 1 * 3 * 3 = 9
# combinations, fewer than n_iter=10 - confirm the intended search size.
grid = RandomizedSearchCV(model, param_grid, n_jobs=3, n_iter=10)

start_time = time.time()
grid_result = grid.fit(train_X, train_Y)
elapsed_seconds = time.time() - start_time

# time.time() differences are in seconds; the label previously claimed 'ms'.
print("Execution time: " + str(elapsed_seconds) + ' s')
print("Best parameter (CV score=%0.3f):" % grid.best_score_)
print(grid.best_params_)

In [None]:
# Evaluate the best n-gram pipeline found by the search on the test split.
optimized_pred_Y = grid.best_estimator_.predict(test_X)

disp = plot_confusion_matrix(grid.best_estimator_, test_X, test_Y, cmap=plt.cm.Blues)
# Saved under its own name: this cell previously wrote to
# "log_reg_optimized_confusion.png", the same file the unigram evaluation
# saves, so running the notebook top to bottom silently overwrote that figure.
plt.savefig("log_reg_ngram_optimized_confusion.png")

In [None]:
# Test accuracy of the tuned n-gram pipeline as a percentage rounded to two decimals.
optimized_accuracy_pct = round(accuracy_score(test_Y, optimized_pred_Y) * 100, 2)
print("Model Accuracy: " + str(optimized_accuracy_pct) + "%")

In [None]:
# Text report of per-class metrics for the tuned n-gram pipeline's predictions.
print(classification_report(y_true=test_Y, y_pred=optimized_pred_Y))