In [35]:
import string
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack
from sklearn import metrics

In [36]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [37]:
print("train = ",len(train))
print("test = ",len(test))
train = train.dropna().reset_index(drop=True)
test = test.dropna().reset_index(drop=True)

train =  1618
test =  100


In [38]:
train_target = train['label']
test_target = test['label']
train_text = train['text']
test_text = test['text']

In [39]:
all_text = pd.concat([train_text, test_text])

In [40]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

In [41]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

In [42]:
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])
classifier = LogisticRegression(C=0.1, solver='sag')

In [43]:
classifier.fit(train_features, train_target)
val = classifier.predict_proba(test_features)[:, 1]
avg_val = val.mean()
pred_lst = []
for i in val:
    if i >= avg_val:
        pred_lst.append(1)
    else:
        pred_lst.append(0)
accuracy = accuracy_score(test_target, pred_lst)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 96.00%


In [44]:
print('accuracy::', accuracy_score(test_target, pred_lst))
print('precision::', metrics.precision_score(test_target, pred_lst, average='weighted'))
print('recall::', metrics.recall_score(test_target, pred_lst, average='weighted'))
print('f_score::', metrics.f1_score(test_target, pred_lst, average='weighted'))
print('f_score::', metrics.classification_report(test_target, pred_lst))

accuracy:: 0.96
precision:: 0.9636363636363636
recall:: 0.96
f_score:: 0.9602627257799672
f_score::               precision    recall  f1-score   support

           0       0.91      1.00      0.95        40
           1       1.00      0.93      0.97        60

    accuracy                           0.96       100
   macro avg       0.95      0.97      0.96       100
weighted avg       0.96      0.96      0.96       100



In [45]:
import pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(classifier, f)

In [46]:
with open('model.pkl', 'rb') as f:
    clf_loaded = pickle.load(f)

In [47]:
clf_loaded

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)