### NB-LogisticRegression
[Source](https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline)

In [None]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Integer codes for the two known review labels.
encoded_label_dict = {"CG": 0, "OR": 1}


def encode_label(x):
    """Return the integer code for label ``x``, or -1 when ``x`` is not a known label."""
    try:
        return encoded_label_dict[x]
    except KeyError:
        return -1

In [None]:
# Load the dataset; later cells read its "label" and "text_" columns.
df = pd.read_csv("/new.csv")

In [None]:
# Encode each string label as an integer (labels outside the mapping become -1).
df["target"] = df["label"].apply(encode_label)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
train, test = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    stratify=None,
    random_state=2021,
)

In [None]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,category,rating,label,text_,target
8394,Smartdevices,4,OR,Voice recognition unable to make,1
3323,Smartdevices,4,OR,Product is good But battery not lasting for...,1
8960,Smartdevices,5,CG,Very good TV,0
6442,Smartdevices,3,CG,Not a good android tv The software is buggy...,0
3176,Smartdevices,5,OR,Check Ratings,1


In [None]:
# Row counts of the training and test splits.
len(train),len(test)

(7546, 1887)

In [None]:
# Name of the column holding the review text.
COMMENT = 'text_'
# Replace missing review text with a placeholder so the vectorizer only sees strings.
# The filled column is assigned back via ``assign`` rather than calling
# ``fillna(..., inplace=True)`` on ``train[COMMENT]``: that chained in-place call
# operates on a temporary Series, which pandas warns about and which leaves the
# frame unchanged under Copy-on-Write.
train = train.assign(**{COMMENT: train[COMMENT].fillna("unknown")})
test = test.assign(**{COMMENT: test[COMMENT].fillna("unknown")})

## Building the model

We'll start by creating a *bag of words* representation, as a *term document matrix*. We'll use n-grams (unigrams and bigrams here), as suggested in the NBSVM paper.

In [None]:
import re
import string

# Characters that are split off as standalone tokens.  ``re.escape`` is required
# because ``string.punctuation`` contains the sequence ``\]``: left unescaped inside
# a character class it reads as an escaped ``]``, so a literal backslash would
# never be matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` on whitespace, emitting every punctuation character as its own token."""
    # Pad each punctuation character with spaces, then let ``split`` do the rest.
    return re_tok.sub(r' \1 ', s).split()

It turns out that using TF-IDF gives even better priors than the binarized features used in the paper. I don't think this has been mentioned in any paper before, but in the original Kaggle kernel it improved the leaderboard score from 0.59 to 0.55 (lower is better).

In [None]:
# Number of training rows.
n = train.shape[0]
# TF-IDF over unigrams and bigrams, tokenized with the custom ``tokenize``.
# The on/off switches are passed as real booleans: scikit-learn validates
# constructor parameters and no longer accepts the integer ``1`` for them
# in recent releases.
vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize,
                      min_df=3, max_df=0.9, strip_accents='unicode',
                      use_idf=True, smooth_idf=True, sublinear_tf=True)
# Learn the vocabulary and IDF weights from the training text only, then apply
# the same fitted transform to the held-out text.
trn_term_doc = vec.fit_transform(train[COMMENT])
test_term_doc = vec.transform(test[COMMENT])

This creates a *sparse matrix* with only a small number of non-zero elements (*stored elements* in the representation below).

In [None]:
# Display both term-document matrices to inspect their shape and sparsity.
trn_term_doc, test_term_doc

(<7546x14107 sparse matrix of type '<class 'numpy.float64'>'
 	with 267651 stored elements in Compressed Sparse Row format>,
 <1887x14107 sparse matrix of type '<class 'numpy.float64'>'
 	with 64796 stored elements in Compressed Sparse Row format>)

Here's the basic Naive Bayes feature equation:

In [None]:
def pr(y_i, y):
    """Return the add-one smoothed per-feature sum over rows whose label equals ``y_i``.

    Reads the module-level feature matrix ``x``; ``y`` is the label array
    aligned with the rows of ``x``.
    """
    class_mask = y == y_i
    feature_totals = x[class_mask].sum(0)
    # Add one to both numerator and denominator to smooth the estimate.
    return (feature_totals + 1) / (class_mask.sum() + 1)

In [None]:
# Short aliases for the train/test term-document matrices used by the model cells.
x, test_x = trn_term_doc, test_term_doc

Fit a model for one dependent variable at a time:

In [None]:
def get_mdl(y):
    """Fit a logistic regression on Naive-Bayes-scaled features for target ``y``.

    ``y`` must expose ``.values`` (e.g. a pandas Series) aligned with the rows of
    the module-level matrix ``x``.  Returns ``(model, r)``, where ``r`` is the
    log-count ratio that must also be applied to any features passed to the model.
    """
    labels = y.values
    # Log of the smoothed class-1 / class-0 feature ratio.
    log_ratio = np.log(pr(1, labels) / pr(0, labels))
    # Scale every feature column by its ratio before fitting.
    scaled_features = x.multiply(log_ratio)
    classifier = LogisticRegression(C=4)
    classifier.fit(scaled_features, labels)
    return classifier, log_ratio

In [None]:
# Fit on the training targets; r is the per-feature log-count ratio returned by get_mdl.
m,r = get_mdl(train["target"])
# Scale the test features by the same r, then keep column 1 of predict_proba.
# NOTE(review): column 1 is class 1 only if the target holds just 0 and 1; a -1
# (unknown label from encode_label) would shift the column order — confirm.
preds_probas = m.predict_proba(test_x.multiply(r))[:,1]

In [None]:
# Threshold the predicted probabilities at 0.5 to obtain hard 0/1 labels.
preds = [int(prob >= 0.5) for prob in preds_probas]

In [None]:
from sklearn.metrics import confusion_matrix

# Ground-truth labels of the held-out rows alongside the thresholded predictions.
y_true, y_pred = test.target.values, preds
# Rows are true labels, columns are predicted labels.
confusion_matrix(y_true, y_pred)

array([[884, 183],
       [ 72, 748]])

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Precision and recall use scikit-learn's default positive class (label 1).
acc, precision, recall = (
    metric(y_true, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)

In [None]:
# Report the three metrics as percentages.
print(f"Accuracy: {acc*100}; Precision:{precision*100}; Recall:{recall*100}")

Accuracy: 86.48648648648648; Precision:80.343716433942; Recall:91.21951219512195


In [None]:
# Per-class breakdown; target_names follows the label encoding order (0 -> "CG", 1 -> "OR").
print(classification_report(y_true, y_pred, target_names=["CG","OR"]))

              precision    recall  f1-score   support

          CG       0.92      0.83      0.87      1067
          OR       0.80      0.91      0.85       820

    accuracy                           0.86      1887
   macro avg       0.86      0.87      0.86      1887
weighted avg       0.87      0.86      0.87      1887



In [None]:
# One [probability, label] row per test document, in test-set order.
# preds_probas and preds are already positionally aligned with the test rows,
# so they are paired directly instead of iterating over the DataFrame.
preds_df_rows = [
    [pred_prob, pred_label]
    for pred_prob, pred_label in zip(preds_probas, preds)
]
preds_df = pd.DataFrame(
    preds_df_rows,
    columns=["NbLogReg_Model_Probability", "NbLogReg_Model_Prediction"],
)

In [None]:
# Write the predictions without the DataFrame index column.
# ``index`` is documented as a bool, so pass False explicitly instead of None.
preds_df.to_csv("/NbLogReg_predictions2.csv", index=False)