### Logistic Regression

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix

In [2]:
# Load the merged dataset from the notebook's working directory.
# A forward slash is used because it resolves on Windows, macOS and Linux
# alike, whereas '.\merge.csv' only works on Windows and contains the
# invalid escape sequence '\m'.
df = pd.read_csv('./merge.csv')

In [3]:

# Features come from the 'text' column, labels from the 'subreddit' column.
X, y = df['text'], df['subreddit']

In [4]:
# Hold out 25% of the rows for evaluation. The fixed random_state makes the
# split repeatable, and stratify=y asks scikit-learn to keep the label
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=51, stratify=y
)

In [5]:
# Preview the first few training texts as a sanity check on the split.
# NOTE(review): the saved preview shows the same text under different row
# indices (e.g. 'Genius!'). If such duplicates land on both sides of the
# split, the test score will be inflated -- consider de-duplicating first.
X_train.head()

1074                                            Well damn
540                                               Genius!
235       I printed a new blade for an old 1930s Germa...
282       I printed a new blade for an old 1930s Germa...
670                                               Genius!
Name: text, dtype: object

In [6]:
# Bag-of-words vectorizer: filter scikit-learn's built-in English stop-word
# list and cap the vocabulary at 500 terms.
cvec = CountVectorizer(stop_words='english', max_features=500)

In [7]:
# Learn the vocabulary from the training text only, then encode the test text
# against that same vocabulary so both frames share identical columns.
train_counts = cvec.fit_transform(X_train)
test_counts = cvec.transform(X_test)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back only on releases that predate get_feature_names_out().
if hasattr(cvec, 'get_feature_names_out'):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix).
# Reusing the original index keeps every row aligned with y_train / y_test.
X_train_cvec_1 = pd.DataFrame(train_counts.toarray(),
                              columns=feature_names,
                              index=X_train.index)

X_test_cvec_1 = pd.DataFrame(test_counts.toarray(),
                             columns=feature_names,
                             index=X_test.index)

In [8]:
# Create a logistic regression classifier with default settings and train it
# on the vectorised training text (fit() returns the fitted estimator).
logreg = LogisticRegression().fit(X_train_cvec_1, y_train)

# Report the learned intercept and the per-feature coefficients.
print(f'Logistic Regression Intercept: {logreg.intercept_}')
print(f'Logistic Regression Coefficient: {logreg.coef_}')

Logistic Regression Intercept: [0.65775676]
Logistic Regression Coefficient: [[ 3.43563961e-01 -8.79032606e-02  1.93987493e-01 -2.77345656e-02
  -4.18963125e-01  3.43563961e-01  4.63583470e-02  3.73931745e-01
  -8.79032606e-02  4.63583470e-02 -8.79032606e-02 -8.79032606e-02
  -2.77345656e-02 -2.77345656e-02 -3.59892559e-01  4.63583470e-02
  -1.34072412e-04 -5.30256914e-02 -4.83868742e-01 -1.34072412e-04
  -3.09543853e-01 -1.34072412e-04 -3.78646410e-01  2.56596743e-01
  -2.47100833e-02 -3.09543853e-01 -6.12951706e-02  2.16482638e-02
  -6.31931773e-02 -2.68144823e-04 -1.34072412e-04  6.81320284e-02
  -1.34218763e-01 -8.29924581e-01 -1.60731609e-01 -3.82027280e-01
  -4.83868742e-01 -2.65128457e-02 -6.31931773e-02 -2.65128457e-02
  -2.77345656e-02 -2.77345656e-02 -2.77345656e-02 -2.47100833e-02
  -5.12229290e-02 -6.70362059e-04 -1.34072412e-04 -1.88573536e-01
  -4.83868742e-01 -1.34218763e-01 -1.34218763e-01 -2.68144823e-04
  -4.18963125e-01 -2.77345656e-02 -2.65128457e-02 -2.77345656e-02



In [9]:
# Accuracy on the data the model was fitted on.
logreg.score(X_train_cvec_1, y_train)

1.0

In [10]:
# Accuracy on the held-out test data.
# NOTE(review): a saved score of exactly 1.0 on unseen data is unusual --
# check whether identical texts appear in both the train and test partitions.
logreg.score(X_test_cvec_1, y_test)

1.0

### Confusion Matrix

In [None]:
# Predicted labels for the vectorised test texts.
predictions = logreg.predict(X_test_cvec_1)

In [None]:
# Show the confusion matrix comparing the true test labels with the predictions.
confusion_matrix(y_test, predictions)

In [None]:
# Flatten the matrix and unpack its four cells by name.
# Unpacking into exactly four values only works for a two-class problem.
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

In [None]:
# Print each confusion-matrix cell on its own labelled line.
print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")