### EDA and Count Vectorization

In [1]:
import numpy as np
import pandas as pd
import datetime
import time
import regex as re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords # Import the stopword list

In [2]:
# Load the dataset from a path relative to the notebook's working directory.
# Later cells read the 'text' and 'unethical' columns from this frame.
df = pd.read_csv('data/reddit_df.csv')

In [3]:
# Preview the first two rows to sanity-check the load.
df.head(2)

Unnamed: 0,text,unethical
0,"if an online video has no controls to skip, p...",0
1,i always judge people's maturity based on the...,0


In [4]:
# Running log of grid-search results: later cells append `grid.best_params_`
# and `grid.best_score_` to this list. Re-running this cell resets the log.
scores = []

In [8]:
# NOTE(review): every line in this cell is commented out, so running it does nothing.
# It reads as scratch notes on alternative tokenization / stop-word removal approaches.
# The names it refers to (`text`, `s`, `example1`, `review_text`) are not defined
# anywhere in the visible notebook, so none of it can be un-commented as-is.

# tokenizer = RegexpTokenizer(r'\w+')
# text_tokens = tokenizer.tokenize(text.lower())
# tokenizer_1 = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
# # Instantiate tokenizer.
# tokenizer_2 = RegexpTokenizer('\s+', gaps=True)

# # Run tokenizer.
# tokenizer_2.tokenize(s)

# # Use regular expressions to do a find-and-replace
# letters_only = re.sub("[^a-zA-Z]",           # The pattern to search for
#                       " ",                   # The pattern to replace it with
#                       example1.get_text())   # The text to search
# # Convert letters_only to lower case.
# lower_case = letters_only.lower()

# # Split lower_case up at each space.
# words = lower_case.split()  #this is tokenization with a different method!
# words = [i for i in words if i not in stopwords.words('english')]

#     # 2. Remove non-letters.
#     letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    
#     # 3. Convert to lower case, split into individual words.
#     words = letters_only.lower().split()
    
#     # 4. In Python, searching a set is much faster than searching
#     # a list, so convert the stopwords to a set.
#     stops = set(stopwords.words('english'))
    
#     # 5. Remove stopwords.
#     meaningful_words = [w for w in words if w not in stops]
    
#     # 6. Join the words back into one string separated by space, 
#     # and return the result.
#     return(" ".join(meaningful_words))

In [156]:
# Predictor is the raw 'text' column; the target is the 'unethical' column.
X, y = df['text'], df['unethical']

# Hold out a test split. The fixed seed keeps the partition reproducible
# across kernel restarts; the split size is left at the library default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [157]:
# Two-stage pipeline: token counts feed a logistic regression classifier.
# The step names ('cvec', 'lr') are the prefixes used by the grid-search
# parameter keys, so they must not be renamed independently of `params`.
cvec_pipe = Pipeline(
    steps=[
        ('cvec', CountVectorizer()),
        ('lr', LogisticRegression()),
    ]
)

In [158]:
# Stand-alone vectorizer with default settings. It is separate from the
# 'cvec' step inside `cvec_pipe` and is not touched by the grid search.
cvec = CountVectorizer()

# Search space, keyed as '<pipeline step>__<parameter>'. Only max_df (2 values)
# and the solver (4 values) actually vary, giving 8 candidate combinations;
# the single-element lists pin the remaining settings.
params = dict(
    cvec__max_features=[None],
    cvec__stop_words=[['un', 'they', 'll', 'and']],
    cvec__ngram_range=[(1, 1)],
    cvec__min_df=[4],
    cvec__max_df=[0.80, 0.70],
    cvec__tokenizer=[None],
    lr__solver=['liblinear', 'sag', 'saga', 'newton-cg'],
)

In [159]:
# Exhaustive search over `params` with 5-fold cross-validation,
# running up to 6 fits in parallel.
grid = GridSearchCV(
    estimator=cvec_pipe,
    param_grid=params,
    cv=5,
    n_jobs=6,
)

In [160]:
%%time
# Fit the grid search on the training split only; the test split stays unseen
# until the explicit `grid.score(X_test, y_test)` call. %%time reports the cost.
grid.fit(X_train, y_train)

Wall time: 48.2 s


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [161]:
# Mean cross-validated score of the best parameter combination.
grid.best_score_

0.8035463288841044

In [162]:
# Score of the best pipeline on the data it was trained on.
# NOTE(review): the recorded value (~0.95) is well above the cross-validated
# and test scores (~0.80), which suggests the model is overfitting.
grid.score(X_train, y_train)

0.9539257169722614

In [163]:
# Score of the best pipeline on the held-out test split.
grid.score(X_test, y_test)

0.8057626435623615

In [164]:
# Log the winning parameter combination; the matching score is appended as a
# separate list element by the next cell, so entries alternate dict, float.
scores.append(grid.best_params_)

In [165]:
# Log the mean cross-validated score that belongs to the parameters appended just before.
scores.append(grid.best_score_)

In [166]:
# NOTE(review): the recorded output lists results from earlier runs with other
# grids (e.g. max_features=5000, ngram_range=(1, 2)) that no cell in this notebook
# defines any more. The list reflects kernel history, so a fresh Restart & Run All
# will show only the single run above.
scores

[{'cvec__max_df': 0.7,
  'cvec__max_features': 5000,
  'cvec__min_df': 4,
  'cvec__ngram_range': (1, 2),
  'cvec__stop_words': None},
 0.802806720789541,
 {'cvec__max_df': 0.7,
  'cvec__max_features': 1000,
  'cvec__min_df': 5,
  'cvec__ngram_range': (1, 1),
  'cvec__stop_words': None,
  'cvec__tokenizer': None},
 0.7715084508430765,
 {'cvec__max_df': 0.7,
  'cvec__max_features': 1000,
  'cvec__min_df': 5,
  'cvec__ngram_range': (1, 1),
  'cvec__stop_words': ['un', 'they'],
  'cvec__tokenizer': None},
 0.7687550603428615,
 {'cvec__max_df': 0.8,
  'cvec__max_features': 5000,
  'cvec__min_df': 3,
  'cvec__ngram_range': (1, 2),
  'cvec__stop_words': ['un', 'they'],
  'cvec__tokenizer': None},
 0.7710390811308458,
 {'cvec__max_df': 0.7,
  'cvec__max_features': 5000,
  'cvec__min_df': 3,
  'cvec__ngram_range': (1, 2),
  'cvec__stop_words': ['un', 'they', 'll', 'and'],
  'cvec__tokenizer': None},
 0.7702331936834739,
 {'cvec__max_df': 0.8,
  'cvec__max_features': 5000,
  'cvec__min_df': 5,
 

In [167]:
# Pull the two fitted steps out of the best pipeline found by the grid search.
best_pipe = grid.best_estimator_

# coef_ is 2-D; row 0 holds one weight per vocabulary term
# (assumes a binary target, so there is only one row -- TODO confirm).
coefficients = best_pipe.named_steps['lr'].coef_[0]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back for older installations.
# list(...) keeps `features` a plain list, as the old method returned.
fitted_cvec = best_pipe.named_steps['cvec']
if hasattr(fitted_cvec, 'get_feature_names_out'):
    features = list(fitted_cvec.get_feature_names_out())
else:
    features = fitted_cvec.get_feature_names()

# Sanity check: there must be exactly one coefficient per vocabulary term.
print(f'There are {len(coefficients)} coefficients.')
print(f'There are {len(features)} features.')


There are 9358 coefficients.
There are 9358 features.


In [168]:
# One row per vocabulary term with its logistic-regression weight.
# exp(coef) is the multiplicative change in the odds of the positive class
# for a one-unit increase in that term's count, which is easier to read than
# the raw log-odds weight. np.exp is applied to the whole array at once
# instead of element by element in a Python loop.
coef_df = pd.DataFrame({
    'features': features,
    'coef': coefficients,
    'exp_coef': np.exp(coefficients),
})

In [169]:
# Index by term and order from the largest to the smallest exp_coef,
# then show the ten terms at the top of that ranking.
coef_df = (
    coef_df
    .set_index('features')
    .sort_values('exp_coef', ascending=False)
)
coef_df.head(10)

Unnamed: 0_level_0,coef,exp_coef
features,Unnamed: 1_level_1,Unnamed: 2_level_1
piss,2.107159,8.224842
fake,1.923127,6.842318
halloween,1.778441,5.920621
rob,1.748464,5.745768
exam,1.741647,5.706737
lie,1.726047,5.618401
fart,1.704332,5.497711
drunk,1.651251,5.213499
murder,1.645804,5.185179
claim,1.610234,5.003981


In [170]:
# Bottom of the ranking: the ten terms with the smallest exp_coef
# (the frame was sorted in descending order by the previous cell).
coef_df.tail(10)

Unnamed: 0_level_0,coef,exp_coef
features,Unnamed: 1_level_1,Unnamed: 2_level_1
tp,-1.460031,0.232229
spelling,-1.492563,0.224796
okay,-1.510168,0.220873
selection,-1.516074,0.219572
germs,-1.523499,0.217948
happier,-1.581056,0.205758
published,-1.628742,0.196176
pets,-1.648059,0.192423
needles,-1.720076,0.179053
unexpectedly,-1.753172,0.173224


In [171]:
# The original cell was an incomplete attribute access (`cvec.`), which is a
# SyntaxError on Run All. The recorded output 'word' matches the default
# `analyzer` setting of the stand-alone CountVectorizer, so inspect that.
cvec.analyzer

'word'