In [1]:
import numpy as np
import pandas as pd
import datetime
import time
import regex as re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [2]:
# Load the Reddit posts; later cells read the `text` and `unethical` columns from this frame.
df = pd.read_csv('./data/reddit_df.csv')

In [3]:
# Peek at the first two rows to confirm the file loaded as expected.
df.head(2)

Unnamed: 0,text,unethical
0,"if an online video has no controls to skip, p...",0
1,i always judge people's maturity based on the...,0


In [4]:
# Single shared VADER analyzer; get_compound_sentiment reads this module-level name.
# NOTE(review): presumably needs the NLTK `vader_lexicon` resource installed beforehand — confirm.
sia = SentimentIntensityAnalyzer()


In [5]:
def get_compound_sentiment(post):
    """Return VADER's compound polarity score for a single post.

    Parameters
    ----------
    post : str
        Raw text of the post.

    Returns
    -------
    float
        The ``'compound'`` entry of ``sia.polarity_scores(post)``. When
        ``post`` is not a string (for example the NaN pandas produces for an
        empty CSV cell) the neutral score ``0.0`` is returned instead.
    """
    # A non-string is not valid analyzer input; treat it as neutral so one
    # missing row does not abort a whole ``Series.apply``.
    if not isinstance(post, str):
        return 0.0
    return sia.polarity_scores(post)['compound']

In [6]:
# Score every post once and keep the result as a numeric model feature.
df['sentiment'] = df['text'].map(get_compound_sentiment)


In [7]:
# Confirm the new `sentiment` column sits alongside `text` and `unethical`.
df.head(2)

Unnamed: 0,text,unethical,sentiment
0,"if an online video has no controls to skip, p...",0,-0.296
1,i always judge people's maturity based on the...,0,0.3182


In [8]:
# Target is the `unethical` label; every remaining column stays available as a predictor.
y = df['unethical']
X = df.drop('unethical', axis=1)

In [9]:
# Hold out a test set with scikit-learn's default split size; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
def _select_text(frame):
    """Return the ``text`` column of ``frame`` for the text branch of the pipeline."""
    return frame['text']


def _select_sentiment(frame):
    """Return ``frame`` restricted to its ``sentiment`` column (double brackets keep it tabular)."""
    return frame[['sentiment']]


# Named functions rather than lambdas: a FunctionTransformer wrapping a lambda
# cannot be pickled, which blocks saving the fitted pipeline.
get_text_data = FunctionTransformer(_select_text, validate=False)
get_numeric_data = FunctionTransformer(_select_sentiment, validate=False)

In [11]:
# Stand-alone TF-IDF vectorizer with default settings.
# NOTE(review): the pipeline cell below builds its own TfidfVectorizer and never reads `tvec`;
# this looks unused in the visible cells — confirm before removing.
tvec = TfidfVectorizer()

In [27]:
# Numeric branch: select the sentiment column, then standardise it.
numeric_branch = Pipeline([
    ('selector', get_numeric_data),
    ('ss', StandardScaler()),
])

# Text branch: select the raw post text, then TF-IDF encode it.
text_branch = Pipeline([
    ('selector', get_text_data),
    ('tvec', TfidfVectorizer()),
])

# Both branches are concatenated column-wise and fed to a logistic regression.
combined_features = FeatureUnion([
    ('numeric_features', numeric_branch),
    ('text_features', text_branch),
])
union_pipe = Pipeline([
    ('features', combined_features),
    ('lr', LogisticRegression(max_iter=500)),
])

# Only the TF-IDF step is tuned. Every list holds a single candidate, so the
# "search" is one parameter combination evaluated with 5-fold cross-validation.
params = {
    'features__text_features__tvec__max_features': [10_000],
    'features__text_features__tvec__stop_words': [['un', 'they', 'll', 'and']],
    'features__text_features__tvec__ngram_range': [(1, 1)],
    'features__text_features__tvec__min_df': [4],
    'features__text_features__tvec__max_df': [.80],
    'features__text_features__tvec__tokenizer': [None],
}

grid = GridSearchCV(
    estimator=union_pipe,
    param_grid=params,
    n_jobs=6,
    cv=5,
)

In [28]:
%%time
# Fit on the training split only so X_test / y_test stay unseen until the final score cell.
grid.fit(X_train, y_train)

Wall time: 7.26 s


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('features',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('numeric_features',
                                                                        Pipeline(memory=None,
                                                                                 steps=[('selector',
                                                                                         FunctionTransformer(accept_sparse=False,
                                                                                                             check_inverse=True,
                                                                                                             func=<function <lambda> at 0x000002A16CB06558>,
                                                                                                  

In [16]:
# Running log that a later cell appends grid.best_params_ and grid.best_score_ to.
# NOTE(review): this cell's execution count (16) predates the search cells (27-32) and the
# logged output holds several earlier runs, so the saved output depends on hidden kernel
# state; a fresh Restart & Run All will produce a single entry pair.
scores = []

In [29]:
# Mean cross-validated score of the best (here: only) parameter combination.
grid.best_score_

0.8120089930569107

In [30]:
# Score of the refit best estimator on the data it was trained on (optimistic; compare with the test score).
grid.score(X_train, y_train)

0.8809859627913225

In [31]:
# Score of the refit best estimator on the held-out split; the gap to the train score indicates overfitting.
grid.score(X_test, y_test)

0.8240983276244207

In [32]:
# Log this run as two consecutive entries: the parameter dict, then its mean CV score.
scores.extend([grid.best_params_, grid.best_score_])
scores

[{'features__text_features__tvec__max_df': 0.7,
  'features__text_features__tvec__max_features': 1000,
  'features__text_features__tvec__min_df': 4,
  'features__text_features__tvec__ngram_range': (1, 1),
  'features__text_features__tvec__stop_words': ['un', 'they', 'll', 'and'],
  'features__text_features__tvec__tokenizer': None},
 0.7800389735227746,
 {'features__text_features__tvec__max_df': 0.8,
  'features__text_features__tvec__max_features': 1000,
  'features__text_features__tvec__min_df': 4,
  'features__text_features__tvec__ngram_range': (1, 1),
  'features__text_features__tvec__stop_words': ['un', 'they', 'll', 'and'],
  'features__text_features__tvec__tokenizer': None},
 0.779367201375759,
 {'features__text_features__tvec__max_df': 0.8,
  'features__text_features__tvec__max_features': 10000,
  'features__text_features__tvec__min_df': 4,
  'features__text_features__tvec__ngram_range': (1, 1),
  'features__text_features__tvec__stop_words': ['un', 'they', 'll', 'and'],
  'featur

In [41]:
# Names for the columns the logistic regression sees, in FeatureUnion order:
# the scaled sentiment column first, then one column per TF-IDF vocabulary term.
fitted_pipe = grid.best_estimator_

# Look the text branch up by its step name instead of its position in transformer_list,
# so reordering the FeatureUnion branches cannot silently pick the wrong transformer.
fitted_branches = dict(fitted_pipe.named_steps['features'].transformer_list)
fitted_tvec = fitted_branches['text_features'].named_steps['tvec']

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present
# and fall back to the old method on older installs.
if hasattr(fitted_tvec, 'get_feature_names_out'):
    vocab_terms = fitted_tvec.get_feature_names_out()
else:
    vocab_terms = fitted_tvec.get_feature_names()

# list(...) because the newer method returns an ndarray, which cannot be list-concatenated.
features = ['sentiment_score'] + list(vocab_terms)
coefficients = fitted_pipe.named_steps['lr'].coef_[0]

# Every coefficient must have exactly one label before they are paired up in a DataFrame.
assert len(features) == len(coefficients), "feature names and coefficients are misaligned"


In [42]:

# One row per model input: its name, its log-odds coefficient, and the
# exponentiated coefficient (multiplicative change in the odds).
coef_df = pd.DataFrame({
    'features': features,
    'coef': coefficients,
    'exp_coef': np.exp(coefficients),  # exponentiated coefficients, computed in one vectorised call
})

coef_df

Unnamed: 0,features,coef,exp_coef
0,sentiment_score,-0.195954,0.822050
1,00,0.181022,1.198442
2,000,-0.136978,0.871989
3,01,-0.131910,0.876420
4,02,0.161400,1.175155
...,...,...,...
9354,zipper,-0.391761,0.675865
9355,zone,-0.319024,0.726858
9356,zones,-0.089793,0.914120
9357,zoo,0.143018,1.153751


In [55]:
# Ten inputs with the largest exponentiated coefficients.
ranked_by_odds = coef_df.sort_values(by=["exp_coef"], ascending=False)
ranked_by_odds.head(10)

Unnamed: 0,features,coef,exp_coef
3441,free,5.856244,349.409134
3133,fake,4.526203,92.406989
9025,want,4.471851,87.518602
6828,request,4.432128,84.110226
8252,tell,4.173128,64.918218
6925,return,3.913287,50.063232
3580,get,3.561524,35.216816
1579,claim,3.379069,29.343443
7122,say,3.376865,29.278836
7611,so,3.13629,23.018313
