In [27]:
# import libraries 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
import numpy as np
from scipy.stats import uniform
import warnings
warnings.filterwarnings('ignore')

In [2]:
posts_df = pd.read_pickle('../data/interum/text_target.pkl')

In [3]:
posts_df.head(2)

Unnamed: 0,id,body,tags,text,target,cleaned_text,tokens
0,111102,<p>How would you explain JavaScript closures t...,javascript|scope|closures,\n\nHow would you explain JavaScript closures ...,javascript,would explain closure someone knowledge concep...,"[would, explain, closure, someone, knowledge, ..."
1,1642028,<p>After reading <a href= http://groups.google...,c++|c|operators|code-formatting|standards-comp...,\n\nAfter reading Hidden Features and Dark Cor...,c++,read hidden feature dark corner c stl comp lan...,"[read, hidden, feature, dark, corner, c, stl, ..."


In [4]:
# convert into features and target 
feature = posts_df['cleaned_text']
label = posts_df['target']

In [5]:
# split into train test
train_X, test_X, train_y, test_y = train_test_split(
    feature, label, stratify = label, test_size = 0.2, random_state = 42)

## tfidVector Feature

In [6]:
# try tfidVector 
tfid_vect = TfidfVectorizer()
tfid_vect.fit_transform(train_X)

<24241x57072 sparse matrix of type '<class 'numpy.float64'>'
	with 949219 stored elements in Compressed Sparse Row format>

In [7]:
# transform the training and test sets 
x_train_tfidf = tfid_vect.transform(train_X)
x_test_tfidf =tfid_vect.transform(test_X)

In [8]:
print(tfid_vect.get_feature_names)

<bound method CountVectorizer.get_feature_names of TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)>


In [59]:
# initlaize all models
rf = RandomForestClassifier(random_state=42)
lg = LogisticRegression(random_state=42)
svc = SVC(random_state=42)
nb = MultinomialNB()
# xg = XGBClassifier(random_state = 42)
ada = AdaBoostClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)

In [66]:
# try vanilla models 
for base_clf in (rf,lg,svc,nb,ada,gb):
        clf = OneVsRestClassifier(base_clf)
        cv_results = cross_validate(clf,x_train_tfidf,train_y, cv = 3,n_jobs=-1,return_train_score= True)
        print(base_clf)
        print('test:', cv_results['test_score'])
        print('train:', cv_results['train_score'])


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
test: [0.74285361 0.74569979 0.74811239]
train: [0.9977104  0.99740099 0.99733944]
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
test: [0.80076723 0.79940601 0.79848991]
train: [0.88768564 0.88935644 0.89419626]
SVC(C=1.0, cache_si

## we will focus on:

* MultinomialNB
* RandomForest
* LogisticRegression

## hyperparameter for logistic regression 

In [64]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf',lg ),
])
# Create regularization hyperparameter distribution using uniform distribution
C =uniform(loc=0, scale=4)
max_df = np.linspace(0.3,0.75,num = 10)
min_df = np.arange(1,16,2)
max_features = np.arange(5000,10000,500)
parameters = {
        'vect__ngram_range':((1,1),(1,2)),
        'vect__max_df': max_df,
        'vect__min_df':min_df,
        'vect__max_features':max_features,
        'clf': (lg,),
        'clf__penalty': ('l1','l2'),
        'clf__C': C}
clf_lg = RandomizedSearchCV(pipeline, parameters,random_state = 42, n_iter=100, cv = 3, n_jobs=14)

In [65]:
clf_lg.fit(train_X,train_y)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('vect',
                                              TfidfVectorizer(analyzer='word',
                                                              binary=False,
                                                              decode_error='strict',
                                                              dtype=<class 'numpy.float64'>,
                                                              encoding='utf-8',
                                                              input='content',
                                                              lowercase=True,
                                                              max_df=1.0,
                                                              max_features=None,
                                                              min_df=1,
                                        

In [66]:
clf_lg.best_params_

{'clf': LogisticRegression(C=2.343102325093853, class_weight=None, dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                    random_state=42, solver='warn', tol=0.0001, verbose=0,
                    warm_start=False),
 'clf__C': 2.343102325093853,
 'clf__penalty': 'l1',
 'vect__max_df': 0.45,
 'vect__max_features': 8500,
 'vect__min_df': 7,
 'vect__ngram_range': (1, 2)}

In [67]:
clf_lg.best_score_

0.8127965017944804

In [74]:
# finer search
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', lg),
])
parameters = {
        'vect__ngram_range':((1,2),),
        'vect__max_df': (0.4,0.5),
        'vect__min_df':(6,8),
        'vect__max_features':(8000,9000),
        'clf': (lg,),
        'clf__penalty': ('l1','l2'),
        'clf__C': (2,3,4)}
grid_search_lg = GridSearchCV(pipeline, parameters, cv = 3, return_train_score= True,n_jobs=14)

In [75]:
grid_search_lg.fit(train_X,train_y)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [76]:
grid_search_lg.best_params_

{'clf': LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l1',
                    random_state=42, solver='warn', tol=0.0001, verbose=0,
                    warm_start=False),
 'clf__C': 2,
 'clf__penalty': 'l1',
 'vect__max_df': 0.5,
 'vect__max_features': 9000,
 'vect__min_df': 6,
 'vect__ngram_range': (1, 2)}

In [84]:
grid_search_lg.score(train_X,train_y)

0.8929912132337775

In [83]:
grid_search_lg.score(test_X,test_y)

0.8201616894901831

In [117]:
classes = grid_search_lg.classes_

In [109]:
coefs = grid_search_lg.best_estimator_['clf'].coef_

In [118]:
coefs.shape

(5, 9000)

In [114]:
featurenames = grid_search_lg.best_estimator_['vect'].get_feature_names()

In [123]:
coef_dict={}
for i, cls in enumerate(classes):
    coef_dict[cls]=[]
    for c, f in zip(coefs[i],featurenames):
        if c:
            coef_dict[cls].append((f,c))

In [128]:
sorted(coef_dict['c#'],key=lambda x: x[1],reverse = True)[:10]

[('writeline', 34.35443542592411),
 ('net', 14.409952259671503),
 ('msdn', 13.406405098449603),
 ('ienumerable', 12.597638498674106),
 ('script jquery', 12.258235070200952),
 ('streamreader', 12.207305885657282),
 ('linq', 12.060271676062444),
 ('window form', 11.696983129235713),
 ('winforms', 11.34889725246229),
 ('entity framework', 11.139816376850316)]

In [129]:
sorted(coef_dict['c#'],key=lambda x: x[1])[:10]

[('println', -29.522974297537914),
 ('django', -17.091895906609306),
 ('pythonic', -16.322255328863903),
 ('spring', -16.15729795551023),
 ('sun', -15.73658807129685),
 ('def', -15.077989494940644),
 ('jquery', -15.06177845407277),
 ('std', -13.810242160767265),
 ('gcc', -13.09393849381091),
 ('jvm', -12.65241004520841)]

## hyperparmater for RandomForest 

In [135]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf',rf),
])
# Create regularization hyperparameter distribution using uniform distribution
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=800, num=10)]
# Number of features to consider at every split
mf = ['auto', 'sqrt']
# Minimum number of samples required to split a node
min_samples_split = [10, 20, 30]
max_df = np.linspace(0.3,0.75,num = 10)
min_df = np.arange(1,16,2)
max_features = np.arange(5000,10000,500)
parameters = {
        'vect__ngram_range':((1,1),(1,2)),
        'vect__max_df': max_df,
        'vect__min_df':min_df,
        'vect__max_features':max_features,
        'clf': (rf,),
        'clf__n_estimators': n_estimators,
        'clf__max_features': mf,
        'clf__min_samples_split': min_samples_split}
clf_rf = RandomizedSearchCV(pipeline, parameters,random_state = 42, n_iter=100, cv = 3, n_jobs=14)

In [None]:
clf_rf.fit(train_X,train_y)

In [13]:
rf_random.best_params_

{'n_estimators': 200, 'min_samples_split': 10, 'max_features': 'auto'}

## multinominalNB 

In [23]:
# Create regularization penalty space


# Create regularization hyperparameter distribution using uniform distribution
alpha =uniform(loc=0, scale=4)

# Create hyperparameter options
hyperparameters = dict(alpha = alpha)
clf = RandomizedSearchCV(nb, hyperparameters, random_state= 42,
                         n_iter=100, cv=5, verbose=0, n_jobs=-1)

clf.fit(x_train_tfidf,train_y)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                           fit_prior=True),
                   iid='warn', n_iter=100, n_jobs=-1,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f177db7f4e0>},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=False, scoring=None, verbose=0)

In [24]:
clf.best_params_

{'alpha': 0.08233797718320979}

In [None]:
## Find best tfidVector using naivebaye model 


In [48]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', lg),
])
parameters = {
        'vect__ngram_range':((1,1),(1,2)),
        'vect__max_df': (0.3,0.5, 0.75),
        'vect__min_df':(5, 10,15),
        'vect__max_features':(4000,5000,6000),
        'clf': (lg,),
        'clf__penalty': ('l1',),
        'clf__C': (3.895,)}
grid_search = GridSearchCV(pipeline, parameters, cv = 3, return_train_score= True)

In [49]:
grid_search.fit(train_X,train_y)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [50]:
grid_search.best_params_

{'clf': LogisticRegression(C=3.895, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l1',
                    random_state=42, solver='warn', tol=0.0001, verbose=0,
                    warm_start=False),
 'clf__C': 3.895,
 'clf__penalty': 'l1',
 'vect__max_df': 0.5,
 'vect__max_features': 6000,
 'vect__min_df': 5,
 'vect__ngram_range': (1, 1)}

In [51]:
grid_search.best_score_

0.8062786188688585

In [35]:
x_train_tfidf.shape

(24241, 57072)