# Imports

In [10]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Which stop words do I want to use? -- use native sklearns 
from sklearn.feature_extraction import stop_words
import nltk
from nltk.corpus import stopwords

In [2]:
df = pd.read_csv('../data/clean_final.csv')
df = df.drop(['index'], axis = 1)
df.head()

Unnamed: 0,Id,CreationDate,Title,Body,Tags,ViewCount,AnswerCount,CommentCount,target_tags,overlap_tags,target_class,body_clean
0,53992219,2019-01-01 00:01:55,How to programmatically change style sheet of ...,<p>I have so many buttons on a dialog and I wa...,"['c++', 'qt', 'qt5', 'qtstylesheets', 'qpushbu...",775,2,2,{'c++'},c++,4,i have so many buttons on a dialog and i want ...
1,53992223,2019-01-01 00:02:37,Unable to print a class list attribute using i...,<p>I am designing a deck class that has <stron...,"['python', 'python-3.x', 'list', 'class', 'pri...",40,2,0,"{'python-3.x', 'python'}",python-3.x python,5,i am designing a deck class that has init meth...
2,53992234,2019-01-01 00:05:48,How to rearrange subplots so that one is under...,<p>I am trying to code two plots such that one...,"['python', 'matplotlib', 'subplot']",519,1,1,{'python'},python,5,i am trying to code two plots such that one pl...
3,53992248,2019-01-01 00:09:24,Function always returns 1,<p>I´m trying to write a simple branch predict...,"['c++', 'function']",150,1,21,{'c++'},c++,4,i m trying to write a simple branch predictor ...
4,53992252,2019-01-01 00:11:20,possible to speed up this query?,<p>I have the following query which takes a li...,"['sql', 'postgresql']",40,1,0,{'sql'},sql,0,i have the following query which takes a littl...


# Modeling 

## Model Prep 

In [3]:
X = df['body_clean']
y = df['target_class']

Target Tags: 
- 0 =  SQL
- 1 = Scala
- 2 = R
- 3 = Julia
- 4 = C++
- 5 = Python

### Basline Accuracies 

In [6]:
y.value_counts(normalize = True)

5    0.552298
0    0.211044
2    0.110993
4    0.103900
1    0.019045
3    0.002720
Name: target_class, dtype: float64

In [9]:
X_train,X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   test_size = 0.25,
                                                   stratify = y,
                                                   random_state = 42)

### CountVectorizer + Logistic Regression: 
#### Create Pipeline and Gridsearch

In [15]:
# CountVectorizer Pipeline:
# 1. CountVectorizer Instance (transformer)
# 2. LogisticRegression Instance (estimator)

pipe_cvec = Pipeline([
    ('cvec',CountVectorizer()),        
    ('lr',LogisticRegression())         
])

In [16]:
#CountVectorizer Parameters 
# features = [100,500]
# english stop words = included & removed 
# ngram range = [the default ngram_range = (1,1), ngram_range that includes 1 and 2 = (1,2)

pipe_cvec_params = {
    'cvec__max_features': [100,500],
    'cvec__ngram_range': [(1,1),(1,2)],
    'cvec__stop_words' : [None,'english']
    
}

In [17]:
# Instantiate CountVectorizer GridSearchCV
gs_cvec = GridSearchCV(pipe_cvec,
                       param_grid = pipe_cvec_params,
                       cv = 3)

In [18]:
# Fit GridSearch(CVEC) to training data.
gs_cvec.fit(X_train,y_train)





GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'cvec__max_features': [100, 500], 'cvec__ngram_range': [(1, 1), (1, 2)], 'cvec__stop_words': [None, 'english']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [19]:
gs_cvec.best_params_

{'cvec__max_features': 500,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english'}

In [20]:
# save best model 
gs_cvec_model = gs_cvec.best_estimator_

In [21]:
# CountVectorize Accuracy on training 
gs_cvec_model.score(X_train,y_train)

0.8941400437511289

In [23]:
# CountVectorize Accuracy on testing
gs_cvec_model.score(X_test,y_test)

ValueError: np.nan is an invalid document, expected byte or unicode string.