# Imports

In [36]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# Which stop words do I want to use? -- use native sklearns 
from sklearn.feature_extraction import stop_words
import nltk
from nltk.corpus import stopwords

# numpy was never imported in this notebook, so the seeding call below raised
# NameError on a fresh kernel (Restart & Run All). Import it before use.
import numpy as np

# Seed numpy's global random state so stochastic steps are repeatable.
np.random.seed(42)

In [2]:
# Read the cleaned dataset and drop its 'index' column in a single chain,
# then preview the first rows.
df = pd.read_csv('../data/clean_final.csv').drop(columns=['index'])
df.head()

Unnamed: 0,Id,CreationDate,Title,Body,Tags,ViewCount,AnswerCount,CommentCount,target_tags,overlap_tags,target_class,body_clean
0,53992219,2019-01-01 00:01:55,How to programmatically change style sheet of ...,<p>I have so many buttons on a dialog and I wa...,"['c++', 'qt', 'qt5', 'qtstylesheets', 'qpushbu...",775,2,2,{'c++'},c++,4,i have so many buttons on a dialog and i want ...
1,53992223,2019-01-01 00:02:37,Unable to print a class list attribute using i...,<p>I am designing a deck class that has <stron...,"['python', 'python-3.x', 'list', 'class', 'pri...",40,2,0,"{'python-3.x', 'python'}",python-3.x python,5,i am designing a deck class that has init meth...
2,53992234,2019-01-01 00:05:48,How to rearrange subplots so that one is under...,<p>I am trying to code two plots such that one...,"['python', 'matplotlib', 'subplot']",519,1,1,{'python'},python,5,i am trying to code two plots such that one pl...
3,53992248,2019-01-01 00:09:24,Function always returns 1,<p>I´m trying to write a simple branch predict...,"['c++', 'function']",150,1,21,{'c++'},c++,4,i m trying to write a simple branch predictor ...
4,53992252,2019-01-01 00:11:20,possible to speed up this query?,<p>I have the following query which takes a li...,"['sql', 'postgresql']",40,1,0,{'sql'},sql,0,i have the following query which takes a littl...


In [3]:
# Number of missing values in each column.
df.isna().sum()

Id              0
CreationDate    0
Title           0
Body            0
Tags            0
ViewCount       0
AnswerCount     0
CommentCount    0
target_tags     0
overlap_tags    0
target_class    0
body_clean      1
dtype: int64

In [4]:
# Show the row(s) whose cleaned body text is missing.
df.loc[df['body_clean'].isna()]

Unnamed: 0,Id,CreationDate,Title,Body,Tags,ViewCount,AnswerCount,CommentCount,target_tags,overlap_tags,target_class,body_clean
91170,55234780,2019-03-19 06:21:07,1st data is officeIn and 2nd data is OfficeOut...,"<p><a href=""https://i.stack.imgur.com/P3SNS.pn...","['mysql', 'sql']",26,1,0,"{'sql', 'mysql'}",sql mysql,0,


In [5]:
# Raw HTML body of row 91170, the row with no cleaned text.
df.at[91170, 'Body']

'<p><a href="https://i.stack.imgur.com/P3SNS.png" rel="nofollow noreferrer"><img src="https://i.stack.imgur.com/P3SNS.png" alt="enter image description here"></a></p>\n\n<pre><code>1476    5   2019-03-18 09:35:06.000\n1487    5   2019-03-18 13:19:53.000\n1488    5   2019-03-18 13:37:40.000\n1495    5   2019-03-18 15:09:38.000\n1497    5   2019-03-18 15:18:26.000\n1503    5   2019-03-18 17:34:46.000\n1504    5   2019-03-18 17:48:23.000\n1511    5   2019-03-18 19:14:51.000\n</code></pre>\n'

In [6]:
# Remove every row that has a missing value in any column; in this dataset
# that is the single row lacking 'body_clean'.
df = df.dropna(axis=0, how='any')

In [7]:
# Re-check missing values per column after the drop.
df.isna().sum()

Id              0
CreationDate    0
Title           0
Body            0
Tags            0
ViewCount       0
AnswerCount     0
CommentCount    0
target_tags     0
overlap_tags    0
target_class    0
body_clean      0
dtype: int64

# Modeling 

## Model Prep 

In [8]:
# Features are the cleaned question bodies; the label is the encoded tag class.
X, y = df['body_clean'], df['target_class']

Target Tags: 
- 0 =  SQL
- 1 = Scala
- 2 = R
- 3 = Julia
- 4 = C++
- 5 = Python

### Baseline Accuracies

In [9]:
# Share of each target class. The largest share is the accuracy obtained by
# always predicting the majority class, i.e. the baseline to beat.
y.value_counts(normalize = True)

5    0.552299
0    0.211042
2    0.110994
4    0.103900
1    0.019045
3    0.002720
Name: target_class, dtype: float64

In [10]:
# Hold out 25% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

### CountVectorizer + Logistic Regression: 
#### Create Pipeline and Gridsearch

In [11]:
# Bag-of-words pipeline:
#   'cvec' -> CountVectorizer (transformer)
#   'lr'   -> multinomial LogisticRegression (estimator)

# Iteration cap for the solver, raised to help avoid convergence warnings.
max_iterations = 2_000

pipe_cvec = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(
        solver='lbfgs',               # set explicitly to silence the FutureWarning
        multi_class='multinomial',    # set explicitly to silence the FutureWarning
        max_iter=max_iterations,
    )),
])

In [12]:
# Hyperparameter grid for the CountVectorizer step of pipe_cvec.
# History: max_features started at [100, 500] and was raised to [400, 600];
# the (1, 1) n-gram range was removed, leaving only (1, 2) for now.
# Possible follow-ups: extend the stop-word list, tune max_df / min_df.
pipe_cvec_params = dict(
    cvec__max_features=[400, 600],
    cvec__ngram_range=[(1, 2)],
    cvec__stop_words=['english'],
)

In [13]:
# Grid search over the CountVectorizer pipeline with 3-fold cross-validation.
gs_cvec = GridSearchCV(estimator=pipe_cvec, param_grid=pipe_cvec_params, cv=3)

In [14]:
# Run the CountVectorizer grid search on the training split.
gs_cvec.fit(X=X_train, y=y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...enalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'cvec__max_features': [400, 600], 'cvec__ngram_range': [(1, 2)], 'cvec__stop_words': ['english']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [15]:
# Parameter combination with the best cross-validated score.
gs_cvec.best_params_

{'cvec__max_features': 600,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english'}

In [16]:
# Keep a handle on the best pipeline found by the CountVectorizer grid search
# (the grid search was created with the default refit=True, see its repr above).
gs_cvec_model = gs_cvec.best_estimator_

In [17]:
# CountVectorizer model: accuracy on the training split.
gs_cvec_model.score(X=X_train, y=y_train)

0.8996387511109837

In [18]:
# CountVectorizer model: accuracy on the held-out test split.
gs_cvec_model.score(X=X_test, y=y_test)

0.8967608202022982

### TFIDFVectorizer + Logistic Regression: 
#### Create Pipeline and Gridsearch

In [24]:
# TF-IDF pipeline:
#   'tvec' -> TfidfVectorizer (transformer)
#   'lr'   -> multinomial LogisticRegression (estimator)

# Iteration cap for the solver, raised to help avoid convergence warnings.
max_iterations = 2_000

pipe_tvec = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),
    ('lr', LogisticRegression(
        solver='lbfgs',               # set explicitly to silence the FutureWarning
        multi_class='multinomial',    # set explicitly to silence the FutureWarning
        max_iter=max_iterations,
    )),
])

In [25]:
# Hyperparameter grid for the TfidfVectorizer step of pipe_tvec.
# History: max_features started at [100, 500] and was raised to [400, 600];
# the (1, 1) n-gram range was removed, leaving only (1, 2) for now.
# Possible follow-ups: extend the stop-word list, tune max_df / min_df.
pipe_tvec_params = dict(
    tvec__max_features=[400, 600],
    tvec__ngram_range=[(1, 2)],
    tvec__stop_words=['english'],
)

In [26]:
# Grid search over the TF-IDF pipeline with 3-fold cross-validation.
gs_tvec = GridSearchCV(estimator=pipe_tvec, param_grid=pipe_tvec_params, cv=3)

In [27]:
# Run the TF-IDF grid search on the training split.
gs_tvec.fit(X=X_train, y=y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tvec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...enalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'tvec__max_features': [400, 600], 'tvec__ngram_range': [(1, 2)], 'tvec__stop_words': ['english']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [28]:
# Parameter combination with the best cross-validated score.
gs_tvec.best_params_

{'tvec__max_features': 600,
 'tvec__ngram_range': (1, 2),
 'tvec__stop_words': 'english'}

In [29]:
# Keep a handle on the best pipeline found by the TF-IDF grid search
# (the grid search was created with the default refit=True, see its repr above).
gs_tvec_model = gs_tvec.best_estimator_

In [30]:
# TF-IDF model: accuracy on the training split.
gs_tvec_model.score(X=X_train, y=y_train)

0.9085696264227758

In [32]:
# TF-IDF model: accuracy on the held-out test split.
gs_tvec_model.score(X=X_test, y=y_test)

0.9071681689947018

### TFIDFVectorizer + Random Forest: 
#### Create Pipeline and Gridsearch

In [None]:
# TF-IDF + Random Forest pipeline:
# 1. TfidfVectorizer instance (transformer)
# 2. RandomForestClassifier instance (estimator)
#
# The previous version passed max_iter / solver / multi_class to
# RandomForestClassifier. Those are LogisticRegression parameters that the
# forest does not accept, so building the pipeline raised TypeError.
#
# NOTE(review): this rebinds `pipe_tvec`, replacing the TF-IDF +
# LogisticRegression pipeline defined earlier. The name is kept so any later
# cells that reference it keep working; consider renaming (e.g. pipe_tvec_rf)
# together with those cells.

pipe_tvec = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('rf', RandomForestClassifier(n_estimators = 100,      # explicit size, so results do not depend on the library default
                                  random_state = 42))      # fixed seed so the fitted forest is reproducible
])