In [1]:
import pdb

import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Classifiers: logistic regression, naive Bayes, random forest
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# model selection/enhancements
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Loading in the data

In [2]:
# Deserialize the text/target table from the interim data directory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# this presumably reads a project-generated artifact -- confirm it is trusted.
with open('../data/interim/text_target.pkl', 'rb') as pkl_file:
    text_target = pickle.load(pkl_file)

# Classification weights

In [20]:
# Relative frequency of each target label, later passed as `class_weight`.
# The label counts are computed once and normalised in a single vectorised
# division instead of re-running `value_counts()` for every label.
class_counts = text_target.target.value_counts()
total = class_counts.sum()
class_share = class_counts / total

# Individual names are kept so that any cell referring to them still works.
java_weight = class_share["java"]
c_sharp_weight = class_share["c#"]
javascript_weight = class_share["javascript"]
python_weight = class_share["python"]
c_plus_weight = class_share["c++"]

# NOTE(review): these weights are proportional to class frequency, so the
# most common labels receive the largest weight. If the intent is to
# counteract class imbalance, inverse-frequency weights (or
# class_weight="balanced") would be the usual choice -- confirm.
weights = {
    "java": java_weight,
    "c#": c_sharp_weight,
    "javascript": javascript_weight,
    "python": python_weight,
    "c++": c_plus_weight,
}
weights

{'java': 0.24651838162497525,
 'c#': 0.230545838558511,
 'javascript': 0.2291597914329087,
 'python': 0.16583063824170022,
 'c++': 0.12794535014190483}

# Train test split

In [4]:
# Features: the cleaned text column. Labels: the target column.
X, y = text_target.cleaned_text, text_target.target

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

# Pipeline - LogisticRegression

In [35]:
# TF-IDF features feeding a multinomial, L2-regularised logistic regression.
# `weights` is the label -> weight dict built in the class-weights cell.
steps = [
    ("vec", TfidfVectorizer()),
    ("lg", LogisticRegression(
        random_state=42,
        class_weight=weights,
        penalty="l2",
        multi_class="multinomial",
        C=16,
        warm_start=True,
        n_jobs=-1,
    )),
]

# Pipeline.verbose is a boolean flag (print the time taken by each step).
# It was previously given the int 2, which only worked by being truthy and
# is rejected by scikit-learn's parameter validation in recent releases.
pipe_lg = Pipeline(steps, verbose=True)

# Search space: only `fit_intercept` varies; solver and iteration cap are
# pinned to a single value each.
params = {
    "lg__solver": ['lbfgs'],
    "lg__fit_intercept": [True, False],
    "lg__max_iter": [50],
}

# 3-fold cross-validated grid search scored by accuracy, run on all cores.
grid = GridSearchCV(
    estimator=pipe_lg,
    param_grid=params,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2,
)

In [36]:
# Run the logistic-regression grid search on the training split.
grid.fit(X_train, y_train)

# Cross-validation summary of the winning candidate.
best_params = grid.best_params_
best_score = grid.best_score_

# Accuracy of the refitted best pipeline on the train and held-out splits.
train_score = grid.best_estimator_.score(X_train, y_train)
test_score = grid.best_estimator_.score(X_test, y_test)

# Report in a fixed order: train, test, CV best score, best parameters.
for caption, value in (
    ("Train Score:\t", train_score),
    ("Test Score:\t", test_score),
    ("Best Score:\t", best_score),
    ("Best Params:\t", best_params),
):
    print(caption, value)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:   18.7s remaining:   18.7s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   19.0s finished


[Pipeline] ............... (step 1 of 2) Processing vec, total=   0.8s
[Pipeline] ................ (step 2 of 2) Processing lg, total=   4.0s
Train Score:	 0.9357287240625387
Test Score:	 0.8133971291866029
Best Score:	 0.7998432407903965
Best Params:	 {'lg__fit_intercept': False, 'lg__max_iter': 50, 'lg__solver': 'lbfgs'}


# Pipeline - Naive Bayes

In [33]:
# TF-IDF features feeding a multinomial naive Bayes classifier.
steps = [
    ("vec", TfidfVectorizer()),
    ("nb", MultinomialNB()),
]

# Pipeline.verbose is a boolean flag (print the time taken by each step).
# It was previously given the int 2, which only worked by being truthy and
# is rejected by scikit-learn's parameter validation in recent releases.
pipe_nb = Pipeline(steps, verbose=True)

# Search space: learned vs. uniform class priors, crossed with five
# smoothing strengths (2 x 5 = 10 candidates).
params = {
    "nb__fit_prior": [True, False],
    "nb__alpha": [0.01, 0.1, 1, 5, 10],
}

# 3-fold cross-validated grid search scored by accuracy.
# error_score=0.0: a candidate whose fit raises is scored 0.0 instead of
# aborting the whole search.
grid_nb = GridSearchCV(
    pipe_nb,
    params,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2,
    error_score=0.0,
)

In [34]:
# Run the naive Bayes grid search on the training split.
grid_nb.fit(X_train, y_train)

# Pull the refitted winner out once, then score it on both splits.
nb_best = grid_nb.best_estimator_
train_score = nb_best.score(X_train, y_train)
test_score = nb_best.score(X_test, y_test)

# Mean cross-validated accuracy and parameters of the winning candidate.
best_score = grid_nb.best_score_
best_params = grid_nb.best_params_

# Same four-line report as the other model sections.
for caption, value in (
    ("Train Score:\t", train_score),
    ("Test Score:\t", test_score),
    ("Best Score:\t", best_score),
    ("Best Params:\t", best_params),
):
    print(caption, value)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.0s finished


[Pipeline] ............... (step 1 of 2) Processing vec, total=   0.8s
[Pipeline] ................ (step 2 of 2) Processing nb, total=   0.1s
Train Score:	 0.9083783672290747
Test Score:	 0.8130671506352087
Best Score:	 0.8023183862051896
Best Params:	 {'nb__alpha': 0.1, 'nb__fit_prior': False}


# Pipeline - Random Forest

In [118]:
# TF-IDF features feeding a random forest with a fixed seed.
steps = [
    ("vec", TfidfVectorizer()),
    ("rf", RandomForestClassifier(random_state=42)),
]

# Pipeline.verbose is a boolean flag (print the time taken by each step).
# It was previously given the int 2, which only worked by being truthy and
# is rejected by scikit-learn's parameter validation in recent releases.
pipe_rf = Pipeline(steps, verbose=True)

# Every hyper-parameter is pinned to one value, so the "grid" holds a single
# candidate and the search only provides its 3-fold cross-validated score.
params = {
    "rf__criterion": ["gini"],
    "rf__max_depth": [1500],
    "rf__max_leaf_nodes": [1000],
    "rf__min_samples_split": [2],
    "rf__min_samples_leaf": [2],
    "rf__n_estimators": [1000],
    # Disabled candidate: "rf__class_weight": [None, weights],
}

# error_score=0.0: a candidate whose fit raises is scored 0.0 instead of
# aborting the whole search.
grid_rf = GridSearchCV(
    pipe_rf,
    params,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2,
    error_score=0.0,
)

In [None]:
# Run the random forest grid search on the training split.
grid_rf.fit(X_train, y_train)

# Accuracy of the refitted best pipeline on the train and held-out splits.
rf_best = grid_rf.best_estimator_
train_score = rf_best.score(X_train, y_train)
test_score = rf_best.score(X_test, y_test)

# Mean cross-validated accuracy and parameters of the winning candidate.
best_score = grid_rf.best_score_
best_params = grid_rf.best_params_

# Same four-line report as the other model sections.
for caption, value in (
    ("Train Score:\t", train_score),
    ("Test Score:\t", test_score),
    ("Best Score:\t", best_score),
    ("Best Params:\t", best_params),
):
    print(caption, value)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   51.5s finished


[Pipeline] ............... (step 1 of 2) Processing vec, total=   0.8s
