In [6]:
import pdb

import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Classifiers: logistic regression, naive Bayes, random forest
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# model selection/enhancements
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Loading in the data

In [3]:
# Load the interim dataset that the cells below read `cleaned_text` and `target` from.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open('../data/interim/text_target.pkl', 'rb') as f:
    text_target = pickle.load(f)

# Classification weights

In [69]:
# Per-class share of the target column, keyed by class label.
# value_counts() is computed once and reused rather than recomputed for every
# class, and labels are looked up with [] throughout (attribute access only
# works for labels that are valid identifiers, which "c#" and "c++" are not).
class_counts = text_target.target.value_counts()
total = class_counts.sum()

java_weight = class_counts["java"] / total
c_sharp_weight = class_counts["c#"] / total
javascript_weight = class_counts["javascript"] / total
python_weight = class_counts["python"] / total
c_plus_weight = class_counts["c++"] / total

# NOTE(review): each weight equals the class frequency, so more common classes
# receive larger weights. If the intent was to counter class imbalance, inverse
# frequencies (or class_weight="balanced") would be the usual choice - confirm.
weights = {
    "java": java_weight,
    "c#": c_sharp_weight,
    "javascript": javascript_weight,
    "python": python_weight,
    "c++": c_plus_weight,
}

# Train test split

In [4]:
# Inputs are the cleaned text column; labels are the target column.
X, y = text_target.cleaned_text, text_target.target

# 80% of the rows go to training; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

# Pipeline Setup - LogisticRegression

In [149]:
# Two-stage pipeline: TF-IDF vectorizer followed by a multinomial logistic regression
# that uses the per-class `weights` dict as class_weight.
# NOTE(review): GridSearchCV fits clones of the pipeline, so warm_start=True
# probably has no effect here - confirm before relying on it.
steps = [
    ("vec", TfidfVectorizer()),
    (
        "lg",
        LogisticRegression(
            penalty="l2",
            C=16,
            multi_class="multinomial",
            class_weight=weights,
            warm_start=True,
            random_state=42,
            n_jobs=-1,
        ),
    ),
]

pipe = Pipeline(steps, verbose=2)

# Only fit_intercept actually varies; solver and max_iter are pinned to a single value.
# NOTE(review): max_iter=50 is a small iteration budget for lbfgs - check for
# convergence warnings when this runs.
params = dict(
    lg__solver=['lbfgs'],
    lg__fit_intercept=[True, False],
    lg__max_iter=[50],
)

# 3-fold search over the grid above, ranked by plain accuracy.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [151]:
# Run the grid search on the training split; the trailing ";" keeps the
# fitted-object repr out of the cell output.
grid.fit(X_train, y_train);

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:   18.2s remaining:   18.2s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   18.6s finished


[Pipeline] ............... (step 1 of 2) Processing vec, total=   0.9s
[Pipeline] ................ (step 2 of 2) Processing lg, total=   4.1s


In [152]:
# Collect the winning configuration and its cross-validation score, then score
# the best pipeline on the training and held-out splits for comparison.
best_params = grid.best_params_
best_score = grid.best_score_
train_score = grid.best_estimator_.score(X_train, y_train)
test_score = grid.best_estimator_.score(X_test, y_test)

# One value per line: train score, test score, CV score, chosen parameters.
print(train_score, test_score, best_score, best_params, sep="\n")

0.9357287240625387
0.8133971291866029
0.7998432407903965
{'lg__fit_intercept': False, 'lg__max_iter': 50, 'lg__solver': 'lbfgs'}


# Run Vectorization on Texts 

In [5]:
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training texts only, then apply the already-fitted
# vectorizer to the test texts so both matrices share the same feature space.
X_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Create Models

In [70]:
# Candidate classifiers used by the cells below; the two that accept a seed
# here are created with random_state=42.
lg = LogisticRegression(random_state=42)
rf = RandomForestClassifier(random_state=42)
nb = MultinomialNB()

# Gridsearch Models

In [38]:
def make_gridsearch_model(name, model, params=None):
    """Bundle an estimator and its parameter grid into a spec for ``run_gridsearch``.

    Parameters
    ----------
    name : str
        Label printed next to the estimator's score.
    model : object
        The estimator to search over.
    params : dict, optional
        Parameter grid for the search. When omitted, a new empty dict is
        created on every call, so specs never share one default dict.

    Returns
    -------
    tuple
        A 1-tuple ``(spec,)`` whose only element is the dict
        ``{"name": name, "model": model, "params": params}``. The tuple wrapper
        is kept deliberately: ``run_gridsearch`` unwraps each spec with ``[0]``.
    """
    if params is None:
        params = {}
    spec = {
        "name": name,
        "model": model,
        "params": params,
    }
    return (spec,)

In [83]:
def run_gridsearch(data, estimators, cv=3):
    """Grid-search each estimator spec and print its best params and test score.

    Parameters
    ----------
    data : mapping
        Must provide the keys "X_train", "y_train", "X_test" and "y_test".
    estimators : iterable
        Estimator specs. Each item is either a dict with the keys "name",
        "model" and "params", or a tuple/list whose first element is such a
        dict (the shape ``make_gridsearch_model`` returns).
    cv : int, default 3
        Forwarded to ``GridSearchCV`` as its ``cv`` argument.

    Returns
    -------
    None
        Results are reported with ``print`` only.
    """
    for spec in estimators:
        # Specs built by make_gridsearch_model arrive wrapped in a 1-tuple;
        # plain dict specs are accepted as-is instead of failing on spec[0].
        if isinstance(spec, (tuple, list)):
            spec = spec[0]

        search = GridSearchCV(
            estimator=spec["model"],
            param_grid=spec["params"],
            cv=cv,
            n_jobs=-1,
            scoring="balanced_accuracy",
        )
        search.fit(data["X_train"], data["y_train"])

        # NOTE(review): candidates are ranked by balanced accuracy, but the
        # number printed below comes from the estimator's own .score(), which
        # for scikit-learn classifiers is presumably plain accuracy - confirm
        # that mixing the two metrics is intended.
        score = search.best_estimator_.score(data["X_test"], data["y_test"])
        print("Best Params:", search.best_params_)
        print(f"{spec['name']}:\t{score}")

In [90]:
# Only the random forest is searched in this run.
estimators = [
    # Disabled candidates (uncomment to include them in the search):
    #     make_gridsearch_model("svc", svc),
    #     make_gridsearch_model("lg", lg),
    #     make_gridsearch_model("nb", nb),
    make_gridsearch_model(
        "rf",
        rf,
        {
            # Disabled sweep: "max_depth": [x for x in range(10, 51)],
            "n_estimators": list(range(10, 100, 10)),  # 10, 20, ..., 90
        },
    ),
]

# Pre-vectorized TF-IDF matrices plus the matching label splits.
data = dict(
    X_train=X_vectorized,
    X_test=X_test_vectorized,
    y_train=y_train,
    y_test=y_test,
)

run_gridsearch(data, estimators=estimators)

Best Params: {'n_estimators': 90}
rf:	0.7909585876918


# Check OneVsRest Performance on Models

In [68]:
# Wrap the logistic regression in a one-vs-rest scheme and fit it on the
# vectorized training texts (fit returns the fitted estimator itself).
ovr = OneVsRestClassifier(estimator=lg).fit(X_vectorized, y_train)

# Last expression of the cell: score on the vectorized held-out texts.
ovr.score(X_test_vectorized, y_test)



0.8120772149810263