In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

path = '/Users/tomcio/Documents/GitHub/MIT_MBAn_NER/data/'
data = pd.read_csv(path + 'training_data_RAW.csv')

# Assuming 'data' is your DataFrame and it's already loaded
X = data["Name"]
y = data["label"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a pipeline combining a text feature extractor with a simple classifier
pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='char')),
    ('clf', LogisticRegression(solver='liblinear')),  # Using 'liblinear' solver for binary classification
])

# Define the parameter grid to search over
param_grid = {
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3), (1, 4), (2, 4), (1, 5), (2, 5), (3, 5), (1, 6), (2, 6), (3, 6), (4, 6)],
    # You can add more parameters here to test other aspects of the pipeline
}


# Setup the grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters set found on development set:")
print(grid_search.best_params_)
print("Grid scores on development set:")
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

# Evaluate the best grid-searched model on the test data
y_true, y_pred = y_test, grid_search.predict(X_test)
print(classification_report(y_true, y_pred))


Best parameters set found on development set:
{'vect__ngram_range': (1, 5)}
Grid scores on development set:
0.260 (+/-0.025) for {'vect__ngram_range': (1, 1)}
0.756 (+/-0.019) for {'vect__ngram_range': (1, 2)}
0.824 (+/-0.015) for {'vect__ngram_range': (1, 3)}
0.819 (+/-0.013) for {'vect__ngram_range': (2, 3)}
0.836 (+/-0.016) for {'vect__ngram_range': (1, 4)}
0.831 (+/-0.016) for {'vect__ngram_range': (2, 4)}
0.839 (+/-0.011) for {'vect__ngram_range': (1, 5)}
0.834 (+/-0.017) for {'vect__ngram_range': (2, 5)}
0.808 (+/-0.020) for {'vect__ngram_range': (3, 5)}
0.838 (+/-0.016) for {'vect__ngram_range': (1, 6)}
0.831 (+/-0.019) for {'vect__ngram_range': (2, 6)}
0.803 (+/-0.022) for {'vect__ngram_range': (3, 6)}
0.731 (+/-0.025) for {'vect__ngram_range': (4, 6)}
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     13975
           1       0.90      0.78      0.84      1593

    accuracy                           0.97     15568
   macro av