In [22]:
import random
import warnings

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Path of the pickled DataFrame that load_sampled() reads; it is relative, so
# it resolves against the notebook's working directory.
amazon_link = '../Data/amazon_phone.pkl'

# TODO: import load_sampled from the Scripts.loading module instead of
# redefining it in this notebook, e.g.:
# from Scripts import loading as dl
# df = dl.load_sampled(amazon_link, 5000)
def load_sampled(link, per_class):
    """Load a pickled DataFrame and draw a class-balanced sample from it.

    For every label value 1.0 through 5.0, ``per_class`` rows are drawn from
    the rows carrying that label: without replacement when the class holds at
    least ``per_class`` rows, otherwise with replacement (oversampling).

    Parameters
    ----------
    link : str or path-like
        Location of the pickle; handed to ``pandas.read_pickle``.
        NOTE(review): unpickling can execute arbitrary code, so only point
        this at files from a trusted source.
    per_class : int
        Number of rows to draw for each label. Must be non-negative.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``text_prep`` and ``label``, taken from column positions
        2 and 1 of the pickled frame (positions, not names, are used).
        Rows are grouped by label in ascending order with a fresh 0..n-1 index.
        Labels that have no rows in the source are skipped with a warning.

    Raises
    ------
    ValueError
        If ``per_class`` is negative.

    Notes
    -----
    The global ``random`` generator is reseeded with 123 before each class is
    sampled, so the result is deterministic but the caller's global RNG state
    is overwritten.
    """
    if per_class < 0:
        raise ValueError(f'per_class must be non-negative, got {per_class}')

    df = pd.read_pickle(link)

    sampled_frames = []
    for label in (1.0, 2.0, 3.0, 4.0, 5.0):
        rows = df[df['label'] == label].values.tolist()
        if not rows:
            # random.choices() on an empty population raises an opaque
            # IndexError; report the missing class explicitly instead.
            warnings.warn(f'no rows with label {label} in {link}; class skipped',
                          stacklevel=2)
            continue

        # Reseed per class so each class's draw is independent of the others.
        random.seed(123)
        if len(rows) >= per_class:
            picked = random.sample(rows, per_class)
        else:
            # Too few rows for a draw without replacement: oversample.
            picked = random.choices(rows, k=per_class)

        if picked:
            sampled_frames.append(pd.DataFrame(picked))

    if sampled_frames:
        df_all = pd.concat(sampled_frames, ignore_index=True)
        # The row lists lost the column names; select text (position 2) and
        # label (position 1) by position and name them.
        df_all = df_all[[2, 1]]
        df_all.columns = ['text_prep', 'label']
    else:
        # Nothing was sampled (per_class == 0 or no matching labels).
        df_all = pd.DataFrame(columns=['text_prep', 'label'])

    print(f'{per_class} reviews per class from {link} loaded')
    return df_all


SEED = 123  # one seed for every stochastic step in this cell

df = load_sampled(amazon_link, 5000)
# Keep 1% of the balanced sample (presumably to keep the grid search quick).
df = df.sample(frac=0.01, random_state=SEED)
print(df['label'].value_counts())
target = df['label']
text = df['text_prep']

# Seed the hold-out split as well; with random_state=None the split, and so
# every score below, changed on each run.
X_train, X_test, y_train, y_test = train_test_split(
    text, target, test_size=0.3, random_state=SEED)

# Default vectorizer of the pipeline. The first grid below tunes this TF-IDF
# step; the second grid swaps the step for a CountVectorizer.
count = TfidfVectorizer()
param_grid = [{
    'vect__max_df': [0.5, 0.75, 0.8, 0.9, 1.0],
    'vect__min_df': [1, 2, 3, 5, 10, 20],
    'vect__binary': [True, False]
}, {
    'vect': [CountVectorizer(),],
    'vect__max_df': [0.5, 0.75, 0.8, 0.9, 1.0],
    'vect__min_df': [1, 2, 3, 5, 10],

}]

# random_state only matters when shuffle=True; combining it with shuffle=False
# is rejected by current scikit-learn. The folds stay unshuffled, as before.
cv = StratifiedKFold(n_splits=5)

# `multi_class` is deprecated in recent scikit-learn; for a multiclass target
# the saga solver already fits a multinomial model, and L2 is the default
# penalty, so neither argument needs to be passed.
ml_features = Pipeline([
    ('vect', count),
    ('clf', LogisticRegression(C=10.0, solver='saga', random_state=SEED)),
])

gs_ml_features = GridSearchCV(ml_features, param_grid, scoring='f1_macro', cv=cv, verbose=3, n_jobs=-1)
gs_ml_features.fit(X_train, y_train)
print('best parameters')
print(gs_ml_features.best_params_)
print('best score')
print(gs_ml_features.best_score_)


# One row per candidate: its parameters plus the mean cross-validated macro-F1.
cv_results = gs_ml_features.cv_results_
pd.DataFrame(cv_results['params']).assign(f1_macro=cv_results['mean_test_score'])



5000 reviews per class from ../Data/amazon_phone.pkl loaded
3.0    54
5.0    54
4.0    53
1.0    45
2.0    44
Name: label, dtype: int64
Fitting 5 folds for each of 85 candidates, totalling 425 fits
best parameters
{'vect': CountVectorizer(max_df=0.75, min_df=5), 'vect__max_df': 0.75, 'vect__min_df': 5}
best score
0.35961567843920783


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 410 out of 425 | elapsed:    4.7s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 425 out of 425 | elapsed:    4.8s finished


Unnamed: 0,vect__binary,vect__max_df,vect__min_df,vect,f1_macro
0,True,0.5,1,,0.319691
1,True,0.5,2,,0.305820
2,True,0.5,3,,0.269140
3,True,0.5,5,,0.277963
4,True,0.5,10,,0.250416
...,...,...,...,...,...
80,,1.0,1,"CountVectorizer(max_df=0.75, min_df=5)",0.346092
81,,1.0,2,"CountVectorizer(max_df=0.75, min_df=5)",0.341126
82,,1.0,3,"CountVectorizer(max_df=0.75, min_df=5)",0.352759
83,,1.0,5,"CountVectorizer(max_df=0.75, min_df=5)",0.359616
