In [2]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, SVR
from sklearn.pipeline import Pipeline

# Relative path of the pickle file that load_sampled() reads below.
amazon_link = '../Data/amazon_phone.pkl'

# TODO: import the loader from the shared module instead of redefining it
# in this cell, e.g.:
# from Scripts import loading as dl
# df = dl.load_sampled(amazon_link, 5000)
def load_sampled(link, per_class):
    """Load a pickled review DataFrame and return a class-balanced sample.

    For each rating label 1.0 .. 5.0 found in the 'label' column, exactly
    ``per_class`` rows are drawn with a fixed seed (123). Rows are drawn
    without replacement when the class is large enough, and with
    replacement (oversampling) otherwise.

    Parameters
    ----------
    link : str or path-like
        Location of the pickle file, passed to ``pd.read_pickle``.
    per_class : int
        Number of rows to draw for every label.

    Returns
    -------
    pandas.DataFrame
        Columns ``['text_prep', 'label']``, rows ordered by label 1 .. 5,
        with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If a label has no rows at all while ``per_class`` is positive.

    Notes
    -----
    Columns are picked by position (2 -> 'text_prep', 1 -> 'label');
    this assumes the pickled frame stores the label in its second column
    and the preprocessed text in its third -- TODO confirm against the
    data file.
    """
    # NOTE(security): unpickling can execute arbitrary code; only point
    # this at trusted files.
    df = pd.read_pickle(link)

    sampled_rows = []
    for label in (1.0, 2.0, 3.0, 4.0, 5.0):
        rows = df[df['label'] == label].values.tolist()
        if not rows and per_class > 0:
            raise ValueError(
                f'no rows with label {label} in {link}; cannot sample {per_class}')

        # A private generator yields the same draws as random.seed(123)
        # followed by the module-level functions, but leaves the global
        # random state untouched.
        rng = random.Random(123)
        try:
            picked = rng.sample(rows, per_class)
        except ValueError:
            # Fewer rows than requested: oversample with replacement,
            # restarting from the same seed.
            rng = random.Random(123)
            picked = rng.choices(rows, k=per_class)
        sampled_rows.extend(picked)

    df_all = pd.DataFrame(sampled_rows)
    # Row lists lose the column names, so the columns are addressed by position.
    df_all = df_all[[2, 1]]
    df_all.columns = ['text_prep', 'label']
    print(f'{per_class} reviews per class from {link} loaded')
    return df_all


def _report_grid_search(gs):
    """Print the best parameters, the best score and a per-candidate table.

    ``gs`` is a fitted GridSearchCV. The table places every entry of
    ``cv_results_["params"]`` next to its ``cv_results_["mean_test_score"]``
    (labelled ``f1_macro``, the scoring used by the searches below).
    """
    print('best parameters')
    print(gs.best_params_)
    print('best score')
    print(gs.best_score_)
    print(pd.concat([pd.DataFrame(gs.cv_results_["params"]),
                     pd.DataFrame(gs.cv_results_["mean_test_score"], columns=["f1_macro"])],
                    axis=1))


df = load_sampled(amazon_link, 5000)
# Keep only 1% of the balanced sample; seeded so that a rerun sees the
# same rows and the reported scores can be reproduced.
df = df.sample(frac=0.01, random_state=123)
target = df.label
text = df.text_prep

# Seeded like every other random step in this cell (sampling, CV, models).
X_train, X_test, y_train, y_test = train_test_split(text, target, test_size=0.3, random_state=123)

tfidf = TfidfVectorizer()

# Two sub-grids: the first tunes the pipeline's own 'vect' step (the
# TfidfVectorizer above), the second swaps that step for a CountVectorizer.
param_grid = [{
    'vect__ngram_range': [(1, 1), (1, 2), (1,3)],
    'vect__max_df': [0.5, 0.75, 0.8, 0.9, 1.0],
    'vect__min_df': [1, 2, 3, 5, 10, 20],
    'vect__binary': [True, False]
}, {
    'vect': [CountVectorizer(),],
    'vect__ngram_range': [(1, 1), (1, 2), (1,3)],
    'vect__max_df': [0.5, 0.75, 0.8, 0.9, 1.0],
    'vect__min_df': [1, 2, 3, 5, 10],

}]
# The same shuffled, seeded folds are used for both classifiers.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

print('######## RUN SVC')
svc_features = Pipeline([('vect', tfidf),
                        ('clf', SVC(C=1.0, decision_function_shape='ovo', gamma='auto', kernel='linear', random_state=123))])
gs_svc_features = GridSearchCV(svc_features, param_grid, scoring='f1_macro', cv=cv, verbose=3, n_jobs=-1)
gs_svc_features.fit(X_train, y_train)
_report_grid_search(gs_svc_features)


print('######## RUN LOGISTIC REGRESSION')
lr_features = Pipeline([('vect', tfidf),
                        ('clf', LogisticRegression(C=10.0, multi_class='multinomial', penalty='l2', solver='saga', random_state=123))])

gs_lr_features = GridSearchCV(lr_features, param_grid, scoring='f1_macro', cv=cv, verbose=3, n_jobs=-1)
gs_lr_features.fit(X_train, y_train)
_report_grid_search(gs_lr_features)

5000 reviews per class from ../Data/amazon_phone.pkl loaded
######## RUN SVC
Fitting 5 folds for each of 255 candidates, totalling 1275 fits
best parameters
{'vect': CountVectorizer(max_df=0.75, min_df=3, ngram_range=(1, 3)), 'vect__max_df': 0.75, 'vect__min_df': 3, 'vect__ngram_range': (1, 3)}
best score
0.2608985994935652
    vect__binary  vect__max_df  vect__min_df vect__ngram_range  \
0           True           0.5             1            (1, 1)   
1           True           0.5             1            (1, 2)   
2           True           0.5             1            (1, 3)   
3           True           0.5             2            (1, 1)   
4           True           0.5             2            (1, 2)   
..           ...           ...           ...               ...   
250          NaN           1.0             5            (1, 2)   
251          NaN           1.0             5            (1, 3)   
252          NaN           1.0            10            (1, 1)   
253          N

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 224 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 864 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 1275 out of 1275 | elapsed:   17.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 1260 out of 1275 | elapsed:   21.8s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 1275 out of 1275 | elapsed:   22.0s finished


In [None]:
import random

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif, f_regression, mutual_info_classif, mutual_info_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


# Relative path of the pickle file that load_sampled() reads below.
amazon_link = '../Data/amazon_phone.pkl'

# TODO: import the loader from the shared module instead of redefining it
# in this cell, e.g.:
# from Scripts import loading as dl
# df = dl.load_sampled(amazon_link, 5000)
def load_sampled(link, per_class):
    """Load a pickled review DataFrame and return a class-balanced sample.

    For each rating label 1.0 .. 5.0 found in the 'label' column, exactly
    ``per_class`` rows are drawn with a fixed seed (123). Rows are drawn
    without replacement when the class is large enough, and with
    replacement (oversampling) otherwise.

    Parameters
    ----------
    link : str or path-like
        Location of the pickle file, passed to ``pd.read_pickle``.
    per_class : int
        Number of rows to draw for every label.

    Returns
    -------
    pandas.DataFrame
        Columns ``['text', 'label']``, rows ordered by label 1 .. 5, with
        a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If a label has no rows at all while ``per_class`` is positive.

    Notes
    -----
    Columns are picked by position (0 -> 'text', 1 -> 'label'); this
    assumes the pickled frame stores the text in its first column and the
    label in its second -- TODO confirm against the data file.
    """
    # NOTE(security): unpickling can execute arbitrary code; only point
    # this at trusted files.
    df = pd.read_pickle(link)

    sampled_rows = []
    for label in (1.0, 2.0, 3.0, 4.0, 5.0):
        rows = df[df['label'] == label].values.tolist()
        if not rows and per_class > 0:
            raise ValueError(
                f'no rows with label {label} in {link}; cannot sample {per_class}')

        # A private generator yields the same draws as random.seed(123)
        # followed by the module-level functions, but leaves the global
        # random state untouched.
        rng = random.Random(123)
        try:
            picked = rng.sample(rows, per_class)
        except ValueError:
            # Fewer rows than requested: oversample with replacement,
            # restarting from the same seed.
            rng = random.Random(123)
            picked = rng.choices(rows, k=per_class)
        sampled_rows.extend(picked)

    df_all = pd.DataFrame(sampled_rows)
    # Row lists lose the column names, so the columns are addressed by position.
    df_all = df_all[[0, 1]]
    df_all.columns = ['text', 'label']
    print(f'{per_class} reviews per class from {link} loaded')
    return df_all


df = load_sampled(amazon_link, 5000)
target = df.label
# This cell's load_sampled names the text column 'text' (there is no
# 'text_prep' column in the returned frame).
text = df.text

# Seeded so the train/test partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(text, target, test_size=0.3, random_state=123)

count = CountVectorizer()
kbest = SelectKBest(f_classif)
# Three sub-grids over the number of selected features: the pipeline's own
# 'kbest' step (f_classif), then the same step replaced by f_regression and
# by mutual_info_classif scoring.
param_grid = [{
    'kbest__k': [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000]
}, {
    'kbest': [SelectKBest(f_regression)],
    'kbest__k': [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000]
}, {
    'kbest': [SelectKBest(mutual_info_classif)],
    'kbest__k': [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000]
}]

# The step names must match the prefixes used in param_grid, so the
# selector is registered as 'kbest'.
# TODO(review): the classifier was left unspecified in the original cell;
# MultinomialNB is used here as a fast baseline that accepts the
# non-negative count features -- replace it with the intended model.
ml_features = Pipeline([('vect', count),
                        ('kbest', kbest),
                        ('clf', MultinomialNB())])

gs_ml_features = GridSearchCV(ml_features, param_grid, scoring='f1_macro',
                              cv=5, verbose=1, n_jobs=-1)
gs_ml_features.fit(X_train, y_train)
print('all results:')
print(gs_ml_features.cv_results_)
print('best parameters')
print(gs_ml_features.best_params_)
print('best score')
print(gs_ml_features.best_score_)

