In [67]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_learning_curves
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.dummy import DummyClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import AdaBoostClassifier
from imblearn.under_sampling import RandomUnderSampler

### TRY UNDER FITTING

In [77]:
# Reading in sparse matrix data and transforming.

def Countvec_digestSparse(file):
    x_train_original = pd.read_csv(file, index_col = False, delimiter = ',', header=0)
    oversample = RandomUnderSampler(sampling_strategy='majority')


    # First random oversampling, bring count of class 3 to be equal to the highest class count.
    X_oversampled, y_oversampled = oversample.fit_resample(x_train_original, x_train_original.loc[:,'duration_label'])


    # Second random oversampling, bring the count of class 2 to be equal to the other 2 classes
    X_oversampled, y_oversampled = oversample.fit_resample(X_oversampled, y_oversampled)

    # .value_counts()
    # use recipe name as an example
    train_corpus = X_oversampled.loc[:,['name','steps','ingredients']]
    train_corpus['steps'] = train_corpus['steps'].apply(eval)
    train_corpus['ingredients'] = train_corpus['ingredients'].apply(eval)
    train_corpus['steps'] = train_corpus['steps'].apply(' '.join)
    train_corpus['ingredients'] = train_corpus['ingredients'].apply(' '.join)
    
    
    all_words = []

    for i in range (0, len(train_corpus['steps'])):
        s = ''
        s += train_corpus.loc[i,'name'] + train_corpus.loc[i, 'steps'] + train_corpus.loc[i, 'ingredients']
        all_words.append(s)
        
    vectorizer = CountVectorizer(stop_words='english', max_features = 2000)
    X = vectorizer.fit(all_words)
    vocab_dict = vectorizer.vocabulary_
    # vocab = np.array(vectorizer.get_feature_names())

    X_final = vectorizer.transform(all_words)
    
    return X_final, y_oversampled

    

X_final, y_oversampled = Countvec_digestSparse('recipe_train.csv')

In [72]:
def digestDoc2Vec():
   # Doc2Vect is a technique to transfer words into numerical representation. 
    # https://www.shibumi-ai.com/post/a-gentle-introduction-to-doc2vec
    d2v_ingr = pd.read_csv("recipe_text_features_doc2vec100/train_ingr_doc2vec100.csv", header=None)
    d2v_name = pd.read_csv("recipe_text_features_doc2vec100/train_name_doc2vec100.csv", header=None)
    d2v_steps = pd.read_csv("recipe_text_features_doc2vec100/train_steps_doc2vec100.csv", header=None)




    # Extract class_labels from training set 
    # quick = 1
    # medium = 2
    # slow = 3
    data_train = pd.read_csv('recipe_train.csv')

    # Obtain the labels
    train_label = data_train.iloc[:,-1]




    # Feature selection with f_classif (ANOVA F-value)
    # ANOVA f-value shows how well a feature discriminate between classes
    # The more discrimination, the better that feature is in predicting the class label.
    # d2v_name_new = SelectKBest(k=90).fit_transform(d2v_name, train_label)
    # d2v_ingr_new = SelectKBest(k=90).fit_transform(d2v_ingr, train_label)
    # d2v_steps_new = SelectKBest(k=90).fit_transform(d2v_steps, train_label)
    d2v_name_new = pd.DataFrame(d2v_name)
    d2v_ingr_new = pd.DataFrame(d2v_ingr)
    d2v_steps_new = pd.DataFrame(d2v_steps)
    # print(d2v_ingr_new)
    # print(d2v_steps_new)
    # print(d2v_name_new.shape)


    # Create a new dataframe of data, but this time, name, steps and ingr has been engineered to have doc2vec features.
    # 100 doc2vec features were given, but we selected 20 best features using ANOVA f-value.
    f_data = d2v_name_new.join(data_train.iloc[:,1:3], on=None, how='left', lsuffix='_left', rsuffix='_right')
    f_data = d2v_name_new.join(d2v_steps_new, on=None, how='left', lsuffix='_left', rsuffix='_right')
    f_data = f_data.join(d2v_ingr_new, on=None, how='left', lsuffix='_left', rsuffix='_right')


    # Standardise the data so that the mean is 0
    scaler = StandardScaler()
    f_data = scaler.fit_transform(f_data)

    # normalise all values to be between 0 and 1
    minmax_scaler = MinMaxScaler()
    f_data = minmax_scaler.fit_transform(f_data)





    # Oversampling, because the distribution of classes in training data is highly skewed towards quick and medium.
    oversample = RandomOverSampler(sampling_strategy='minority')

    # First random oversampling, bring count of class 3 to be equal to the highest class count.
    X_oversampled, y_oversampled = oversample.fit_resample(f_data, train_label)

    # Second random oversampling, bring the count of class 2 to be equal to the other 2 classes
    X_oversampled, y_oversampled = oversample.fit_resample(X_oversampled, y_oversampled)






    # Now we should have 20,246 instances for each class.
    # Splitting the provided training into its own train/test
#     X_train, X_test, y_train, y_test = train_test_split(X_oversampled, y_oversampled, test_size=0.2, stratify=y_oversampled, random_state=42)




    # X = pd.DataFrame(X_oversampled)
    # y = pd.DataFrame(y_oversampled)

    # full = X.join(y, on=None, how='left', lsuffix='_left', rsuffix='_right')
    return X_oversampled, y_oversampled

In [73]:
X_oversampled, y_oversampled = digestDoc2Vec()

### Bagging

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_oversampled, y_oversampled, test_size=0.2, stratify=y_oversampled, random_state=42)



KNN = KNeighborsClassifier(n_neighbors=10)
bagging = BaggingClassifier(base_estimator=KNN,n_estimators=10,
                            max_samples=0.5, max_features=0.5, n_jobs=-1,random_state=0)

KNN.fit(X_train,y_train)
bagging.fit(X_train,y_train)
print("KNN:",KNN.score(X_test,y_test))
print("KNN Bagging Accuracy:",bagging.score(X_test,y_test))

KNN: 0.6214191636483372
KNN Bagging Accuracy: 0.5859400724399078


In [78]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y_oversampled, test_size=0.2, stratify=y_oversampled, random_state=42)


DT = DecisionTreeClassifier(max_depth=6, criterion='entropy', max_features='log2',random_state=0)
bagging2 = BaggingClassifier(base_estimator=DT,n_estimators=100,
                            max_samples=0.5, max_features=0.5, n_jobs=-1,random_state=0)


DT.fit(X_train,y_train)
bagging2.fit(X_train,y_train)
print("DT test:",DT.score(X_test,y_test))
print("DT train:",DT.score(X_train,y_train))
print("DT Bagging test Accuracy:",bagging2.score(X_test,y_test))
print("DT Bagging train Accuracy:",bagging2.score(X_train,y_train))

DT test: 0.3829268292682927
DT train: 0.39170225747406956
DT Bagging test Accuracy: 0.7056910569105691
DT Bagging train Accuracy: 0.7470002033760423


In [79]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y_oversampled, test_size=0.2, stratify=y_oversampled, random_state=42)


LinearSVC_clf = svm.LinearSVC(dual=False, multi_class='ovr', random_state=0)
SVC_clf = make_pipeline(StandardScaler(with_mean=False), MaxAbsScaler(), LinearSVC_clf)
forest = RandomForestClassifier(n_estimators=500, max_features='log2', max_depth=2, 
                                criterion='entropy', n_jobs = -1, random_state = 0)
estimators = [
     ('bagging', bagging2),
     ('CountVecSVC', SVC_clf),
      ('Zero_R', DummyClassifier(strategy='most_frequent')),
      ('Boosting',AdaBoostClassifier(n_estimators=100, random_state=0))]


clf = StackingClassifier(
     estimators=estimators, final_estimator=SVC_clf, n_jobs=6
)


clf.fit(X_train, y_train)
y_test_predict_stack = clf.predict(X_test)
y_train_predict_stack = clf.predict(X_train)
print(accuracy_score(y_test, y_test_predict_stack))
print(accuracy_score(y_train, y_train_predict_stack))
print(classification_report(y_test, y_test_predict_stack))
cm = confusion_matrix(y_test, y_test_predict_stack)
cm

0.7772357723577236
0.8759406141956477
              precision    recall  f1-score   support

         1.0       0.72      0.77      0.75       410
         2.0       0.74      0.71      0.73       410
         3.0       0.87      0.85      0.86       410

    accuracy                           0.78      1230
   macro avg       0.78      0.78      0.78      1230
weighted avg       0.78      0.78      0.78      1230



array([[316,  74,  20],
       [ 87, 292,  31],
       [ 34,  28, 348]], dtype=int64)

In [63]:
import scipy
final_test = scipy.sparse.load_npz('CountVec.npz')

# stacking_clf.fit(X_train, y_train)
real_test_pred = clf.predict(final_test)
# final_test.shape

a = [x for x in range(1,10001)]

result = {
    'id': a,
    'duration_label': real_test_pred
}
result = pd.DataFrame(result)
result.to_csv('result.csv', index = False)
real_test_pred.shape

(10000,)