In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_learning_curves
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.feature_selection import SelectKBest, chi2
# from Learning_Curve import plot_learning_curve

x_train_original = pd.read_csv("../data/COMP30027_2021_Project2_datasets/recipe_train.csv", index_col = False, delimiter = ',', header=0)
oversample = RandomOverSampler(sampling_strategy='minority')

train_label = x_train_original.loc[:,'duration_label']
X_train, X_test, y_train, y_test = train_test_split(x_train_original,train_label, test_size=0.3, stratify=train_label, random_state=42)



# First random oversampling, bring count of class 3 to be equal to the highest class count.
X_oversampled, y_oversampled = oversample.fit_resample(X_train, y_train)


# Second random oversampling, bring the count of class 2 to be equal to the other 2 classes
X_oversampled, y_oversampled = oversample.fit_resample(X_oversampled, y_oversampled)

# .value_counts()
# use recipe name as an example
train_corpus = X_oversampled.loc[:,['name','steps','ingredients']]
test_corpus = X_test.loc[:,['name','steps','ingredients']]
train_corpus

Unnamed: 0,name,steps,ingredients
0,ricotta and spinach stuffed pasta shells,['cook pasta in a large saucepan of boiling wa...,"['pasta shells', 'spinach', 'low-fat ricotta c..."
1,onion crusted chicken,"['in a bowl , mix the eggs and the 2 tablespoo...","['boneless chicken breasts', 'onion soup mix',..."
2,gourmet chicken burger australia,"['combine mince , bacon , onion , vegemite and...","['chicken', 'bacon', 'onion', 'vegemite', 'bre..."
3,peppery parsnip fries,['cut parsnips lengthwise into 2 1 / 2 x 1 / 2...,"['parsnips', 'olive oil', 'parmesan cheese', '..."
4,shells with coconut thai chicken,"['1', 'in large pot , cook pasta in boiling sa...","['pasta', 'salt', 'green beans', 'olive oil', ..."
...,...,...,...
42511,vegetable hoagie,"['spread guacamole on cut-sides of buns', 'to ...","['guacamole', 'hoagie rolls', 'monterey jack c..."
42512,spicy chicken enchiladas,"['in a bowl , combine the chicken , 1 cup chee...","['southwestern-seasoned chicken breast', 'ched..."
42513,rhubarb schnapps,['chop the rhubarb and divide it between two 1...,"['rhubarb', 'sugar', 'vodka']"
42514,krab cakes,['mix well all ingredients except flour and oi...,"['imitation crabmeat', 'scallions', 'dried dil..."


In [2]:


# Joining columns for train
train_corpus['steps'] = train_corpus['steps'].apply(eval)
train_corpus['ingredients'] = train_corpus['ingredients'].apply(eval)
train_corpus['steps'] = train_corpus['steps'].apply(' '.join)
train_corpus['ingredients'] = train_corpus['ingredients'].apply(' '.join)


# Joining columns for test
test_corpus['steps'] = test_corpus['steps'].apply(eval)
test_corpus['ingredients'] = test_corpus['ingredients'].apply(eval)
test_corpus['steps'] = test_corpus['steps'].apply(' '.join)
test_corpus['ingredients'] = test_corpus['ingredients'].apply(' '.join)

# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(corpus)


In [None]:
test_corpus

### Putting the features of all instances into a Bag of Words
Used to make CountVec and TF-IDF Sparse matrix

In [3]:

all_words = []

for i in range (0, len(train_corpus['steps'])):
    s = ''
    s += train_corpus.loc[i,'name'] + train_corpus.loc[i, 'steps'] + train_corpus.loc[i, 'ingredients']
    all_words.append(s)

all_words

['ricotta and spinach stuffed pasta shellscook pasta in a large saucepan of boiling water , uncovered , 3 minutes drain cool slightly preheat oven to moderate boil , steam or microwave spinach until just wilted drain chop spinach finely , squeeze out excess liquid combine spinach in a large bowl with ricotta and cottage cheese spoon spinach mixture into pasta shells combine sauce and stock in oiled shallow 2 litre ovenproof dish place pasta shells in dish sprinkle with parmesan bake , covered , in moderate oven about 1 hour or until pasta is tenderpasta shells spinach low-fat ricotta cheese low fat cottage cheese pasta sauce vegetable stock parmesan cheese',
 'onion crusted chickenin a bowl , mix the eggs and the 2 tablespoons of water and stir well in a seperate bowl combine the onion soup mix and the breadcrumbs heat the pan and spray a good thick coating of pam spray to the pan , or you can use the oil if you prefer cut the chicke breasts in half dip the chicken into the egg mixture

In [4]:


test_corpus = test_corpus.reset_index()
all_words_test = []

for j in range (0, len(test_corpus['steps'])):
#     print(j)
    s = ''
    s += test_corpus.loc[j,'name'] + test_corpus.loc[j, 'steps'] + test_corpus.loc[j, 'ingredients']
    all_words_test.append(s)
    
all_words_test

['super easy donuts   basic recipebasic donuts: line jellyroll pan with foil coat with cooking spray on lightly floured surface , roll dough out to 14" x 6" rectangle using 2 3 / 4" round biscuit cutter , cut out 12 donuts arrange on foil , 2" apart cover lightly with damp paper towels let rise in warmplace untildoubled in size , about 2 hours in a large pot , heat 2" of oil over medium heat until 300f on deep-frying thermometer add donuts in batches and cook , turning once , until golden and cooked through , about 5-7 minutes drain on rack strawberry-frosted donuts: prepare basic donuts recipe , using 3"round donut cutter in place of biscuit cutter cook as directed if desired , for donut holes , cook center cut outs for 5 minutes place strawberry frosting in microwave safe bowl and cook on high for about 30 seconds to soften spoon some frosting over each donut decorate with sprinkles and let stand for 15 minutes to set simple jelly donuts: prepare and cook basic donuts as directed onc

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
# stop_word ='english' means deleting some common words like the, a,.... in list 'english'



vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2),max_features = 5000)
X = vectorizer.fit(all_words)
vocab_dict = vectorizer.vocabulary_
vocab = np.array(vectorizer.get_feature_names())

X_final = vectorizer.transform(all_words)
X_t_final = vectorizer.transform(all_words_test)
# X_t_final = select.transform(X_t_final)
X_t_final.shape, X_final.shape

((12000, 5000), (42516, 5000))

In [None]:
X_final.shape

In [None]:
# vectorizer_t = CountVectorizer(stop_words='english',ngram_range=(1,2), max_features = 10000)
# X_t = vectorizer_t.fit(all_words_test)
# vocab_dict = vectorizer_t.vocabulary_
# # vocab = np.array(vectorizer.get_feature_names())

# X_t_final = vectorizer_t.transform(all_words_test)
# # X_t_final = select.transform(X_t_final)
# X_t_final.shape

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf = TfidfVectorizer(stop_words='english', max_features = 450)
X_tfidf = tf_idf.fit(all_words)
# vocab = np.array(vectorizer.get_feature_names())

X_tfidf = tf_idf.transform(all_words)
# y_oversampled.shape
tf_idf.idf_

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_t = TfidfVectorizer(stop_words='english', max_features = 1000)
X_tfidf_test = tf_idf_t.fit(all_words_test)

X_tfidf_ftest = tf_idf_t.transform(all_words_test)
X_tfidf_ftest.shape

### Linear SVM

In [14]:
from sklearn import svm


LinearSVC_clf = svm.LinearSVC(C=0.005,dual=False, multi_class='ovr',random_state=0)
SVC_clf = make_pipeline(MaxAbsScaler(), LinearSVC_clf)
SVC_clf.fit(X_final, y_oversampled)

y_test_predict = SVC_clf.predict(X_t_final)
y_train_predict = SVC_clf.predict(X_final)
print(accuracy_score(y_test, y_test_predict))
print(accuracy_score(y_oversampled, y_train_predict))
print(classification_report(y_test, y_test_predict))
cm = confusion_matrix(y_test, y_test_predict)
cm

0.7816666666666666
0.8550428074136796
              precision    recall  f1-score   support

         1.0       0.76      0.84      0.79      5311
         2.0       0.86      0.73      0.79      6074
         3.0       0.52      0.81      0.63       615

    accuracy                           0.78     12000
   macro avg       0.71      0.79      0.74     12000
weighted avg       0.80      0.78      0.78     12000



array([[4440,  675,  196],
       [1363, 4444,  267],
       [  62,   57,  496]], dtype=int64)

### Attempting to plot learning curve of Accuracy vs Regularization value

Problem of image size being too large is encountered. Do not run the box below

In [None]:
#Testing C values.
Cs = [0.000000001,0.00000001,0.0000001,0.000001,0.00001,0.0001,0.001, 0.01, 0.1, 1]

SVM_test = []
SVM_train = []
for c in Cs:
    LinearSVC_clf = svm.LinearSVC(C=c,dual=False, multi_class='ovr', random_state=0)
    SVC_clf = make_pipeline(StandardScaler(with_mean=False), MaxAbsScaler(), LinearSVC_clf)
    SVC_clf.fit(X_final, y_oversampled)
    y_test_predict = SVC_clf.predict(X_t_final)
    y_train_predict = SVC_clf.predict(X_final)
    
    SVM_test.append(y_test_predict)
    SVM_train.append(y_train_predict)
    



new_C = ['1.0e-9','1.0e-8', '1.0e-7', '1.0e-6', '1.0e-5', '1.0e-4', '1.0e-3', '1.0e-2', '1.0e-1', '1']
plt.plot(Cs, SVM_train, label = "train")
plt.plot(Cs, SVM_test, label = "test")
plt.xlabel('C value')
plt.ylabel('accuracy')
plt.legend()
plt.show()

### Sklearn example to plot Accuracy vs Regularization value

Does not tell much, as the used for cros validation was already oversampled and hence accuracy is very high

In [None]:
from sklearn.model_selection import validation_curve
param_range = np.logspace(-9, 2, 10)
param_range
LinearSVC = svm.LinearSVC(dual=False, multi_class='ovr', random_state=0)
train_scores, test_scores = validation_curve(
    LinearSVC, X_final, y_oversampled, param_name="C", param_range=param_range,
    scoring="accuracy", n_jobs=-1)


In [None]:
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

In [None]:
plt.title("Validation Curve with SVM")
plt.xlabel("C")
plt.ylabel("Accuracy Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.show()

In [None]:
SVC_clf = make_pipeline(MaxAbsScaler(), LinearSVC_clf)
plot_learning_curves(X_final, y_oversampled, X_t_final, y_test, SVC_clf, scoring= 'accuracy')

In [None]:
# Cross Validation To evaluate model
# Shuffle Split is a random split method, not guranteed unique split but likely on large data.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=5, test_size=0.4, random_state=42)
scores = cross_val_score(LinearSVC_clf, X_final, y_oversampled, cv=cv)
scores

### Not using Gaussian Naive Bayes Classifier
GNB was not implemented itself as we deemed it unfit for this classification task, 2 main reasons.
1) Features cannot be assumed to be indepedent from one another, for example ingredients can appear in the name, and or steps   
2) Blah blah blah

### Learning curve for SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

clf_SGD = SGDClassifier(alpha=10, max_iter=1000, tol=1e-3, n_jobs=6, random_state=0)
plot_learning_curves(X_final, y_oversampled, X_t_final, y_test, clf_SGD, scoring= 'accuracy')
plt.show()

In [None]:
SGD = make_pipeline(StandardScaler(with_mean=False), MaxAbsScaler(),clf_SGD)
SGD.fit(X_final, y_oversampled)

y_test_predict_SGD = SGD.predict(X_t_final)
y_train_predict_SGD = SGD.predict(X_final)
print("test: ", accuracy_score(y_test, y_test_predict_SGD))
print("train: ", accuracy_score(y_oversampled, y_train_predict_SGD))
print(classification_report(y_test, y_test_predict_SGD))
cm = confusion_matrix(y_test, y_test_predict_SGD)
cm
X_train.shape

### Stacking Implementation

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression


estimators = [
     ('CountVecSVC', SVC_clf),
     ('SGD', SGD)]

stacking_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(n_jobs=6, random_state=0), n_jobs=6)

stacking_clf.fit(X_final, y_oversampled)


y_test_predict_stack = stacking_clf.predict(X_t_final)
y_train_predict_stack = stacking_clf.predict(X_final)
print(accuracy_score(y_test, y_test_predict_stack))
print(accuracy_score(y_oversampled, y_train_predict_stack))
print(classification_report(y_test, y_test_predict_stack))
cm = confusion_matrix(y_test, y_test_predict_stack)
cm


### Running model on REAL Test data

In [None]:
import scipy
final_test = scipy.sparse.load_npz('CountVec.npz')

# stacking_clf.fit(X_train, y_train)
LinearSVC_real_test_pred = SVC_clf.predict(final_test)
LinearSVC_real_test_pred
# final_test.shape



In [None]:
a = [x for x in range(1,10001)]

result = {
    'id': a,
    'duration_label': real_test_pred
}
result = pd.DataFrame(result)
result.to_csv('result.csv', index = False)