In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train_raw = pd.read_csv('recipe_train.csv')
train_raw.head()

Unnamed: 0,name,n_steps,n_ingredients,steps,ingredients,duration_label
0,wi gal s chicken rice stove top dish made heal...,6,12,"['in saucepan , melt margarine and combine wit...","['margarine', 'olive oil', 'celery', 'onion', ...",2.0
1,irish pin oats,9,5,['melt 1 tbs butter in a small saucepan over m...,"['butter', 'pinhead oats', 'water', 'half-and-...",2.0
2,cheesy beef n biscuit casserole,15,10,"['brown ground beef , onion and green pepper',...","['ground beef', 'onion', 'green pepper', 'toma...",2.0
3,lemonade chicken oamc,10,8,"['brown chicken in oil', 'combine all ingredie...","['boneless skinless chicken', 'frozen lemonade...",2.0
4,graham and peanut butter bon bons,6,5,"['set almond bark aside', 'mix remaining ingre...","['graham cracker crumbs', 'crunchy peanut butt...",2.0


In [3]:
train_raw['steps'] = train_raw['steps'].apply(eval)
train_raw['ingredients'] = train_raw['ingredients'].apply(eval)
train_raw['steps'] = train_raw['steps'].apply(' '.join)
train_raw['ingredients'] = train_raw['ingredients'].apply(' '.join)

In [4]:
from sklearn.model_selection import train_test_split
x_train, x_dev, y_train, y_dev = train_test_split(train_raw.drop(columns='duration_label'), train_raw['duration_label'], test_size=0.25, random_state=3)

## Multinomial Naive Bayes---Count Vectoriser (name+steps+ingredients)

In [5]:
train_text = x_train['name'] + " " + x_train['steps'] + " " + x_train['ingredients'] 
dev_text = x_dev['name'] + " " + x_dev['steps'] + " " + x_dev['ingredients'] 

In [6]:
from sklearn.feature_extraction.text import CountVectorizer 
cv = CountVectorizer(stop_words='english').fit(train_text) 

In [7]:
x_train_steps_cv = cv.transform(train_text)
x_dev_steps_cv = cv.transform(dev_text)

In [8]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(x_train_steps_cv, y_train)
mnb.score(x_train_steps_cv, y_train), mnb.score(x_dev_steps_cv, y_dev)

(0.7538, 0.7213)

In [9]:
# use cross validation to check overfitting
from sklearn.model_selection import cross_val_score
cross_val_score(mnb, x_train_steps_cv, y_train)

array([0.71533333, 0.72433333, 0.72683333, 0.7285    , 0.723     ])

By using the text of steps, ingredients and name combine together, we get a accuracy of 0.7213 for the development set. 
Try only use the text of steps(it have the most words hence we might get the most information form this feature), there is a slight imporvement. 
Then try add in the name feature, the accuracy have a further improvement compare to when we only use the steps.

## Multinomial Naive Bayes---Count Vectoriser (steps)

In [10]:
train_text1 = x_train['steps'] 
dev_text1 = x_dev['steps'] 
cv1 = CountVectorizer(stop_words='english').fit(train_text1)
x_train_steps_cv1 = cv.transform(train_text1)
x_dev_steps_cv1 = cv.transform(dev_text1)
mnb1 = MultinomialNB()
mnb1.fit(x_train_steps_cv1, y_train)
mnb1.score(x_train_steps_cv1, y_train), mnb.score(x_dev_steps_cv1, y_dev)

(0.7598666666666667, 0.7234)

## Multinomial Naive Bayes---Count Vectoriser (name+steps)

In [44]:
train_text2 = x_train['name'] + " " + x_train['steps'] 
dev_text2 = x_dev['name'] + " " + x_dev['steps'] 
cv2 = CountVectorizer(stop_words='english').fit(train_text2)
x_train_steps_cv2 = cv.transform(train_text2)
x_dev_steps_cv2 = cv.transform(dev_text2)
mnb2 = MultinomialNB()
mnb2.fit(x_train_steps_cv2, y_train)
mnb2.score(x_train_steps_cv2, y_train), mnb.score(x_dev_steps_cv2, y_dev)

(0.7692666666666667, 0.7247)

## Test on test set

In [21]:
test_raw = pd.read_csv('recipe_test.csv')

In [24]:
test_raw['steps'] = test_raw['steps'].apply(eval)
test_raw['ingredients'] = test_raw['ingredients'].apply(eval)
test_raw['steps'] = test_raw['steps'].apply(' '.join)
test_raw['ingredients'] = test_raw['ingredients'].apply(' '.join)
test_text = test_raw['name'] + " " + test_raw['steps'] + " " + test_raw['ingredients'] 
x_test_text_cv = cv.transform(test_text)

In [25]:
prediction = mnb.predict(x_test_text_cv)
prediction

array([2., 1., 2., ..., 1., 1., 2.])

In [41]:
results = pd.DataFrame({'id': list(range(1,10001)), 'duration_label': list(prediction)}, columns=['id', 'duration_label'])
results

Unnamed: 0,id,duration_label
0,1,2.0
1,2,1.0
2,3,2.0
3,4,1.0
4,5,2.0
...,...,...
9995,9996,2.0
9996,9997,2.0
9997,9998,1.0
9998,9999,1.0


In [43]:
results.to_csv("results.csv", index=False)

## Multinomial Naive Bayes---tfidf

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english').fit(x_train['steps'])
# train_text2 = name+steps
x_train_steps_tfidf = tfidf.transform(train_text2)
x_dev_steps_tfidf = tfidf.transform(dev_text2)

In [61]:
mnb3 = MultinomialNB()
mnb3.fit(x_train_steps_tfidf, y_train)
mnb3.score(x_train_steps_tfidf, y_train), mnb3.score(x_dev_steps_tfidf, y_dev)

(0.733, 0.6994)

Hence, we can see that by using tfidf to get the weight of each word does not produce a better performance for multinomial NB which is resonable since 

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)

In [108]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
mnb_pred = mnb2.predict(x_dev_steps_cv1)
print(confusion_matrix(y_dev, mnb_pred))

[[3004 1386   87]
 [ 978 3964   84]
 [  87   99  311]]


In [109]:
# Accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_dev, mnb_pred)

# Recall
from sklearn.metrics import recall_score
recall_score(y_dev, mnb_pred)

# Precision
from sklearn.metrics import precision_score
precision_score(y_dev, mnb_pred)

accuracy_score, recall_score, precision_score

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

## Linear SVC---CV

In [62]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [63]:
svc = LinearSVC().fit(x_train_steps_cv, y_train)
svc.score(x_train_steps_cv, y_train), svc.score(x_dev_steps_cv, y_dev)
svc = LinearSVC().fit(x_train_steps_tfidf, y_train)
svc.score(x_train_steps_tfidf, y_train), svc.score(x_dev_steps_tfidf, y_dev)



(0.8772, 0.7828)

## Linear SVC---tfidf

In [64]:
svc = LinearSVC().fit(x_train_steps_tfidf, y_train)
svc.score(x_train_steps_tfidf, y_train), svc.score(x_dev_steps_tfidf, y_dev)

(0.8772, 0.7828)

## Logistic Regression---CV

In [65]:
lr = LogisticRegression(max_iter=1000).fit(x_train_steps_cv, y_train)
lr.score(x_train_steps_cv, y_train), lr.score(x_dev_steps_cv, y_dev)

(0.924, 0.7725)

## Logistic Regression---tfidf

In [66]:
lr = LogisticRegression(max_iter=1000).fit(x_train_steps_tfidf, y_train)
lr.score(x_train_steps_tfidf, y_train), lr.score(x_dev_steps_tfidf, y_dev)

(0.8356333333333333, 0.7876)

## Feature Selection

In [72]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, mutual_info_classif, f_classif

In [69]:
# use mutual info on cv
kbest = SelectKBest(score_func=mutual_info_classif, k=4000).fit(x_train_steps_cv, y_train)

In [73]:
# use f_classif on tfidf
kbest = SelectKBest(score_func=f_classif, k=4000).fit(x_train_steps_tfidf, y_train)

In [74]:
# use chi2 on tfidf
kbest = SelectKBest(score_func=chi2, k=4000).fit(x_train_steps_tfidf, y_train)

## Grid Search

In [75]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [76]:
# SGDClassifier
sgd = SGDClassifier(loss='log', max_iter=1000).fit(x_train_steps_tfidf, y_train)
sgd.score(x_train_steps_tfidf, y_train), sgd.score(x_dev_steps_tfidf, y_dev)

(0.8016666666666666, 0.7821)

In [77]:
pipeline = Pipeline(
    [
        ('kbest', SelectKBest()), ('clf', SGDClassifier(random_state=2))
    ]
)
parameters = {
    'kbest__score_func': (f_classif, chi2),
    'kbest__k': (1000, 4000, 10000, 'all'),
    'clf__loss': ('hinge', 'log'),
    'clf__alpha': (1e-5, 1e-4, 1e-3, 1e-2),
}

In [78]:
from time import time
def grid_search(pipeline, parameters, X, y):
    gs = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)
    
    print('Performing grid search...')
    print('pipeline:', [name for name, _ in pipeline.steps])
    print('parameters:')
    print(parameters)
    t0 = time()
    gs.fit(X, y)
    print("done in %0.3fs" % (time()-t0))
    print()
    
    mean_score = gs.cv_results_['mean_test_score']
    param_set = gs.cv_results_['params']
    for idx in mean_score.argsort()[-5:]:
        print(param_set[idx])
        print(gs.cv_results_['mean_test_score'][idx])
        print('='*30)
    
    return gs

In [79]:
result = grid_search(pipeline, parameters, x_train_steps_tfidf, y_train)

Performing grid search...
pipeline: ['kbest', 'clf']
parameters:
{'kbest__score_func': (<function f_classif at 0x000001CFF1AF2310>, <function chi2 at 0x000001CFF1AF2670>), 'kbest__k': (1000, 4000, 10000, 'all'), 'clf__loss': ('hinge', 'log'), 'clf__alpha': (1e-05, 0.0001, 0.001, 0.01)}
Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:   22.3s finished


done in 22.858s

{'clf__alpha': 0.0001, 'clf__loss': 'hinge', 'kbest__k': 10000, 'kbest__score_func': <function f_classif at 0x000001CFF1AF2310>}
0.7863
{'clf__alpha': 0.0001, 'clf__loss': 'hinge', 'kbest__k': 4000, 'kbest__score_func': <function chi2 at 0x000001CFF1AF2670>}
0.7871333333333334
{'clf__alpha': 0.0001, 'clf__loss': 'hinge', 'kbest__k': 10000, 'kbest__score_func': <function chi2 at 0x000001CFF1AF2670>}
0.7874
{'clf__alpha': 0.0001, 'clf__loss': 'hinge', 'kbest__k': 'all', 'kbest__score_func': <function f_classif at 0x000001CFF1AF2310>}
0.7881
{'clf__alpha': 0.0001, 'clf__loss': 'hinge', 'kbest__k': 'all', 'kbest__score_func': <function chi2 at 0x000001CFF1AF2670>}
0.7881


From the result of the SDG classifier, we can see that the models with the top performance all have a alpha of 0.0001 and hinge loss. The score function does not make a much difference.

## Ensembel Model

### 1.max voting

In [93]:
from sklearn.metrics import accuracy_score
from statistics import mode

pred1=svc.predict(x_dev_steps_tfidf)
pred2=lr.predict(x_dev_steps_tfidf)
pred3=sgd.predict(x_dev_steps_tfidf)

final_pred = np.array([])
for i in range(0,len(x_dev)):
    final_pred = np.append(final_pred, mode([pred1[i], pred2[i], pred3[i]]))
    
accuracy_train = accuracy_score(y_dev,final_pred)
accuracy_train

0.7882

### test on the test set use max voting

In [98]:
test_text = test_raw['name'] + " " + test_raw['steps']
x_test_steps_tfidf = tfidf.transform(test_text)

pred1_test=svc.predict(x_test_steps_tfidf)
pred2_test=lr.predict(x_test_steps_tfidf)
pred3_test=sgd.predict(x_test_steps_tfidf)

prediction2 = np.array([])
for i in range(0,len(test_text)):
    prediction2 = np.append(prediction2, mode([pred1_test[i], pred2_test[i], pred3_test[i]]))
    
results2 = pd.DataFrame({'id': list(range(1,10001)), 'duration_label': list(prediction2)}, columns=['id', 'duration_label'])
results2.to_csv("results2.csv", index=False)

### 2.Random Forest

In [100]:
from sklearn.ensemble import RandomForestClassifier
accuracy_train_list = []
for i in range(10,21):
    rf = RandomForestClassifier(max_depth=i)

    # fit the model with the training data
    rf.fit(x_train_steps_tfidf, y_train)

    # predict the target on the train dataset
    predict_train = rf.predict(x_dev_steps_tfidf)

    # Accuray Score on train dataset
    accuracy_train = accuracy_score(y_dev,predict_train)
    accuracy_train_list.append(accuracy_train)
accuracy_train_list

[0.73,
 0.734,
 0.7347,
 0.7338,
 0.7391,
 0.7424,
 0.7408,
 0.7444,
 0.7505,
 0.7485,
 0.7546]

From the result we can see that the accuracy increase as the max depth of the tree increase.

In [105]:
rf = RandomForestClassifier(n_estimators=1000, max_depth=40)

# fit the model with the training data
rf.fit(x_train_steps_tfidf, y_train)

# predict the target on the train dataset
predict_train = rf.predict(x_dev_steps_tfidf)

# Accuray Score on train dataset
accuracy_train = accuracy_score(y_dev,predict_train)
accuracy_train

0.7778

max_depth=40, n_estimators=100: 0.77
max_depth=40, n_estimators=200: 0.7776
max_depth=100, n_estimators=100: 0.778

varie the max depth or the number of trees can improve the accuracy but does not lead to a large change where it increases the complexity of the model.

## K-NN (not complete)

In [51]:
X_train1 = pd.DataFrame({'n_steps': x_train['n_steps'] , 'n_ingredients': x_train['n_ingredients']}, columns=['n_steps', 'n_ingredients'])

In [52]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train1, y_train)
classifier.score(X_train1, y_train)

0.6015

In [None]:
X_train1 = pd.DataFrame({'n_steps': x_train['n_steps'] , 'n_ingredients': x_train['n_ingredients']}, columns=['n_steps', 'n_ingredients'])
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train1, y_train)
classifier.score(X_train1, y_train)

## Decision Tree (not complete)

In [54]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)
decision_tree = decision_tree.fit(x_train_steps_cv, y_train)
r = export_text(decision_tree, feature_names=iris['feature_names'])
print(r)

NameError: name 'iris' is not defined

In [47]:
x_train_steps_cv

<30000x20724 sparse matrix of type '<class 'numpy.int64'>'
	with 1571310 stored elements in Compressed Sparse Row format>

In [49]:
x_train.columns.values

array(['name', 'n_steps', 'n_ingredients', 'steps', 'ingredients'],
      dtype=object)