In [1]:
import numpy as np
import pandas as pd

In [2]:
trainset=pd.read_csv('train.csv')
testset=pd.read_csv('test.csv')

In [3]:
columns_to_drop = ['CommentID','UserID','RecipeName','UserName','CreationTimestamp', 'UserReputation', 'ID']
trainset = trainset.drop(columns=columns_to_drop)
testset = testset.drop(columns=columns_to_drop)

In [4]:
trainset['Recipe_Review']=trainset['Recipe_Review'].fillna('')
testset['Recipe_Review']=testset['Recipe_Review'].fillna('')

In [5]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Define your numerical and categorical features
Numc = ['RecipeNumber', 'RecipeCode', 'ReplyCount', 'ThumbsUpCount', 'ThumbsDownCount', 'BestScore']
Catc = ['Recipe_Review']

# Initialize StandardScaler for numerical features
scaler = StandardScaler()

# Initialize TfidfVectorizer for categorical feature
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# Define ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('num', scaler, Numc),
    ('text', tfidf, 'Recipe_Review')
])

preprocessor

In [6]:
from sklearn.model_selection import train_test_split
#separate features from label
X = trainset.drop(columns=['Rating'])
y = trainset['Rating']
#split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
#dummy classifier

from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

# Create and fit a DummyClassifier using the 'most_frequent' strategy
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)

# Make predictions
y_pred = dummy_clf.predict(X_test)

# Calculate accuracy
accuracy_score(y_test, y_pred)


0.7554985337243402

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Define the pipeline with the preprocessor and RandomForestClassifier
rfc = make_pipeline(preprocessor, RandomForestClassifier(random_state=42))

# Define the hyperparameters to tune
param_grid = {
    'randomforestclassifier__n_estimators': [100, 500, 1000],
    'randomforestclassifier__max_depth': [None, 10, 20, 30],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4]
}

# Perform grid search cross-validation
grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy
print(accuracy_score(y_test, y_pred))


KeyboardInterrupt: 

In [12]:
import xgboost
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV

xgb = make_pipeline(preprocessor, XGBClassifier(random_state=42))

param_grid = {
    'xgbclassifier__n_estimators': [500, 1000],
    'xgbclassifier__max_depth': [ 6, 9],
    'xgbclassifier__learning_rate': [0.01, 0.1],
    'xgbclassifier__gamma': [0, 0.1],
    'xgbclassifier__min_child_weight': [1, 3]
}

grid_search = GridSearchCV(xgb, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy
print(accuracy_score(y_test, y_pred))


Best Parameters: {'xgbclassifier__gamma': 0.1, 'xgbclassifier__learning_rate': 0.1, 'xgbclassifier__max_depth': 9, 'xgbclassifier__min_child_weight': 1, 'xgbclassifier__n_estimators': 500}
0.7734604105571847


In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Create a pipeline with preprocessing and KNeighborsClassifier
knn = make_pipeline(preprocessor, KNeighborsClassifier())

# Fit the pipeline to the training data
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test)

# Calculate accuracy
print(accuracy_score(y_test, y_pred))

report = classification_report(y_test, y_pred)
print(report)

0.7302052785923754
              precision    recall  f1-score   support

           0       0.19      0.11      0.14       257
           1       0.33      0.15      0.20        41
           2       0.00      0.00      0.00        34
           3       0.14      0.02      0.04        85
           4       0.27      0.06      0.10       250
           5       0.78      0.94      0.85      2061

    accuracy                           0.73      2728
   macro avg       0.29      0.21      0.22      2728
weighted avg       0.64      0.73      0.67      2728



In [10]:
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Create a pipeline with preprocessing and KNeighborsClassifier
knn = make_pipeline(preprocessor, KNeighborsClassifier())

# Fit the pipeline to the training data
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test)

# Calculate accuracy
print(accuracy_score(y_test, y_pred))

conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

0.7302052785923754
[[  29    5    1    6    8  208]
 [   4    6    2    0    2   27]
 [   3    2    0    0    2   27]
 [  12    1    1    2    5   64]
 [  16    1    0    1   15  217]
 [  88    3    1    5   24 1940]]


In [None]:
from lightgbm import LGBMClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Assuming X_train, X_test, y_train, y_test, preprocessor are defined previously

# Create a pipeline with preprocessing and LGBMClassifier
lgbm = make_pipeline(preprocessor, LGBMClassifier(random_state=42))

# Define the hyperparameters to tune
param_grid = {
    'lgbmclassifier__n_estimators': [50, 100, 200],
    'lgbmclassifier__learning_rate': [0.01, 0.1, 0.3],
    'lgbmclassifier__max_depth': [3, 6, 9],
    'lgbmclassifier__num_leaves': [20, 30, 40],
}

# Perform grid search cross-validation
grid_search = GridSearchCV(lgbm, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.110277 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 36708
[LightGBM] [Info] Number of data points in the train set: 8726, number of used features: 1034
[LightGBM] [Info] Start training from score -2.374562
[LightGBM] [Info] Start training from score -4.168788
[LightGBM] [Info] Start training from score -4.355563
[LightGBM] [Info] Start training from score -3.653527
[LightGBM] [Info] Start training from score -2.398239
[LightGBM] [Info] Start training from score -0.271991
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063839 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37088
[LightGBM] [Info] Number of data points in the train set: 8

In [8]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# Create a pipeline with preprocessing and SVC
svc = make_pipeline(preprocessor, SVC(random_state=42))

# Define the hyperparameters to tune
param_grid = {
    'svc__C': [0.1, 1, 10],              # Regularization parameter
    'svc__kernel': ['linear', 'rbf'],    # Kernel type
    'svc__gamma': ['scale', 'auto']      # Kernel coefficient
}

# Perform grid search cross-validation
grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Best Parameters: {'svc__C': 1, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
Accuracy: 0.7686950146627566


In [10]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Create a pipeline with preprocessing and MLPClassifier
mlp = make_pipeline(preprocessor, MLPClassifier(max_iter=1000, random_state=42))

# Define the hyperparameters to tune
param_dist = {
    'mlpclassifier__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'mlpclassifier__activation': ['relu', 'tanh'],
    'mlpclassifier__solver': ['adam', 'sgd'],
    'mlpclassifier__alpha': [0.0001, 0.001, 0.01],
}

# Perform randomized search cross-validation
random_search = RandomizedSearchCV(mlp, param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Parameters:", random_search.best_params_)

# Get the best model
best_model = random_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)




Best Parameters: {'mlpclassifier__solver': 'sgd', 'mlpclassifier__hidden_layer_sizes': (50,), 'mlpclassifier__alpha': 0.0001, 'mlpclassifier__activation': 'tanh'}
Accuracy: 0.7694281524926686


