In [40]:
from nltk.corpus import stopwords
import re
import pandas as pd
df = pd.read_csv('cuisine_data.csv')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score

In [41]:
df.head()

Unnamed: 0,cuisine_description,cuisine
0,romaine lettuce black olives grape tomatoes ga...,greek
1,plain flour ground pepper salt tomatoes ground...,southern_us
2,eggs pepper salt mayonaise cooking oil green c...,filipino
3,water vegetable oil wheat salt,indian
4,black pepper shallots cornflour cayenne pepper...,indian


In [42]:
len(df.cuisine.value_counts())

20

In [43]:
df.cuisine.unique()

array(['greek', 'southern_us', 'filipino', 'indian', 'jamaican',
       'spanish', 'italian', 'mexican', 'chinese', 'british', 'thai',
       'vietnamese', 'cajun_creole', 'brazilian', 'french', 'japanese',
       'irish', 'korean', 'moroccan', 'russian'], dtype=object)

# Checking for NULLs and removing them

In [44]:
df.isnull().sum()

cuisine_description    0
cuisine                0
dtype: int64

# Dropping duplicates

In [45]:
df.shape

(39774, 2)

In [46]:
df.drop_duplicates(inplace=True)

In [47]:
df.shape

(39677, 2)

# Preprocessing the text

In [48]:
print(df['cuisine_description'].apply(lambda x: len(x.split(' '))).sum())

806112


In [49]:
import nltk
nltk.download('stopwords')
special_character_remover = re.compile('[/(){}\[\]\|@,;]')
extra_symbol_remover = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [50]:
def clean_text(text):
    text = text.lower()
    text = special_character_remover.sub(' ', text)
    text = extra_symbol_remover.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

df['cuisine_description'] = df['cuisine_description'].apply(clean_text)

In [51]:
print(df['cuisine_description'].apply(lambda x: len(x.split(' '))).sum())

803337


# Train Test split

In [52]:
from sklearn.model_selection import train_test_split
X = df.cuisine_description
y = df.cuisine
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [53]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((27773,), (11904,), (27773,), (11904,))

In [54]:
X_train.isna().sum()

0

In [55]:
y_train.isna().sum()

0

In [56]:
y_train.fillna(y_train.mode(),inplace=True)

# Applying Logistic Regression

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

lr = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', LogisticRegression()),
              ])

lr.fit(X_train,y_train)
y_pred1 = lr.predict(X_test)

print(f"Accuracy is : {accuracy_score(y_pred1,y_test)}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy is : 0.7819220430107527


In [69]:
X_train.shape


(27773,)

In [63]:
X_train.head()

29300    ground ginger large eggs custard freshly groun...
13896    fresh tomatoes olive oil half half allpurpose ...
27312    black pepper cinnamon purple onion lamb fresh ...
9658     chili flakes minced garlic vegetable stock thy...
4410     cotija vegetable oil cilantro leaves crema mex...
Name: cuisine_description, dtype: object

In [66]:
filename = 'checkdata.xlsx'

X_train.to_excel(filename)

In [67]:
filename = 'output.xlsx'

y_train.to_excel(filename)

In [68]:
print(y_pred1)

['cajun_creole' 'southern_us' 'italian' ... 'italian' 'mexican' 'french']


In [76]:
data = ['chili flakes minced garlic vegetable stock thyme royal olives fennel salt bread pepper leeks san marzano diced tomatoes white wine olive oil old bay seasoning']

pred = lr.predict(data)

print(pred)


['italian']


In [78]:
# saving the model

import pickle

with open('lr.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)

