In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **In this notebook I intend to explore a dataset looking at different types of Ramen. I intend to do some basic exploratory data analysis and see what I can learn.**

Here I explore what makes ramen great and where it comes from!

To start with I do some basic exploratory data analysis to try and better understand the data.

Generally, the data has a lot of categorical variables, so the project focuses on feature engineering using NLP.

The final step is to try to build a predictive model for the rating of ramen based on the features. I employ a number of machine learning techniques.

Overall I found that the models were not very accurate at predicting lower ratings. This is likely because the dataset is skewed towards higher ratings; a broader dataset may therefore increase accuracy. Nonetheless, this was a fun project.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Use a larger default font for every figure in this notebook.
plt.rcParams.update({'font.size': '15'})

In [None]:
# Load the ramen ratings CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/ramen-ratings/ramen-ratings.csv')

In [None]:
# Print column names, non-null counts and dtypes.
df.info()

I am concerned that some of the columns I would expect to be numeric are objects, especially the stars.

In [None]:
# List the distinct raw values in the Stars column.
df['Stars'].unique()

In [None]:
# Turn the 'Unrated' placeholder into NaN and convert the column to numeric.
# The result is assigned back to df['Stars'] instead of calling
# replace(..., inplace=True) on the column: a chained in-place call is not
# guaranteed to modify df (it does not under pandas Copy-on-Write), which would
# leave 'Unrated' in place and make to_numeric raise.
df['Stars'] = pd.to_numeric(df['Stars'].replace('Unrated', np.nan))
df.info()

That's better: now the stars are listed as numeric. Let's recheck the dataframe.

In [None]:
# Show the first five rows (head() defaults to n=5).
df.head()

**Initial thoughts**

The review number column appears to just be an index, so I will drop it. It seems like there could be a lot of information in the Variety column, so it would be good to do some feature extraction on that. First, however, I want to do some basic exploratory data analysis to better understand the raw data.

In [None]:
# Remove the review-number column, then report how many distinct brands and
# countries appear in the data.
df.drop(columns='Review #', inplace=True)
print('The number of unique Brands is ', len(df['Brand'].unique()), sep='')
print('The number of unique countrys is ', len(df['Country'].unique()), sep='')

In [None]:
# Count rows per country, order by the Brand count (largest first) and keep
# only that count, exposed as a two-column frame: Country, Brand.
country_counts = df.groupby('Country').count()
brand_counts = country_counts.sort_values('Brand', ascending=False)['Brand']
df_sorted = brand_counts.reset_index().copy()

In [None]:
# Bar chart of the per-country counts held in df_sorted.
plt.figure(figsize=(10, 5))
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.bar(x=df_sorted['Country'], height=df_sorted['Brand'])

**Now I want to see the average ramen rating per country, and also add information on how many ramens are produced per country**

In [None]:
# Mean star rating per country, highest first.
df_ave_stars = (
    df[['Country', 'Stars']]
    .groupby('Country')
    .mean()
    .sort_values('Stars', ascending=False)
    .reset_index()
)
# Attach the per-country counts (the 'Brand' column of df_sorted) to the means.
df_joined = df_ave_stars.set_index('Country').join(df_sorted.set_index('Country')).reset_index()

plt.figure(figsize=(15, 5))
# One marker size per country, scaled from that country's count.
sizes = dict(zip(df_joined.Country, df_joined.Brand.values * 10))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. The former palette argument is dropped because no hue variable
# is assigned, so it had no effect on the plot.
sns.scatterplot(x='Country', y='Stars', data=df_joined,
                size='Country',
                sizes=sizes,
                legend=False,
                alpha=0.5,
                edgecolor='black',
                )
# Tick rotation and the y label are applied after plotting so they act on the
# final axes and are not replaced by seaborn's default labels.
plt.xticks(rotation=90)
plt.ylabel('Average Ramen Rating')


In [None]:
# Histogram of all star ratings, 20 bins.
plt.figure(figsize=(15, 5))
plt.grid()
plt.hist(x=df['Stars'], bins=20, edgecolor='k', align='mid')
plt.xlabel(xlabel='Stars')
plt.ylabel(ylabel='Number')

In [None]:
# Mean rating per packaging style, highest first (not used by the plot below,
# which shows the full rating distribution per style).
df_ave_stars_style = (
    df[['Style', 'Stars']]
    .groupby('Style')
    .mean()
    .sort_values('Stars', ascending=False)
    .reset_index()
)
plt.figure(figsize=(10, 5))
plt.grid()
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.boxplot(x=df.Style, y=df.Stars, palette='Set2')
# Label the axis after plotting so seaborn's own axis labelling cannot replace it.
plt.ylabel('Average Ramen Rating')


# Now to do some feature extraction from the variety column. It would be good to try and get an idea of flavour

**First I need to remove the punctuation**

In [None]:
import string 
def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    Parameters
    ----------
    text : iterable of str
        Usually a single string; it is scanned character by character.

    Returns
    -------
    str
        The remaining characters joined back together in their original order.
    """
    kept_chars = filter(lambda ch: ch not in string.punctuation, text)
    return "".join(kept_chars)

In [None]:
# Lower-case every variety name, then strip its punctuation.
lowered_variety = df.loc[:, 'Variety'].map(lambda name: name.lower())
df.loc[:, 'Variety'] = lowered_variety.map(remove_punctuation)

**Now to remove stopwords such as 'the', 'and' etc...**

In [None]:
from nltk.tokenize import RegexpTokenizer

In [None]:
# Split each variety string into a list of word tokens (runs of \w characters).
tokenizer = RegexpTokenizer(r'\w+')
df.loc[:, 'Variety'] = df.loc[:, 'Variety'].map(
    lambda name: tokenizer.tokenize(name.lower())
)
df.Variety

In [None]:
from nltk.corpus import stopwords
# Extra words filtered out in addition to the NLTK English stopwords.
specified_extra = ['noodles','noodle','flavour','artificial','ramen','instant','flavor','sauce','cup','bowl','rice']

# Build the exclusion set once, when this cell runs. Previously
# stopwords.words('english') was re-evaluated for every single token and
# searched as a list. Note: later changes to specified_extra are not picked up
# unless this cell is re-run.
_excluded_words = frozenset(stopwords.words('english')).union(specified_extra)


def remove_stopwords(text):
    """Drop English stopwords and the extra filler words from a token list.

    Parameters
    ----------
    text : iterable of str
        Tokens of one variety name.

    Returns
    -------
    list of str
        The tokens that are in neither the NLTK English stopword list nor
        ``specified_extra``, in their original order.
    """
    return [w for w in text if w not in _excluded_words]

In [None]:
# Filter every token list through remove_stopwords.
df.loc[:, 'Variety'] = df.loc[:, 'Variety'].apply(remove_stopwords)

**Now to recombine for analysis. I also take the opportunity to remove any repeats in the lists, before recombining**

In [None]:
# Remove repeated tokens and join the rest back into one space-separated string.
# dict.fromkeys keeps the first occurrence of each token in its original order.
# The previous list(set(x)) produced an order that can change from one Python
# session to the next, and later cells take the first matching word of this
# string as the flavour, so the result was not reproducible.
df.loc[:, 'Variety'] = df.loc[:, 'Variety'].apply(lambda x: " ".join(dict.fromkeys(x)))

In [None]:
# Begin vectorisation
from sklearn.feature_extraction.text import CountVectorizer

Now to generate the sparse matrix containing the top words, and to extract those words and their counts.

In [None]:
# Might take awhile...
max_feature_length = 10  # vocabulary size passed to CountVectorizer(max_features=...)
top_words = []

# Fit the vocabulary on the per-row variety strings.
bow_transformer = CountVectorizer(max_features=max_feature_length, ngram_range=(1, 1)).fit(df.loc[:, 'Variety'])
# Join all rows into one document so a single transform yields the total counts.
bow = bow_transformer.transform([' '.join(df.loc[:, 'Variety'].values)])
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on versions that do not have it yet.
if hasattr(bow_transformer, 'get_feature_names_out'):
    word_list = list(bow_transformer.get_feature_names_out())
else:
    word_list = bow_transformer.get_feature_names()
count_list = bow.toarray().sum(axis=0)
top_counts = pd.DataFrame(zip(word_list, count_list), columns=['term', 'count'])
top_counts.sort_values('count', axis=0, inplace=True, ascending=False)
top_counts

Create a new column with the flavours identified

In [None]:
# Keep only the words of each variety that are among the top terms, and store
# them as one space-separated string per row.
top_terms = top_counts.term.values
df['flavour'] = df.Variety.apply(
    lambda variety: " ".join(word for word in variety.split() if word in top_terms)
)

## Now I have the flavours identified, I want to see how it varies with the rating
First I create a new dataframe with the flavours split into separate columns, and add the rating column.

In [None]:
# Split the space-separated flavour string into separate columns, performing at
# most three splits. n is passed by keyword: pandas >= 2.0 only accepts the
# separator positionally, so the old positional 3 raises a TypeError there.
flavour = df['flavour'].str.split(' ', n=3, expand=True)

In [None]:
# Add the rating, blank out empty strings, rename 'tom' to 'tom yum', and take
# the first split column as the flavour label.
flavour['stars'] = df['Stars']
flavour = flavour.replace('', np.nan).replace('tom', 'tom yum')  # Change tom to tom yum
flavour['flavour'] = flavour.iloc[:, 0]

In [None]:
# Distribution of star ratings for each flavour label.
plt.figure(figsize=(15, 5))
plt.xticks(rotation=90)
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.boxplot(x='flavour', y='stars', data=flavour, palette='Set2')

**It seems that curry noodles are often rated highly. Seafood and shrimp along with chicken seem to be generally worse.** Seeing as some rows also have secondary flavours, this could be a great predictor of ratings, along with country, style and brand.

## From here I begin to make dummy variables for the flavour and try and predict the rating.

Now to create dummy variables for the top words found

As there are a large number of countries and brands, I use the BaseN encoder to reduce the dimensionality. For style I simply use `get_dummies`.

In [None]:
import category_encoders as ce

In [None]:
# Base-5 encode the Brand column.
encoder = ce.BaseNEncoder(base=5, cols=['Brand'], return_df=True)
data_encoded_brand = encoder.fit_transform(df['Brand'])

In [None]:
# Base-5 encode the Country column.
encoder = ce.BaseNEncoder(base=5, cols=['Country'], return_df=True)
data_encoded_Country = encoder.fit_transform(df['Country'])

In [None]:
# One indicator column per Style value, leaving out the first level.
data_encoded_styles = pd.get_dummies(df['Style'], drop_first=True)

In [None]:
# Base-5 encode the flavour label.
encoder = ce.BaseNEncoder(base=5, cols=['flavour'], return_df=True)
data_encoded_flavour = encoder.fit_transform(flavour['flavour'])

In [None]:
# Combine the encoded feature frames on their index and append the target last.
final_cleaned = (
    data_encoded_flavour
    .join(data_encoded_brand)
    .join(data_encoded_styles)
    .join(data_encoded_Country)
)
final_cleaned['Stars'] = df['Stars']

Now to check that all columns are numeric and ready to be used in prediction

In [None]:
# Print column names, non-null counts and dtypes of the assembled feature frame.
final_cleaned.info()

The `flavour_0`, `Brand_0` and `Country_0` columns all appear to have no values.

In [None]:
# Drop the leading encoder columns that carry no information.
# errors='ignore' makes the cell safe to re-run and tolerant of encoder output
# that does not contain these columns; without it a missing column raises
# KeyError.
final_cleaned.drop(columns=['flavour_0', 'Brand_0', 'Country_0'], inplace=True, errors='ignore')

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(20, 15))
correlations = final_cleaned.corr()
sns.heatmap(data=correlations, annot=True, cmap='viridis')

**Overall it actually seems like nothing is very correlated to stars, it therefore seems like machine learning models may struggle to predict the rating of Ramen.** Nonetheless, I apply some to confirm this hypothesis.

# Now to employ some ML techniques to predict the rating 


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Discard rows with missing values, then separate features from the target.
# Stars was appended last, so every column before it is a predictor.
final_cleaned.dropna(inplace=True)
feature_frame = final_cleaned.iloc[:, :-1]
x = feature_frame.values
y = final_cleaned['Stars'].values

In [None]:
# Hold out 25% of the rows for testing. A fixed random_state makes the split,
# and every score computed from it, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# Fit the scaler on the training data only and reuse its mean and variance for
# the test data. Re-fitting on the test set (the earlier fit_transform) scales
# the two sets differently and leaks test-set statistics.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares baseline: fit on the training set, predict the test set.
model = LinearRegression()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

In [None]:
# Predicted vs. actual rating for the linear regression model.
plt.figure(figsize=(15, 10))
plt.scatter(x=y_test, y=y_predict)
plt.xlabel('Y Actual')
plt.ylabel('Y Predicted')
plt.xlim(-0.5, 5.5)
plt.ylim(-0.5, 5.5)

In [None]:
from sklearn import neighbors
from sklearn.metrics import mean_squared_error 
from math import sqrt

In [None]:
# Evaluate k-nearest-neighbours regression for k = 1..20 and record the test
# RMSE obtained for each k.
rmse_val = []
for K in range(1, 21):
    model = neighbors.KNeighborsRegressor(n_neighbors=K)
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    error = sqrt(mean_squared_error(y_test, pred))
    rmse_val.append(error)
    print('RMSE value for k= ' , K , 'is:', error)

In [None]:
# Refit k-NN with k=7 and plot predicted vs. actual rating on the test set.
model = neighbors.KNeighborsRegressor(n_neighbors=7).fit(x_train, y_train)
y_predict_KNN = model.predict(x_test)

plt.figure(figsize=(15, 10))
plt.scatter(x=y_test, y=y_predict_KNN)
plt.xlabel('Y Actual')
plt.ylabel('Y Predicted')
plt.xlim(-0.5, 5.5)
plt.ylim(-0.5, 5.5)

**As I expected, the traditional machine learning models struggle to predict the rating of the ramen. We could try deep learning here to confirm this, but I'm not too hopeful.**

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Small fully connected regression network with a single linear output unit.
# The input width is taken from the training matrix rather than hard-coded, so
# the model still builds if the number of feature columns changes.
model = Sequential()
model.add(Dense(15, input_dim=x_train.shape[1], activation="relu"))
model.add(Dense(10, activation="relu"))
model.add(Dense(10, activation="relu"))
model.add(Dense(5, activation="relu"))
model.add(Dense(1))

In [None]:
# Train for 100 epochs with Adam, minimising (and reporting) mean squared error.
model.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=["mean_squared_error"],
)
history = model.fit(x=x_train, y=y_train, epochs=100)

In [None]:
# RMSE of the network on the training set.
pred_train = model.predict(x_train)
print(np.sqrt(mean_squared_error(y_train, pred_train)))

# RMSE of the network on the test set. This must score y_pred_NN; the earlier
# code scored `pred`, the leftover predictions from the k-NN sweep, so the
# printed value did not describe this model.
y_pred_NN = model.predict(x_test)
print(np.sqrt(mean_squared_error(y_test, y_pred_NN)))

In [None]:
# Training loss per epoch. The x axis is sized from the recorded history rather
# than a hard-coded 100, so the plot keeps working if the epoch count changes.
loss_history = history.history.get('loss')
plt.figure(figsize=(15, 5))
plt.plot(np.arange(len(loss_history)), loss_history)
plt.xlabel('Epoch')
plt.ylabel('MSE')

In [None]:
# Predicted vs. actual rating for the neural network.
plt.figure(figsize=(15, 10))
plt.scatter(x=y_test, y=y_pred_NN)
plt.xlabel('Y Actual')
plt.ylabel('Y Predicted')
plt.xlim(-0.5, 5.5)
plt.ylim(-0.5, 5.5)

Overall, given the lack of correlation between the predictors and the rating, it has proved hard to generate an accurate machine learning model. Nonetheless, it's been fun to use some NLP and ML and to explore the Ramen data set.