# Exploratory analysis of the Ramen Kaggle Data

In [1]:
import pandas as pd
import numpy as np
import os
# we will use pathlib later in the py file
from pathlib import Path
# custom made functions (can be a pain to import sometimes)
import sys
sys.path.insert(0,os.path.abspath('../src/helper'))
from customPandas import *

In [2]:
# BASE_DIR = Path(__file__).resolve().parent.parent

In [3]:
# https://stackoverflow.com/questions/39125532/file-does-not-exist-in-jupyter-notebook
dataPath = os.path.abspath('../data')
fileName = 'ramen-ratings.csv'
df = pd.read_csv(f'{dataPath}/{fileName}') 

In [4]:
df.head()

Unnamed: 0,Review #,Brand,Variety,Style,Country,Stars,Top Ten
0,2580,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75,
1,2579,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1.0,
2,2578,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,2.25,
3,2577,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,2.75,
4,2576,Ching's Secret,Singapore Curry,Pack,India,3.75,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2580 entries, 0 to 2579
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Review #  2580 non-null   int64 
 1   Brand     2580 non-null   object
 2   Variety   2580 non-null   object
 3   Style     2578 non-null   object
 4   Country   2580 non-null   object
 5   Stars     2580 non-null   object
 6   Top Ten   41 non-null     object
dtypes: int64(1), object(6)
memory usage: 141.2+ KB


In [6]:
df.shape

(2580, 7)

In [7]:
df.describe()

Unnamed: 0,Review #
count,2580.0
mean,1290.5
std,744.926171
min,1.0
25%,645.75
50%,1290.5
75%,1935.25
max,2580.0


In [8]:
pandasExploratoryData(df,['cat', 'num', 'gen', 'missing'])

Only printing the first 5 unique variables and 75 chars

Categorical variables --------------------------------------------


There are 355 unique val for Brand

There are 2413 unique val for Variety

There are 7 unique val for Style

There are 38 unique val for Country

There are 51 unique val for Stars

There are 38 unique val for Top Ten

Numerical variables --------------------------------------------

There are 2580 unique Review #
The median is  1290.5, mean 1290.5


Basic information --------------------------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2580 entries, 0 to 2579
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Review #  2580 non-null   int64 
 1   Brand     2580 non-null   object
 2   Variety   2580 non-null   object
 3   Style     2578 non-null   object
 4   Country   2580 non-null   object
 5   Stars     2580 non-null   object
 6   Top Ten   41 non-null     object
dtypes: in

## Cleaning the data
We are dropping the review # column because it doesn't provide any relevant info and we are dropping Top Ten because honestly only 41/2k+ rows are populated.

In [9]:
def cleanStars(value):
    if value == 'Unrated':
        return np.nan
    else:
        return value
df.Stars = df.Stars.apply(cleanStars) 
df[df.Stars == 'Unrated'].Stars

Series([], Name: Stars, dtype: object)

In [10]:
# dropping Columns
df = df.drop(['Review #','Top Ten'],axis=1)

In [11]:
# changing an incorrect dtype
df = df.astype({'Stars':'float64'})

In [12]:
# for the test train split later we separate Stars
Y = df['Stars']
df = df.drop(['Stars'],axis=1)

In [13]:
Y.head()

0    3.75
1    1.00
2    2.25
3    2.75
4    3.75
Name: Stars, dtype: float64

In [14]:
df.head()

Unnamed: 0,Brand,Variety,Style,Country
0,New Touch,T's Restaurant Tantanmen,Cup,Japan
1,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan
2,Nissin,Cup Noodles Chicken Vegetable,Cup,USA
3,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan
4,Ching's Secret,Singapore Curry,Pack,India


## Feature engineering

We need to do some NLP in the Variety section so that we keep the relecant elements like ingredients(e.g. curry, sesame), location(e.g. singapore) and relevant adjectives (e.g. Hot, sour) 

In [15]:
from nltk.corpus import stopwords
import string
def nltkPreprocess(text):
    """
    Description:
        - Takes a text and cleans it by removing useless punctuations and stopwords
    input:
        -string
    output:
        - list of strings
    """
    # lower
    words = text.lower().split()
    # remove punctiuation
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in words]
    #remove stopwords
    stop_words = set(stopwords.words('english'))
    clean = [w for w in stripped if not w in stop_words if len(w)>2]
    return clean 


def cleanVariety(col):
    """
    description:
        - adaptation for 
    """
    cleanCol = list()

    for v in col['Variety'].to_list():

        cleanCol.append(nltkPreprocess(v))
    return col

In [16]:
# we are trying to make feature engineering part of our pipeline
# sklearn pandas is not part of sklearn
# why use sklearn-pandas? https://stackoverflow.com/questions/39406539/sklearn-function-transformer-in-pipeline
# https://github.com/scikit-learn-contrib/sklearn-pandas
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_transformer
mapper = ColumnTransformer(transformers=[
('VarietyCat', FunctionTransformer(cleanVariety, validate=False),['Variety'])], remainder='passthrough')


In [17]:
df.__getitem__('Variety')[0]

"T's Restaurant Tantanmen "

In [18]:
mapper.fit_transform(df)

array([["T's Restaurant Tantanmen ", 'New Touch', 'Cup', 'Japan'],
       ['Noodles Spicy Hot Sesame Spicy Hot Sesame Guan-miao Noodles',
        'Just Way', 'Pack', 'Taiwan'],
       ['Cup Noodles Chicken Vegetable', 'Nissin', 'Cup', 'USA'],
       ...,
       ['Tom Yum Shrimp', 'Wai Wai', 'Pack', 'Thailand'],
       ['Tom Yum Chili Flavor', 'Wai Wai', 'Pack', 'Thailand'],
       ['Miso Ramen', 'Westbrae', 'Pack', 'USA']], dtype=object)

## Modeling using pipelines


In [19]:


# columns list for cat and num dtypes
# https://stackoverflow.com/questions/50965004/sklearn-custom-transformers-difference-between-using-functiontransformer-and-su
catFeats = df.dtypes[df.dtypes == 'object'].index.tolist()
numFeats = df.dtypes[~df.dtypes.index.isin(catFeats)].index.tolist()

# methods for Function Transformer 
def numFeat(df):
    return data[numFeats]

def catFeat(df):
    return df[catFeats]


#! need to review and learn more about FunctionTransformer
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html#sklearn.preprocessing.FunctionTransformer
# This goes without says that I also must review fit and transform
keepNum = FunctionTransformer(numFeat)
keepCat = FunctionTransformer(catFeat)

In [20]:
from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score

In [21]:
# preprocessing categorical pipeline
#simple Imputer is not really needed since there are nill NaNs
categoricalTransformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing'))])
#adding our custom function
categoricalTransformer.steps.append(['singleColTest',mapper])
# dummy cariable creation with one hot encoder
categoricalTransformer.steps.append(['ohc',OneHotEncoder(handle_unknown='ignore')])
# Dense Transformer (.todense)()) needed after onehotencoder
categoricalTransformer.steps.append(['to_dense',ToDenseTransformer()])
# we only want the PCAs with the most variance
categoricalTransformer.steps.append(['pca',PCA(n_components=3)])

In [22]:
# very usefull pipeline visualization
from sklearn import set_config
set_config(display='diagram')
categoricalTransformer

In [23]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import BayesianRidge, Lasso, Ridge
regressors = [BayesianRidge(),Lasso(), Ridge()]

In [24]:
# hyperparameter tuning for the Randomized Search
params = [{'regressors':[Lasso()],
'regressors__alpha' : [round(x,2) for x in np.linspace(start=0.01,stop =1,num=8)],
# preprocessing pipeline
'preprocessor__cat__pca__n_components':range(3,5,1),
'preprocessor__num__kbest__k':range(2,5,1)},

{'regressors':[Ridge()],
'regressors__alpha' : [round(x,2) for x in np.linspace(start=0.01,stop =1,num=8)],
# preprocessing pipeline
'preprocessor__cat__pca__n_components':range(3,5,1),
'preprocessor__num__kbest__k':range(2,5,1)
}]
# if looking for specific parameters use the get_params methods (e.g.Lasso().get_params())

In [25]:
baseModel = Pipeline([('categories', categoricalTransformer),
                      ('regressors', regressors[0])])

In [36]:
categoricalTransformer.get_params()

{'memory': None,
 'steps': [('imputer',
   SimpleImputer(fill_value='missing', strategy='constant')),
  ['singleColTest',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('VarietyCat',
                                    FunctionTransformer(func=<function cleanVariety at 0x7f8492d90310>),
                                    ['Variety'])])],
  ['ohc', OneHotEncoder(handle_unknown='ignore')],
  ['to_dense', <customPandas.ToDenseTransformer at 0x7f848a2e46d0>],
  ['pca', PCA(n_components=3)]],
 'verbose': False,
 'imputer': SimpleImputer(fill_value='missing', strategy='constant'),
 'singleColTest': ColumnTransformer(remainder='passthrough',
                   transformers=[('VarietyCat',
                                  FunctionTransformer(func=<function cleanVariety at 0x7f8492d90310>),
                                  ['Variety'])]),
 'ohc': OneHotEncoder(handle_unknown='ignore'),
 'to_dense': <customPandas.ToDenseTransformer at 0x7f848a2e46d0>,
 'pca'

In [26]:
baseModel

In [27]:
from sklearn.model_selection import train_test_split
# split of the datasla
X_train, X_test, y_train, y_test = train_test_split(df,Y,test_size=0.33, random_state=42)

In [28]:
tunedModel = RandomizedSearchCV(baseModel,params,verbose=1,n_iter=150, cv=5,random_state=42).fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


ValueError: Invalid parameter preprocessor for estimator Pipeline(steps=[('categories',
                 Pipeline(steps=[('imputer',
                                  SimpleImputer(fill_value='missing',
                                                strategy='constant')),
                                 ['singleColTest',
                                  ColumnTransformer(remainder='passthrough',
                                                    transformers=[('VarietyCat',
                                                                   FunctionTransformer(func=<function cleanVariety at 0x7f8492d90310>),
                                                                   ['Variety'])])],
                                 ['ohc',
                                  OneHotEncoder(handle_unknown='ignore')],
                                 ['to_dense',
                                  <customPandas.ToDenseTransformer object at 0x7f848a276a60>],
                                 ['pca', PCA(n_components=3)]])),
                ('regressors', Lasso())]). Check the list of available parameters with `estimator.get_params().keys()`.

In [31]:
'singleColTest'.get_params().keys()

AttributeError: 'str' object has no attribute 'get_params'

In [None]:
#important train of thoughts
"""
# I was super confused as there is 2 ways to do custom functions(1. Function transformer 2.TransformerMixin (subclassing)) 
https://stackoverflow.com/questions/50965004/sklearn-custom-transformers-difference-between-using-functiontransformer-and-su
https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65
# 
https://stackoverflow.com/questions/39406539/sklearn-function-transformer-in-pipeline
https://stackoverflow.com/questions/38466432/for-what-is-useful-scikit-learn-functiontransformer
# good examples for pipelines
https://queirozf.com/entries/scikit-learn-pipelines-custom-pipehttps://queirozf.com/entries/scikit-learn-pipelines-custom-pipelines-and-pandas-integration#columntransformer-example-missing-imputationlines-and-pandas-integration#columntransformer-example-missing-imputation
# how I wasted alot of time
https://stackoverflow.com/questions/62079006/sklearn-pipeline-argument-of-type-columntransformer-is-not-iterable
https://stackoverflow.com/questions/28822756/getting-model-attributes-from-scikit-learn-pipeline
# don't use sklearn_pandas and stick with sklearn
https://stackoverflow.com/questions/40352176/whats-the-difference-between-sklearn-pipeline-and-dataframemapper
"""