# Task for Today  

***

## Ice Cream Rating Prediction  

Given *data about various ice creams and their ingredients*, let's try to predict the **average user rating** of a given ice cream.  
  
We will use linear regression models (plain, ridge, and lasso) to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

import re
from nltk.stem import PorterStemmer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [None]:
# Load the combined products table; the path is relative to the notebook's
# working directory.
data = pd.read_csv('../input/ice-cream-dataset/combined/products.csv')

In [None]:
data

# Preprocessing

In [None]:
# These identifier / free-text columns are not used as model features.
data = data.drop(columns=['key', 'name', 'subhead', 'description'])

In [None]:
data

In [None]:
# Remove rows whose rating_count is below 10, then renumber the index from 0.
# The mask is negated (rather than using >= 10) so rows with a missing
# rating_count are kept.
data = data.loc[~(data['rating_count'] < 10)].reset_index(drop=True)

In [None]:
# rating_count was only needed for the filter above; it is not a feature.
data = data.drop(columns='rating_count')

In [None]:
data

In [None]:
def process_ingredients(ingredients):
    """Split a raw ingredient string into a list of cleaned, stemmed tokens.

    Cleaning steps, in order:
      1. Remove parenthesised text.
      2. Remove everything from 'CONTAINS:' to the end of the string.
      3. Replace a span from a period up to the next colon with a comma.
      4. Treat ' AND/OR ' and ' AND ' as separators.
      5. Split on commas, then for each token strip marker characters,
         remove any leading 'label:' prefix, trim whitespace and stem it.

    Args:
        ingredients: The raw ingredient text for one product.

    Returns:
        A list of processed ingredient strings. Empty strings may appear in
        the list (e.g. from consecutive separators); callers filter them.
        A non-string input (such as the NaN pandas uses for a missing cell)
        yields an empty list instead of raising.
    """
    # Missing values arrive as float NaN / None; there is nothing to parse.
    if not isinstance(ingredients, str):
        return []

    ps = PorterStemmer()

    text = re.sub(r'\(.*?\)', '', ingredients)
    text = re.sub(r'CONTAINS:.*$', '', text)
    text = re.sub(r'\..*?:', ',', text)
    # ' AND/OR ' must be handled before ' AND ' and before '/' is replaced.
    text = re.sub(r' AND/OR ', ',', text)
    text = re.sub(r' AND ', ',', text)

    processed = []
    for token in text.split(','):
        token = token.replace('†', '').replace('*', ' ').replace(')', '').replace('/', ' ')
        # Drop a leading section label such as 'SOMETHING:'.
        token = re.sub(r'^.+:', '', token)
        # The whole (possibly multi-word) token is passed to the stemmer.
        token = ps.stem(token.strip())
        # Normalise the two spellings to a single feature.
        # NOTE(review): assumes the stemmer output is lower-case - confirm.
        if token == 'milk fat':
            token = 'milkfat'
        processed.append(token)
    return processed

In [None]:
# Collect every distinct processed ingredient across the dataset.

all_ingredients = set()

for raw_ingredients in data['ingredients']:
    # set.update de-duplicates on its own, so no membership test is needed.
    all_ingredients.update(process_ingredients(raw_ingredients))

# Splitting can leave empty tokens. discard() removes one if present and,
# unlike remove(), does not raise KeyError when none was produced.
all_ingredients.discard('')

In [None]:
all_ingredients

In [None]:
data

In [None]:
# Separate the regression target ('rating') from the feature columns.
X = data.drop(columns='rating')
y = data['rating']

In [None]:
X

In [None]:
def onehot_encode(df, column, prefix):
    """Return a new frame with `column` replaced by one-hot indicator columns.

    The indicator columns are named '<prefix>_<value>' and are appended after
    the existing columns. The input frame is not modified.
    """
    indicators = pd.get_dummies(df[column], prefix=prefix)
    # concat builds a new frame, so the caller's df is left untouched.
    return pd.concat([df, indicators], axis=1).drop(columns=column)

In [None]:
# Replace the 'brand' column with indicator columns prefixed 'b_'.
X = onehot_encode(X, column='brand', prefix='b')

In [None]:
X

In [None]:
# Turn each raw ingredient string into its list of processed tokens.
X['ingredients'] = X['ingredients'].map(process_ingredients)

In [None]:
X

In [None]:
# Pull out the 'ingredients' column on its own (a single-column selection,
# despite the _df suffix) so it can be binarized separately.
ingredients_df = X['ingredients']
ingredients_df

In [None]:
mlb = MultiLabelBinarizer()

# One indicator column per distinct ingredient token. The columns are labelled
# with the ingredient names (mlb.classes_) rather than default integers, so
# the feature matrix keeps all-string column names and stays interpretable.
# The source index is reused so rows line up when concatenated back onto X.
# Arguments are evaluated left to right: fit_transform runs first, so
# mlb.classes_ is populated by the time it is read.
ingredients_df = pd.DataFrame(
    mlb.fit_transform(ingredients_df),
    index=ingredients_df.index,
    columns=mlb.classes_,
)

In [None]:
ingredients_df

In [None]:
# Append the ingredient indicator columns, then discard the raw list column.
X = pd.concat([X, ingredients_df], axis=1).drop(columns='ingredients')

In [None]:
X

In [None]:
y

# Training

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=100)

## Without Regularization

In [None]:
# Baseline: linear regression with no regularization penalty.
model = LinearRegression()

model.fit(X_train, y_train)

In [None]:
# Score on the held-out split (for scikit-learn regressors this is R^2).
model.score(X_test, y_test)

## With L2 (Ridge) Regularization

In [None]:
# Ridge regression: linear model with an L2 penalty; alpha sets its strength.
l2_model = Ridge(alpha=1000.0)

l2_model.fit(X_train, y_train)

In [None]:
# Score on the held-out split (for scikit-learn regressors this is R^2).
l2_model.score(X_test, y_test)

## With L1 (Lasso) Regularization

In [None]:
# Lasso regression: linear model with an L1 penalty; alpha sets its strength.
l1_model = Lasso(alpha=0.1)

l1_model.fit(X_train, y_train)

In [None]:
# Score on the held-out split (for scikit-learn regressors this is R^2).
l1_model.score(X_test, y_test)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/Wz6oUPkeZvY