In [307]:
import pandas as pd
import numpy as np
from itertools import chain
from sklearn.preprocessing import LabelEncoder,Imputer
import torch
import torch.nn.functional as F

In [176]:
df = pd.read_csv('./data/responses.csv')

In [177]:
df.head()

Unnamed: 0,Timestamp,Email Address,Name,Neighbourhood,Rating,Brands,Flavor,Packaging,Source,Type,Frequency,Price,Quality,Occasion,New_flavors
0,2/6/2018 19:12:51,,Neha Baranwal,Thane,10,"Mother Dairy, Amul","Chocolate, Butterscotch",Cone,"Supermarkets, Ice cream parlour/ restaurants",Unit,Once a week,7.0,10.0,5,5.0
1,2/6/2018 19:17:21,,Arvind Narayanan,Chembur,8,"Dinshaw’s, Amul, Kwality Walls","Chocolate, Vanilla, Butterscotch, Pistachio",Cone,"Ice cream parlour/ restaurants, Minimart",Unit,Once a month,7.0,8.0,10,4.0
2,2/6/2018 21:01:58,2015bhavika.adnani@ves.ac.in,Bhavika,Thane,8,"Baskin-Robbins, Amul, Kwality Walls","Chocolate, Vanilla, Coffee",Cone,"Ice cream parlour/ restaurants, Ice cream cart...",Unit,Once a month,1.0,10.0,8,6.0
3,2/6/2018 21:02:15,2015mayank.agrawal@ves.ac.in,Mayank Agrawal,Chembur,9,"Baskin-Robbins, Amul, Creambell","Chocolate, Butterscotch",Cone,Ice cream parlour/ restaurants,Unit,Once a month,2.0,10.0,9,4.0
4,2/6/2018 21:09:39,2015bhuvanesh.goplani@ves.ac.in,Bhuvanesh Goplani,Chembur,4,"Baskin-Robbins, Home made","Vanilla, Strawberry, Butterscotch",Tub,Ice cream parlour/ restaurants,Unit,Once a month,3.0,10.0,6,5.0


## Preprocessing

### Removing redundant columns

In [178]:
df.columns

Index(['Timestamp', 'Email Address', 'Name', 'Neighbourhood', 'Rating',
       'Brands', 'Flavor', 'Packaging', 'Source', 'Type', 'Frequency', 'Price',
       'Quality', 'Occasion', 'New_flavors'],
      dtype='object')

In [179]:
df = df.drop(['Timestamp', 'Email Address', 'Name', 'Type'], 1)

### Dealing with null values

In [180]:
df.isnull().sum()

Neighbourhood    6
Rating           0
Brands           1
Flavor           0
Packaging        0
Source           1
Frequency        1
Price            1
Quality          2
Occasion         0
New_flavors      1
dtype: int64

In [181]:
def convStringToCat(df, column):
    nullCols = df[column].isnull()
    df[column][nullCols] = 'NaN'
    df[column] = df[column].str.lower().str.strip()
    le = LabelEncoder()
    le.fit(df[column].values)
    df[column] = le.transform(df[column].values)
    df[column][nullCols] = np.nan
    return le

In [182]:
classes = ['Neighbourhood', 'Frequency', 'Packaging']
labelEncoders = {}
for class_ in classes:
    labelEncoders[class_] = convStringToCat(df, class_)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [183]:
def removeMissingVals(col):
    imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
    df[col]=imp.fit_transform(df[[col]])

removeMissingVals("Neighbourhood")
removeMissingVals("Price")
removeMissingVals("Quality")
removeMissingVals("New_flavors")
removeMissingVals("Frequency")

df[['Neighbourhood','Price','Quality', 'New_flavors', 'Frequency']].isnull().sum()

Neighbourhood    0
Price            0
Quality          0
New_flavors      0
Frequency        0
dtype: int64

### Splitting multivalued attribues

In [184]:
def fetchOneHotColumns(df, column):
    type_columns = [type_.split(", ") if type(type_) != float else [] for type_ in df[column]]
    types = list(set(chain.from_iterable(type_columns)))
    typeDict = {}
    for type_ in types:
        typeDict[type_] = []
    for type_set in type_columns:
        for type_ in typeDict:
            if type_ in type_set:
                typeDict[type_].append(1)
            else:
                typeDict[type_].append(0)
    df_types = pd.DataFrame(typeDict)
    if 'any brand ' in df_types.columns:
        df_types = df_types.drop('any brand ', axis=1)
    return types, df_types

In [185]:
multivaluedCols = ['Brands', 'Flavor', 'Source']
multivaluedColDict = {}
for col in multivaluedCols:
    types, df_types = fetchOneHotColumns(df, col)
    multivaluedColDict[col] = types
    df = df.join(df_types).drop(col, axis=1)

In [186]:
df.isnull().sum()

Neighbourhood                     0
Rating                            0
Packaging                         0
Frequency                         0
Price                             0
Quality                           0
Occasion                          0
New_flavors                       0
Amul                              0
Baskin-Robbins                    0
Creambell                         0
Dinshaw’s                         0
Gelato Italiano                   0
Havmor                            0
Home made                         0
Kwality Walls                     0
Local Icecream shops              0
London Dairy                      0
Mother Dairy                      0
Naturals                          0
Vadilal                           0
patanjali icecreame               0
Butterscotch                      0
Chocolate                         0
Coffee                            0
Custard apple                     0
Mango                             0
Mint                        

### Label encoding and one hot encoding

## Visualization

### Location wise distribution of flavors and brands

### Area wise dominating factor

### Brand vs like-scale mapping

### Source vs Location

### Brand vs Flavor

### Rating vs (Price, Quality, Occasion and )

## Prediction

### Categorical embeddings

In [246]:
def getEmbeddings(inputSize, embeddingSize):
    return torch.nn.Embedding(inputSize, embeddingSize)

In [294]:
def getIndicesDict(df, column):
    return {j:i for i, j in zip(range(len(df[column].unique())), df[column].unique())}

In [248]:
embeddings = {col: getEmbeddings(len(indices[col]), (len(indices[col]) + 1) // 2) for col in catColumns}

In [249]:
embeddings

{'Neighbourhood': Embedding(17, 9), 'Packaging': Embedding(4, 2)}

In [250]:
embeddings['Packaging'](torch.autograd.Variable(torch.LongTensor([1])))

Variable containing:
-0.6498 -0.6406
[torch.FloatTensor of size 1x2]

In [295]:
def getEmbAndIndices(df):
    catColumns = ['Neighbourhood', 'Packaging']
    indices = {col: getIndicesDict(df, col) for col in catColumns}
    embeddings = {col: getEmbeddings(len(indices[col]), (len(indices[col]) + 1) // 2) for col in catColumns}
    indices = {col: getIndicesDict(df, col) for col in catColumns}
    multivalCols = {'brand': ['Amul', 'Baskin-Robbins', 'Creambell','Dinshaw’s', 'Gelato Italiano ', 
                     'Havmor', 'Home made', 'Kwality Walls',
                    'Local Icecream shops', 'London Dairy', 'Mother Dairy', 'Naturals', 
                     'Vadilal', 'patanjali icecreame'],
                   'flavor': ['Butterscotch', 'Chocolate', 'Coffee', 'Custard apple ', 
                    'Mango', 'Mint', 'Oreo', 'Pistachio', 'Strawberry',
                    'Vanilla', 'mango ', 'sabudaana']}
    for multivalCol in multivalCols:
        indices[multivalCol] = {col:i for i,col in zip(range(len(multivalCols[multivalCol])), multivalCols[multivalCol])}
        embeddings[multivalCol] = getEmbeddings(len(multivalCols[multivalCol]), (len(multivalCols[multivalCol]) + 1) // 2)
    return embeddings, multivalCols

In [296]:
embeddings, multivalCols = getEmbAndIndices(df)

In [298]:
def getEmbeddedrow(row, embeddings, multivalCols):
    x = embeddings['Neighbourhood'](torch.autograd.Variable(torch.LongTensor([indices['Neighbourhood'][row[-2]]])))
    y = embeddings['Packaging'](torch.autograd.Variable(torch.LongTensor([indices['Packaging'][row[-1]]])))
    x = torch.cat([x, y], dim=1)
    cumulative = 0
    for i, multivalCol in enumerate(multivalCols):
        length = len(multivalCols[multivalCol])
        for j, cat in enumerate(row[cumulative:cumulative + length]):
            if cat == 1:
                y = embeddings[multivalCol](torch.autograd.Variable(torch.LongTensor([j])))
            else:
                y = torch.autograd.Variable(torch.FloatTensor(torch.zeros(1, embeddings[multivalCol].embedding_dim)), requires_grad=False)
            x = torch.cat([x, y], dim=1)
    return x

In [None]:
x_numpy = df[ multivalCols['brand'] + multivalCols['flavor'] + ["Neighbourhood", "Packaging"]].as_matrix()

In [279]:
X = torch.autograd.Variable(torch.FloatTensor(181, 1))

In [301]:
x_numpy = df[ multivalCols['brand'] + multivalCols['flavor'] + ["Neighbourhood", "Packaging"]].as_matrix()
X = getEmbeddedrow(x_numpy[0], embeddings, multivalCols)
for row in x_numpy[1:]:
    Y = getEmbeddedrow(row, embeddings, multivalCols)
    X = torch.cat([X, Y])

In [333]:
class DeepNeuroModel(torch.nn.Module):
    def __init__(self):
        super(DeepNeuroModel, self).__init__()
        self.l1 = torch.nn.Linear(181, 210)
        self.l2 = torch.nn.Linear(210, 50)
        self.l3 = torch.nn.Linear(50, 10)
        self.criterion = torch.nn.BCELoss(size_average=True)
        self.optimizer = torch.optim.SGD(self.parameters(), lr=0.01)
        self.embeddings, self.multivalCols = getEmbAndIndices(df)

    def forward(self, x_numpy):
        X = getEmbeddedrow(x_numpy[0], embeddings, multivalCols)
        for row in x_numpy[1:]:
            Y = getEmbeddedrow(row, embeddings, multivalCols)
            X = torch.cat([X, Y])  
        out1 = F.relu(self.l1(X))
        out2 = F.relu(self.l2(out1))
        y_pred = F.sigmoid(self.l3(out2))
        return y_pred

    def optimize(self, y, y_pred, epoch):
        loss = self.criterion(y_pred, y)
        if epoch % 50 == 0:
            print("Loss =", loss.data[0])
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [334]:
def one_hot(batch,depth):
    ones = torch.sparse.torch.eye(depth)
    return ones.index_select(0,batch)

In [None]:
x_numpy = df[ multivalCols['brand'] + multivalCols['flavor'] + ["Neighbourhood", "Packaging"]].as_matrix()
y = torch.autograd.Variable(one_hot(torch.LongTensor((df['Rating'] - 1).tolist()), 10), requires_grad=False)

model = DeepNeuroModel()

for i in range(500):
    y_pred = model.forward(x_numpy)
    model.optimize(y, y_pred, i)

Loss = 0.7073090076446533
Loss = 0.6855776309967041
Loss = 0.6670135259628296
Loss = 0.6494055390357971
Loss = 0.6309511661529541
Loss = 0.6100621223449707
Loss = 0.5848496556282043
Loss = 0.5538166761398315
