In [105]:
import pandas as pd
import numpy as np
from itertools import chain
from sklearn.preprocessing import LabelEncoder,Imputer

In [106]:
df = pd.read_csv('./data/responses.csv')

In [107]:
df.head()

Unnamed: 0,Timestamp,Email Address,Name,Neighbourhood,Rating,Brands,Flavor,Packaging,Source,Type,Frequency,Price,Quality,Occasion,New_flavors
0,2/6/2018 19:12:51,,Neha Baranwal,Thane,10,"Mother Dairy, Amul","Chocolate, Butterscotch",Cone,"Supermarkets, Ice cream parlour/ restaurants",Unit,Once a week,7.0,10.0,5,5.0
1,2/6/2018 19:17:21,,Arvind Narayanan,Chembur,8,"Dinshaw’s, Amul, Kwality Walls","Chocolate, Vanilla, Butterscotch, Pistachio",Cone,"Ice cream parlour/ restaurants, Minimart",Unit,Once a month,7.0,8.0,10,4.0
2,2/6/2018 21:01:58,2015bhavika.adnani@ves.ac.in,Bhavika,Thane,8,"Baskin-Robbins, Amul, Kwality Walls","Chocolate, Vanilla, Coffee",Cone,"Ice cream parlour/ restaurants, Ice cream cart...",Unit,Once a month,1.0,10.0,8,6.0
3,2/6/2018 21:02:15,2015mayank.agrawal@ves.ac.in,Mayank Agrawal,Chembur,9,"Baskin-Robbins, Amul, Creambell","Chocolate, Butterscotch",Cone,Ice cream parlour/ restaurants,Unit,Once a month,2.0,10.0,9,4.0
4,2/6/2018 21:09:39,2015bhuvanesh.goplani@ves.ac.in,Bhuvanesh Goplani,Chembur,4,"Baskin-Robbins, Home made","Vanilla, Strawberry, Butterscotch",Tub,Ice cream parlour/ restaurants,Unit,Once a month,3.0,10.0,6,5.0


## Preprocessing

### Removing redundant columns

In [108]:
df.columns

Index(['Timestamp', 'Email Address', 'Name', 'Neighbourhood', 'Rating',
       'Brands', 'Flavor', 'Packaging', 'Source', 'Type', 'Frequency', 'Price',
       'Quality', 'Occasion', 'New_flavors'],
      dtype='object')

In [109]:
df = df.drop(['Timestamp', 'Email Address', 'Name', 'Type'], 1)

### Dealing with null values

In [110]:
df.isnull().sum()

Neighbourhood    6
Rating           0
Brands           1
Flavor           0
Packaging        0
Source           1
Frequency        1
Price            1
Quality          2
Occasion         0
New_flavors      1
dtype: int64

In [111]:
def convStringToCat(df, column):
    nullCols = df[column].isnull()
    df[column][nullCols] = 'NaN'
    df[column] = df[column].str.lower().str.strip()
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column].values)
    df[column][nullCols] = np.nan

In [112]:
convStringToCat(df, 'Neighbourhood')
convStringToCat(df, 'Frequency')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [113]:
def removeMissingVals(col):
    imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
    df[col]=imp.fit_transform(df[[col]])

removeMissingVals("Neighbourhood")
removeMissingVals("Price")
removeMissingVals("Quality")
removeMissingVals("New_flavors")
removeMissingVals("Frequency")

df[['Neighbourhood','Price','Quality', 'New_flavors', 'Frequency']].isnull().sum()

Neighbourhood    0
Price            0
Quality          0
New_flavors      0
Frequency        0
dtype: int64

### Splitting multivalued attribues

In [114]:
def fetchOneHotColumns(df, column):
    type_columns = [type_.split(", ") if type(type_) != float else [] for type_ in df[column]]
    types = list(set(chain.from_iterable(type_columns)))
    typeDict = {}
    for type_ in types:
        typeDict[type_] = []
    for type_set in type_columns:
        for type_ in typeDict:
            if type_ in type_set:
                typeDict[type_].append(1)
            else:
                typeDict[type_].append(0)
    df_types = pd.DataFrame(typeDict)
    if 'any brand ' in df_types.columns:
        df_types = df_types.drop('any brand ', axis=1)
    return types, df_types

In [115]:
multivaluedCols = ['Brands', 'Flavor', 'Source']
multivaluedColDict = {}
for col in multivaluedCols:
    types, df_types = fetchOneHotColumns(df, col)
    multivaluedColDict[col] = types
    df = df.join(df_types).drop(col, axis=1)

In [116]:
df.isnull().sum()

Neighbourhood                     0
Rating                            0
Packaging                         0
Frequency                         0
Price                             0
Quality                           0
Occasion                          0
New_flavors                       0
Amul                              0
Baskin-Robbins                    0
Creambell                         0
Dinshaw’s                         0
Gelato Italiano                   0
Havmor                            0
Home made                         0
Kwality Walls                     0
Local Icecream shops              0
London Dairy                      0
Mother Dairy                      0
Naturals                          0
Vadilal                           0
patanjali icecreame               0
Butterscotch                      0
Chocolate                         0
Coffee                            0
Custard apple                     0
Mango                             0
Mint                        

### Label encoding and one hot encoding

## Visualization

### Location wise distribution of flavors and brands

### Area wise dominating factor

### Brand vs like-scale mapping

### Source vs Location

### Brand vs Flavor

### Rating vs (Price, Quality, Occasion and )

## Prediction