In [1]:
import pandas as pd
import numpy as np
from itertools import chain
from sklearn.preprocessing import LabelEncoder,Imputer

In [2]:
df = pd.read_csv('./data/responses.csv')

In [3]:
df.head()

Unnamed: 0,Timestamp,Email Address,Name,Neighbourhood,Rating,Brands,Flavor,Packaging,Source,Type,Frequency,Price,Quality,Occasion,New_flavors
0,2/6/2018 19:12:51,,Neha Baranwal,Thane,10,"Mother Dairy, Amul","Chocolate, Butterscotch",Cone,"Supermarkets, Ice cream parlour/ restaurants",Unit,Once a week,7.0,10.0,5,5.0
1,2/6/2018 19:17:21,,Arvind Narayanan,Chembur,8,"Dinshaw’s, Amul, Kwality Walls","Chocolate, Vanilla, Butterscotch, Pistachio",Cone,"Ice cream parlour/ restaurants, Minimart",Unit,Once a month,7.0,8.0,10,4.0
2,2/6/2018 21:01:58,2015bhavika.adnani@ves.ac.in,Bhavika,Thane,8,"Baskin-Robbins, Amul, Kwality Walls","Chocolate, Vanilla, Coffee",Cone,"Ice cream parlour/ restaurants, Ice cream cart...",Unit,Once a month,1.0,10.0,8,6.0
3,2/6/2018 21:02:15,2015mayank.agrawal@ves.ac.in,Mayank Agrawal,Chembur,9,"Baskin-Robbins, Amul, Creambell","Chocolate, Butterscotch",Cone,Ice cream parlour/ restaurants,Unit,Once a month,2.0,10.0,9,4.0
4,2/6/2018 21:09:39,2015bhuvanesh.goplani@ves.ac.in,Bhuvanesh Goplani,Chembur,4,"Baskin-Robbins, Home made","Vanilla, Strawberry, Butterscotch",Tub,Ice cream parlour/ restaurants,Unit,Once a month,3.0,10.0,6,5.0


## Preprocessing

### Removing redundant columns

In [4]:
df.columns

Index(['Timestamp', 'Email Address', 'Name', 'Neighbourhood', 'Rating',
       'Brands', 'Flavor', 'Packaging', 'Source', 'Type', 'Frequency', 'Price',
       'Quality', 'Occasion', 'New_flavors'],
      dtype='object')

In [5]:
df = df.drop(['Timestamp', 'Email Address', 'Name'], 1)

### Dealing with null values

In [6]:
df.isnull().sum()

Neighbourhood    6
Rating           0
Brands           1
Flavor           0
Packaging        0
Source           1
Type             1
Frequency        1
Price            1
Quality          2
Occasion         0
New_flavors      1
dtype: int64

In [7]:
df.Neighbourhood[df.Neighbourhood.isnull()] = 'NaN'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [8]:
df.Neighbourhood = df.Neighbourhood.str.lower().str.strip()
df.Neighbourhood

0               thane
1             chembur
2               thane
3             chembur
4             chembur
5          ulhasnagar
6             chembur
7          ulhasnagar
8             chembur
9          ulhasnagar
10            chembur
11            chembur
12            chembur
13    vikholri,mumbai
14         ulhasnagar
15              thane
16             kalyan
17         kanjurmarg
18            chembur
19         ulhasnagar
20            chembur
21            chembur
22                nan
23             kalyan
24              dadar
25         ulhasnagar
26         california
27         vile parle
28           dombivli
29          ghatkopar
30            chembur
31          ghatkopar
32         ulhasnagar
33        lower parel
34              thane
35            govandi
36                nan
37              kurla
38              thane
39              thane
40            chembur
41              nerul
42            chembur
43              thane
44              thane
45        

In [9]:
le = LabelEncoder()

In [10]:
df[df.Neighbourhood == 'nan']

Unnamed: 0,Neighbourhood,Rating,Brands,Flavor,Packaging,Source,Type,Frequency,Price,Quality,Occasion,New_flavors
22,,8,"Baskin-Robbins, Mother Dairy, Amul, Vadilal","Chocolate, Coffee, Mint",Cone,,,Once a week,8.0,10.0,10,10.0
36,,8,"Baskin-Robbins, Mother Dairy, Dinshaw’s, Havmo...","Chocolate, Butterscotch, Coffee, Pistachio",Cone,"Supermarkets, Ice cream parlour/ restaurants, ...",Unit,3-4 times a week,8.0,9.0,8,9.0
46,,9,Amul,"Chocolate, Butterscotch",Cone,Ice cream parlour/ restaurants,Bulk,Once a week,5.0,10.0,5,5.0
47,,8,"Dinshaw’s, Amul","Chocolate, Butterscotch",Cone,Ice cream parlour/ restaurants,Unit,Once a month,5.0,8.0,3,5.0
48,,6,Baskin-Robbins,"Chocolate, Vanilla, Strawberry, Butterscotch, ...",Cone,Convenience Stores,Unit,Once a month,10.0,10.0,10,10.0
54,,10,"Baskin-Robbins, Amul, Vadilal","Chocolate, Coffee, Pistachio",Cone,Ice cream parlour/ restaurants,Bulk,3-4 times a week,1.0,10.0,5,5.0


In [11]:
df.Neighbourhood = le.fit_transform(df.Neighbourhood.values)

In [12]:
df.Neighbourhood[df.Neighbourhood == 11] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [13]:
df.Neighbourhood

0     14.0
1      2.0
2     14.0
3      2.0
4      2.0
5     15.0
6      2.0
7     15.0
8      2.0
9     15.0
10     2.0
11     2.0
12     2.0
13    16.0
14    15.0
15    14.0
16     7.0
17     8.0
18     2.0
19    15.0
20     2.0
21     2.0
22     NaN
23     7.0
24     3.0
25    15.0
26     1.0
27    17.0
28     4.0
29     5.0
30     2.0
31     5.0
32    15.0
33    10.0
34    14.0
35     6.0
36     NaN
37     9.0
38    14.0
39    14.0
40     2.0
41    12.0
42     2.0
43    14.0
44    14.0
45     7.0
46     NaN
47     NaN
48     NaN
49    14.0
50    13.0
51     4.0
52     2.0
53     0.0
54     NaN
55    15.0
56    14.0
57    15.0
Name: Neighbourhood, dtype: float64

In [14]:
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
df["Neighbourhood"]=imp.fit_transform(df[["Neighbourhood"]])

df.Neighbourhood.isnull().sum()

0

### Splitting multivalued attribues

In [20]:
brand_columns = [brand.split(", ") for brand in df['Brands'] if type(brand) != float] 

In [22]:
brands = list(set(chain.from_iterable(brand_columns)))

In [23]:
brandDict = {}
for brand in brands:
    brandDict[brand] = []

In [24]:
brandDict

{'Amul': [],
 'Baskin-Robbins': [],
 'Creambell': [],
 'Dinshaw’s': [],
 'Gelato Italiano ': [],
 'Havmor': [],
 'Home made': [],
 'Kwality Walls': [],
 'Local Icecream shops': [],
 'London Dairy': [],
 'London Dairy ': [],
 'Mother Dairy': [],
 'Naturals': [],
 'Vadilal': [],
 'any brand ': [],
 'patanjali icecreame': []}

### Label encoding and one hot encoding

## Visualization

### Location wise distribution of flavors and brands

### Area wise dominating factor

### Brand vs like-scale mapping

### Source vs Location

### Brand vs Flavor

### Rating vs (Price, Quality, Occasion and )

## Prediction