# Initialization!

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from matplotlib import rc
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

import warnings
# Silence every warning raised for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation warnings coming from
# pandas / scikit-learn -- consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Read the survey export; the fields are tab-delimited even though the file is
# named ".csv".
data = pd.read_csv(
    "color_survey_answers.csv",
    delimiter="\t",
)
# Preview the first rows.
data.head()

Unnamed: 0,id,user_id,datestamp,r,g,b,colorname
0,1,1,1267419000.0,72,100,175,pastel blue
1,2,1,1267419000.0,204,177,246,faint violet
2,3,1,1267419000.0,182,226,245,baby blue
3,4,1,1267419000.0,130,64,234,purple
4,5,2,1267419000.0,75,49,234,blue


In [3]:
# Display the column labels of the loaded frame.
data.columns

Index(['id', 'user_id', 'datestamp', 'r', 'g', 'b', 'colorname'], dtype='object')

In [4]:
# Print the frame summary: number of entries, per-column dtype and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3363127 entries, 0 to 3363126
Data columns (total 7 columns):
id           int64
user_id      int64
datestamp    float64
r            int64
g            int64
b            int64
colorname    object
dtypes: float64(1), int64(5), object(1)
memory usage: 179.6+ MB


In [5]:
from datetime import datetime

# Interpret the numeric `datestamp` values as seconds since the Unix epoch and
# store them back as pandas datetimes.
data['datestamp'] = pd.to_datetime(data['datestamp'], unit='s')

# Count the missing cells over the whole frame (per-column totals, then summed).
print(data.isna().sum().sum())

1734


In [6]:
# Fill missing color names with the literal "green".
# NOTE(review): this turns every unlabeled answer into a "green" label; confirm
# that this is intended rather than dropping the unlabeled rows.
data = data.fillna({"colorname": "green"})

# Extraction to `new`: collect the rows of a few hand-picked color names.
# DataFrame.append was removed in pandas 2.0, so the pieces are combined with a
# single pd.concat. The list order below reproduces the row order produced by
# the original chain of prepending appends, and the original index is kept.
picked_colornames = ['babypoo', 'medium tan leather', 'caribbean surf', 'impatiens juice']
new = pd.concat(
    [data[data['colorname'] == name] for name in picked_colornames]
)

In [7]:
# Summarize the extracted rows (entry count and dtypes).
new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8 entries, 185291 to 3170468
Data columns (total 7 columns):
id           8 non-null int64
user_id      8 non-null int64
datestamp    8 non-null datetime64[ns]
r            8 non-null int64
g            8 non-null int64
b            8 non-null int64
colorname    8 non-null object
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 512.0+ bytes


In [8]:
# More occurrences: stack 21 copies of `new` (the frame itself plus 20 repeats).
# DataFrame.append was removed in pandas 2.0; pd.concat on a list of frames
# yields the same rows, in the same order, with the same index labels.
# NOTE: this cell reassigns `new`, so running it again multiplies the rows again.
new = pd.concat([new] * 21)
new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168 entries, 185291 to 3170468
Data columns (total 7 columns):
id           168 non-null int64
user_id      168 non-null int64
datestamp    168 non-null datetime64[ns]
r            168 non-null int64
g            168 non-null int64
b            168 non-null int64
colorname    168 non-null object
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 10.5+ KB


In [9]:
# New color set: add the replicated rows to the survey data, then shuffle all
# rows with a fixed seed and renumber the index from 0.
data = (
    pd.concat([data, new], ignore_index=True)
    .sample(frac=1, random_state=10)
    .reset_index(drop=True)
)

In [10]:
# Keep only the rows whose color name is among the N most frequent names.
N = 100

# value_counts() returns the names sorted by descending frequency.
m = data['colorname'].value_counts()
n = dict(m)  # name -> count lookup (kept so the name `n` stays available)

# Take the ranking straight from the value_counts index. Going through
# dict(...).keys() only preserves that order on Python >= 3.7; on older
# interpreters the first N keys are an arbitrary subset of names.
p = list(m.index)

sub_data = data[data['colorname'].isin(p[:N])]
print(sub_data.shape)

(526, 7)


# One-hot encoding

In [11]:
# Replace `colorname` with one indicator column per distinct name
# (the new columns are named "colorname_<name>"); the other columns are kept.
demo_data = pd.get_dummies(
    data=sub_data,
    columns=["colorname"],
)
print(demo_data.shape)

(526, 106)


In [12]:
# Preview the first rows of the one-hot encoded frame.
demo_data.head()

Unnamed: 0,id,user_id,datestamp,r,g,b,colorname_ taupe,colorname_#1040cc,colorname_#3e83b8,colorname_#cc00aa,...,colorname_tickle me dead,colorname_torn licorice,colorname_tosca green,colorname_uggh,colorname_undernail,colorname_understated antagonism,colorname_undisclosed green,colorname_vegan,colorname_verde scuro,colorname_wicked
2346,2916536,130335,2010-03-26 13:27:35,205,221,195,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5704,1923396,87255,2010-03-13 13:58:04,108,45,185,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11174,3037270,135373,2010-03-28 22:24:35,79,217,101,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34760,797841,36734,2010-03-04 00:42:16,175,206,77,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34859,387591,18512,2010-03-02 14:08:01,146,240,131,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Features and Targets
# Column positions 3-5 hold r, g, b; every column after them is a
# colorname_* indicator produced by get_dummies.
rgb_columns = demo_data.columns[3:6]
indicator_columns = demo_data.columns[6:]

X = demo_data[rgb_columns].values
y = demo_data[indicator_columns].values

In [14]:
# Train and Test: hold out 33% of the rows; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [15]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree with default settings (seeded) on the training split;
# fit() returns the estimator itself.
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Score the held-out split first, then the training split.
test_accuracy = clf.score(X_test, y_test)
train_accuracy = clf.score(X_train, y_train)
print("Test_Accuracy: {:.2f}".format(test_accuracy))
print("Train_Accuracy: {:.2f}".format(train_accuracy))

Test_Accuracy: 0.60
Train_Accuracy: 1.00


In [16]:
# Predict the color name for a single RGB triple.
k = np.array([100, 220, 236]).reshape(1, -1)        # one sample, three features
color_list = list(demo_data.columns[6:].values)      # indicator column names, in target order
pred = clf.predict(k)

# Position of the first indicator predicted as 1. The classifier predicts each
# indicator separately, so a row with no 1 at all is possible; list.index(1)
# would raise ValueError in that case, so fall back to None instead.
hits = np.flatnonzero(pred[0])
idx = int(hits[0]) if hits.size else None

# Decode the position into its column name and show it as the cell result.
predicted_color = color_list[idx] if idx is not None else None
predicted_color

# Custom binary encoding

In [17]:
# Add one 0/1 column per base color: 1 when the color name contains the word.
# NOTE(review): str.contains matches substrings (and treats the pattern as a
# regex by default), so a flag can fire inside a longer word, and one name can
# set several flags at once.
for base_color in ('green', 'red', 'pink', 'blue', 'orange', 'yellow', 'purple', 'tan'):
    contains_color = data['colorname'].str.contains(base_color)
    data['color_' + base_color] = np.where(contains_color, 1, 0)

In [18]:
# Preview the first rows, now including the color_* flag columns.
data.head()

Unnamed: 0,id,user_id,datestamp,r,g,b,colorname,color_green,color_red,color_pink,color_blue,color_orange,color_yellow,color_purple,color_tan
0,2102379,95339,2010-03-15 21:30:48,99,23,111,purple,0,0,0,0,0,0,1,0
1,4130,166,2010-03-01 06:31:11,191,81,74,salmony-pink,0,0,1,0,0,0,0,0
2,2644842,118740,2010-03-22 18:33:06,198,128,109,red tan,0,1,0,0,0,0,0,1
3,2405676,108250,2010-03-19 06:33:03,55,34,6,black,0,0,0,0,0,0,0,0
4,2384434,107334,2010-03-19 02:44:38,166,132,201,joy''s headband,0,0,0,0,0,0,0,0


In [19]:
# Display the column labels after the color_* flags were added.
data.columns

Index(['id', 'user_id', 'datestamp', 'r', 'g', 'b', 'colorname', 'color_green',
       'color_red', 'color_pink', 'color_blue', 'color_orange', 'color_yellow',
       'color_purple', 'color_tan'],
      dtype='object')

In [20]:
# Keep only the rows whose color name is among the N most frequent names.
N = 100

# value_counts() returns the names sorted by descending frequency.
m = data['colorname'].value_counts()
n = dict(m)  # name -> count lookup (kept so the name `n` stays available)

# Take the ranking straight from the value_counts index. Going through
# dict(...).keys() only preserves that order on Python >= 3.7; on older
# interpreters the first N keys are an arbitrary subset of names.
p = list(m.index)

data_custom = data[data['colorname'].isin(p[:N])]

In [21]:
# Features and Target
# Column positions 3-5 hold r, g, b; the columns from position 7 onward are the
# color_* flags that follow `colorname`.
rgb_columns = data_custom.columns[3:6]
flag_columns = data_custom.columns[7:]
X = data_custom[rgb_columns].values
y = data_custom[flag_columns].values

# Train Test: 33% hold-out with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

# Fit a seeded decision tree; fit() returns the estimator, which is then shown
# as the cell result.
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [22]:
# Score the held-out split first, then the training split.
held_out_accuracy = clf.score(X_test, y_test)
fit_accuracy = clf.score(X_train, y_train)
print("Test_Accuracy: {:.2f}".format(held_out_accuracy))
print("Train_Accuracy: {:.2f}".format(fit_accuracy))

Test_Accuracy: 0.74
Train_Accuracy: 1.00


In [24]:
# Predict the color flags for a single RGB triple.
k = np.array([102, 226, 245]).reshape(1, -1)          # one sample, three features
colorlist = list(data_custom.columns[7:].values)       # flag column names, in target order
prediction = clf.predict(k)

# Decode the 0/1 row into the names of the flags that were predicted as 1
# (an empty list means no flag was predicted) and show it as the cell result.
predicted_flags = [name for name, flag in zip(colorlist, prediction[0]) if flag == 1]
predicted_flags

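# Accuracy
- One-hot encoding: 0.60 (test accuracy)
- Custom binary encoding: 0.74 (test accuracy)

Note: the two scores are not directly comparable. The one-hot model predicts 100 color-name indicator columns, while the custom model predicts 8 base-color flags.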