# Introduction
Using the mushroom dataset, I explore and visualize the features that help classify a mushroom as edible or poisonous.

In this kernel (07-01-2018),

 - Explore the dataset
 - Visualize feature
 - k nearest neighbors


# Simply Explore Mushrooms

In [None]:
%matplotlib inline
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from pylab import rcParams
# Default size (width, height) for every figure created after this point.
rcParams['figure.figsize'] = 10, 15
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
# Silence all warnings for the rest of the session; note that this also hides
# deprecation and pandas copy warnings raised by later cells.
warnings.filterwarnings('ignore')


In [None]:
# Load the raw mushroom table from a path relative to the notebook.
# NOTE(review): '../input' looks like the Kaggle kernel layout — confirm the
# location when running the notebook elsewhere.
mushrooms = pd.read_csv('../input/mushrooms.csv')

## Attribute Information: 

**classes** : edible=e, poisonous=p <br>
**cap-shape** : bell=b,conical=c, convex=x, flat=f, knobbed=k, sunken=s <br>
**cap-surface** : fibrous=f, grooves=g,scaly=y, smooth=s <br>
**cap-color** : brown=n, buff=b, cinnamon=c, gray=g, green=r, pink=p, purple=u, red=e, white=w, yellow=y <br>
**bruises** : bruises=t, no=f <br>
**odor** : almond=a, anise=l,creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s<br>
**gill-attachment** : attached=a, descending=d, free=f, notched=n<br>
**gill-spacing** : close=c, crowded=w, distant=d<br>
**gill-size** : broad=b, narrow=n<br>
**gill-color** : black=k, brown=n, buff=b, chocolate=h, gray=g, green=r, orange=o, pink=p, purple=u, red=e, white=w, yellow=y<br>
**stalk-shape** : enlarging=e, tapering=t<br>
**stalk-root**: bulbous=b, club=c, cup=u, equal=e, rhizomorphs=z, rooted=r, missing=?<br>
**stalk-surface-above-ring** : fibrous=f, scaly=y, silky=k, smooth=s<br>
**stalk-surface-below-ring** : fibrous=f, scaly=y, silky=k, smooth=s<br>
**stalk-color-above-ring** : brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y<br>
**stalk-color-below-ring** : brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y<br>
**veil-type** : partial=p, universal=u<br>
**veil-color** : brown=n, orange=o, white=w, yellow=y<br>
**ring-number** : none=n, one=o, two=t<br>
**ring-type** : cobwebby=c, evanescent=e, flaring=f, large=l, none=n, pendant=p, sheathing=s, zone=z<br>
**spore-print-color** : black=k, brown=n, buff=b, chocolate=h, green=r, orange=o, purple=u, white=w, yellow=y<br>
**population** : abundant=a, clustered=c, numerous=n, scattered=s, several=v, solitary=y<br>
**habitat** : grasses=g, leaves=l, meadows=m, paths=p, urban=u, waste=w, woods=d<br>

In [None]:
# Preview the first rows to see the single-letter category codes in each column.
mushrooms.head()

## Exploratory Data Analysis

Get to know more about "**mushrooms**"

In [None]:
# Column names, dtypes and non-null counts of the raw table.
mushrooms.info()

In [None]:
# Per-column summary statistics of the raw table.
mushrooms.describe()

In [None]:
# Number of null entries in each column.
mushrooms.isnull().sum()

### "mushrooms" :
 
 
 * **8,124 rows (mushrooms)**<br>
 * **23 columns per row (the class label plus 22 features)**<br>
 * **No column contains a null value** (missing stalk-root values are stored as `?`, not as null)
                

Every column holds categorical letter codes, so we apply **label encoding** (one integer per category)<br> to be able to compute statistics and draw plots on **mushrooms**.


In [None]:
# Integer-encode the table column by column. A fresh LabelEncoder is fitted for
# every column, so the resulting codes are only meaningful within a column,
# never across columns.
Labledmushrooms = mushrooms.apply(
    lambda column: LabelEncoder().fit_transform(column)
)

Labledmushrooms.head()

## Everything's ready it's time to learn from "mushrooms"

### Start with correlation between features 

    How each feature relates to the others

In [None]:
# Pairwise correlation of the integer-encoded columns; computed up front
# because it does not depend on the plotting style.
feature_corr = Labledmushrooms.corr()

# Correlation map, drawn inside matplotlib's xkcd style context.
with plt.xkcd():
    f, ax = plt.subplots(figsize=(18, 18))
    g = sns.heatmap(
        feature_corr,
        annot=True,
        fmt='.1f',
        linewidths=.5,
        cmap="coolwarm",
        ax=ax,
    )

#### From correlation heatmap
**These are the 5 features most strongly correlated with "class":**
- bruises
- gill-color
- stalk-root
- ring-type
- gill-size

In [None]:
# Distinct raw 'bruises' codes in sorted order.
sorted(mushrooms['bruises'].unique())

## bruises :
**0** = not-bruise<br>
**1** = bruise<br>

In [None]:
# Bar plot of the encoded "class" column per bruises value; the y axis is
# labelled as the poisonous probability of each group.
g = (
    sns.factorplot(
        x="bruises",
        y="class",
        data=Labledmushrooms,
        kind="bar",
        size=6,
        palette=["lavender", "darkslateblue"],
        legend=True,
    )
    .despine(left=True)
    # Swap the integer codes on the x axis for readable names.
    .set(xticks=range(0, 2), xticklabels=["not-bruise", "bruise"])
    .set_ylabels("poisonous probability")
)

**From this plot, we can conclude that mushrooms that do not bruise are much more likely to be poisonous**

In [None]:
# Distinct raw 'gill-color' codes in sorted order.
sorted(mushrooms['gill-color'].unique())

## gill-color :
**0** = buff<br>
**1** = red<br>
             **2** = gray<br>
             **3** = chocolate<br>
             **4** = black<br>
             **5** = brown<br>
             **6** = orange<br>
             **7** = pink<br>
             **8** = green<br>
             **9** = purple<br>
             **10** = white<br>
             **11** = yellow 

In [None]:
# (gill colour name, plot colour) pairs in label-encoded order
# (0 = buff ... 11 = yellow). Keeping each name next to its plot colour
# prevents the two derived lists from drifting out of step.
gill_styles = [
    ("buff", "khaki"),
    ("red", "Red"),
    ("gray", "lightGrey"),
    ("chocolate", "chocolate"),
    ("black", "Black"),
    ("brown", "saddleBrown"),
    ("orange", "orange"),
    ("pink", "lightpink"),
    ("green", "limegreen"),
    ("purple", "orchid"),
    ("white", "whitesmoke"),
    ("yellow", "Yellow"),
]
gillname = [name for name, _ in gill_styles]
gillcolor = [colour for _, colour in gill_styles]

In [None]:
# Frequency of each raw gill-color code, sorted by count.
counts = mushrooms['gill-color'].value_counts(sort = True)
counts

**Start by looking at how the gill colors are distributed in our dataset**

In [None]:

# Raw gill-color code -> (display name, slice colour), following the attribute
# list at the top of the notebook.
gill_code_style = {
    "b": ("buff", "khaki"),
    "p": ("pink", "pink"),
    "w": ("white", "whitesmoke"),
    "n": ("brown", "saddlebrown"),
    "g": ("gray", "lightgrey"),
    "h": ("chocolate", "chocolate"),
    "u": ("purple", "orchid"),
    "k": ("black", "black"),
    "e": ("red", "red"),
    "y": ("yellow", "yellow"),
    "o": ("orange", "orange"),
    "r": ("green", "limegreen"),
}

sizes = counts
# Look the label and colour of every slice up from the code stored in the
# index, instead of relying on a hand-written list that merely happens to match
# the frequency order. An unknown code raises KeyError rather than silently
# mislabelling a slice.
labels = [gill_code_style[code][0] for code in sizes.index]
colors = [gill_code_style[code][1] for code in sizes.index]
# Pull the three largest slices out a little; all remaining slices stay in place.
explode = tuple(([0.1, 0.05, 0.05] + [0] * len(sizes))[:len(sizes)])

# Plot
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140,)
plt.axis('equal')
plt.figtext(.5,.8,'Mushroom gill-color',fontsize=30,ha='center')
plt.show()


**Top 3 colors appear in our dataset**
- buff
- pink
- white<br>

**Bottom 3 colors appear in our dataset**
- green
- orange
- yellow


### Let's see, Poisonous probability VS gill-color


In [None]:
# Bar plot of the encoded "class" column per encoded gill colour, one bar per
# colour, painted with the matching entry of gillcolor.
g = (
    sns.factorplot(
        x="gill-color",
        y="class",
        data=Labledmushrooms,
        kind="bar",
        size=10,
        palette=gillcolor,
    )
    .despine(left=True)
    .set_xticklabels(rotation=30)
    # Replace the integer codes on the x axis with the colour names.
    .set(xticks=range(0, 12), xticklabels=gillname)
    .set_ylabels("poisonous probability")
)

**The most dangerous gill color (buff) is also the most common one in our dataset!**<br>

    I am not surprised that black and brown gills are mostly safe,
    but shouldn't a red-gilled mushroom be dangerous?


In [None]:
# Row count of red-gilled ('e') mushrooms: count() gives the non-null count of
# every column and unique() collapses those per-column counts to their distinct values.
mushrooms[mushrooms['gill-color'] == 'e'].count().unique()

In [None]:
# Preview the first red-gilled ('e') rows, including their class label.
mushrooms[mushrooms['gill-color'] == 'e'].head()

**Okay, I give up**<br>

    The 96 red-gilled mushrooms in our dataset are labelled edible

### Let's Look between bruises and gill-color

In [None]:
# Count of mushrooms per encoded gill colour, one panel per bruises value.
g = (
    sns.factorplot(
        x="gill-color",
        col="bruises",
        data=Labledmushrooms,
        kind="count",
        size=6,
        palette=gillcolor,
    )
    .despine(left=True)
    # Name the ticks first, then tilt the labels.
    .set(xticks=range(0, 12), xticklabels=gillname)
    .set_xticklabels(rotation=30)
    .set_ylabels("Count")
)

**Mushrooms with the safer gill colors mostly bruise,
<br> while mushrooms with the dangerous gill colors mostly do not bruise**

    We can conclude that bruises and gill-color are important features for categorizing mushrooms

In [None]:
# Distinct raw 'stalk-root' codes in sorted order.
sorted(mushrooms['stalk-root'].unique())

## stalk-root :
**0** = missing<br>
**1** = bulbous<br>
             **2** = club<br>
             **3** = equal<br>
             **4** = rooted<br>

In [None]:
# Tick labels in label-encoded order (0 = missing ... 4 = rooted).
stalk_root_names = ["missing", "bulbous", "club", "equal", "rooted"]

# Bar plot of the encoded "class" column per stalk-root category.
g = sns.factorplot(
    x="stalk-root",
    y="class",
    data=Labledmushrooms,
    kind="bar",
    size=6,
    palette="cubehelix",
)
g.despine(left=True)
g.set(xticks=range(len(stalk_root_names)), xticklabels=stalk_root_names)
g = g.set_ylabels("poisonous probability")

**We found that mushrooms with a missing stalk-root value have a high chance of being poisonous,<br>
while rooted mushrooms have no chance of being poisonous in this dataset**


In [None]:
# Distinct raw 'ring-type' codes in sorted order.
sorted(mushrooms['ring-type'].unique())

## ring-type  :
**0** = evanescent<br>
             **1** = flaring<br>
             **2** = large<br>
             **3** = none<br>
             **4** = pendant<br>

In [None]:
# Tick labels in label-encoded order (0 = evanescent ... 4 = pendant).
ring_type_names = ["evanescent", "flaring", "large", "none", "pendant"]

# Bar plot of the encoded "class" column per ring-type category.
ring_plot_options = dict(
    x="ring-type",
    y="class",
    data=Labledmushrooms,
    kind="bar",
    size=6,
    palette="BrBG",
)
g = sns.factorplot(**ring_plot_options)
g.despine(left=True)
g.set(xticks=range(len(ring_type_names)), xticklabels=ring_type_names)
g = g.set_ylabels("poisonous probability")

**100%** poisonous probability? <br> 
    
    Okay, we should not eat mushrooms whose ring-type is large or none

### Let's Look between bruises and ring-type

In [None]:
# Count of mushrooms per encoded ring type, one panel per bruises value.
g = (
    sns.factorplot(
        x="ring-type",
        col="bruises",
        data=Labledmushrooms,
        kind="count",
        size=6,
        palette="muted",
    )
    .despine(left=True)
    .set(
        xticks=range(0, 5),
        xticklabels=["evanescent", "flaring", "large", "none", "pendant"],
    )
    .set_ylabels("Count")
)

**Most bruising mushrooms have a pendant ring, which is a ring-type with a very low poisonous probability<br>**

    "bruises" is still one of the main features for categorizing mushrooms

In [None]:
# Distinct raw 'gill-size' codes in sorted order.
sorted(mushrooms['gill-size'].unique())

## gill-size  :
**0** = broad<br>
**1** =  narrow<br>

In [None]:
# Number of mushrooms for each encoded gill size.
g = (
    sns.factorplot(
        x="gill-size",
        data=Labledmushrooms,
        kind="count",
        size=6,
        palette="BrBG",
    )
    .despine(left=True)
    # Name the two ticks first, then tilt the labels.
    .set(xticks=range(0, 2), xticklabels=["broad", "narrow"])
    .set_xticklabels(rotation=30)
    .set_ylabels("Count")
)

around **75 percent** of our dataset is a broad gill mushroom

In [None]:
# Tick labels in label-encoded order (0 = broad, 1 = narrow).
gill_size_names = ["broad", "narrow"]

# Bar plot of the encoded "class" column per gill size.
g = sns.factorplot(
    x="gill-size",
    y="class",
    data=Labledmushrooms,
    kind="bar",
    size=6,
    palette=["thistle", "darkviolet"],
)
g.despine(left=True)
g.set(xticks=range(len(gill_size_names)), xticklabels=gill_size_names)
g = g.set_ylabels("poisonous probability")

**Similar to "bruises", gill-size may be one of the important features for categorizing mushrooms<br>**

    Although 75% of our data are broad-gill mushrooms, they still have a low poisonous probability,
       while narrow-gill mushrooms have a very high chance of being poisonous

#  Make a prediction

In [None]:
# Encoded class value -> (display name, slice colour). LabelEncoder orders the
# raw codes alphabetically, so 'e' (edible) is 0 and 'p' (poisonous) is 1.
class_style = {
    0: ("edible", "peru"),
    1: ("poisonous", "mediumorchid"),
}

sizes = Labledmushrooms['class'].value_counts(sort = True)
# Look labels and colours up by class value instead of assuming which class is
# the more frequent one; a hard-coded list would silently swap the two names if
# the counts were ordered the other way round.
labels = [class_style[code][0] for code in sizes.index]
colors = [class_style[code][1] for code in sizes.index]
explode = (0.1,0)  # pull the first (largest) slice out slightly

# Plot
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140,)
# Only figures created after this line pick up the new default size; the pie
# above has already been created with the previous setting.
rcParams['figure.figsize'] = 13,10
plt.axis('equal')
plt.figtext(.5,.9,'Mushrooms',fontsize=30,ha='center')
plt.show()

In [None]:
# Fixed seed so that the random train/test assignment (and therefore every
# accuracy reported below) is identical on each Restart & Run All.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

# One uniform draw per row; rows whose draw is <= 0.75 are flagged as training
# rows, so the split is about 75% / 25% rather than exactly that.
Labledmushrooms['is_train'] = split_rng.uniform(0, 1, len(Labledmushrooms)) <= .75

# View the top 5 rows
Labledmushrooms.head()

In [None]:
# Create two new dataframes, one with the training rows, one with the test rows.
# .copy() makes them independent frames, so the column assignments below do not
# write into a slice of Labledmushrooms.
is_train_mask = Labledmushrooms['is_train'] == True
train = Labledmushrooms[is_train_mask].copy()
test = Labledmushrooms[~is_train_mask].copy()

# Use the same integer dtype for the target in both splits.
train["class"] = train["class"].astype(int)
test["class"] = test["class"].astype(int)

# 'is_train' is split bookkeeping, not a mushroom attribute, so it is kept out
# of the feature matrices together with the target column.
non_feature_cols = ["class", "is_train"]

Y_train = train["class"]
X_train = train.drop(labels = non_feature_cols, axis = 1)
Y_test = test["class"]
X_test = test.drop(labels = non_feature_cols, axis = 1)

**I split our mushrooms into 2 parts**
<br> *roughly 75% for the training set*
<br> *and roughly 25% for the test set*

In [None]:
# Number of rows that ended up in the training split.
print(len(train))

In [None]:
# Number of rows that ended up in the test split.
print(len(test))

## k-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbourhood sizes: n_neighbors from 1 to 10.
n_range = range(1, 11)
train_acc, test_acc = [], []

# Fit one classifier per neighbourhood size and record its accuracy on the
# training rows and on the held-out test rows.
for neighbors in n_range:
    clf = KNeighborsClassifier(n_neighbors=neighbors).fit(X_train, Y_train)
    train_acc += [clf.score(X_train, Y_train)]
    test_acc += [clf.score(X_test, Y_test)]

# One line per accuracy series, both against the number of neighbours.
for scores, legend_label in (
    (train_acc, "training accuracy"),
    (test_acc, "test accuracy"),
):
    plt.plot(n_range, scores, label=legend_label)

plt.xlabel("neighbors")
plt.ylabel("Accuracy")
plt.legend()


**Thank you for reading until the end : )**

    I will try to publish an updated version.
    Please vote or comment if you like it ^_^
    If you have any suggestions, let me know in the comments.