# <p style="background-color:#80ccff; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  80px 5px; padding-top:8px; padding-bottom:8px;">Wine Customer Segmentation</p>

In [None]:
import pandas as pd

import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns
# Default figure size for plots that do not request their own.
sns.set(rc={'figure.figsize':(10,6)})
# Enlarge seaborn's font scaling.
sns.set(font_scale=1.3)
# Apply matplotlib's 'fivethirtyeight' style sheet. It runs last, so any
# setting it defines overrides what the seaborn calls above configured.
plt.style.use('fivethirtyeight')

from sklearn.model_selection import train_test_split

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

from sklearn.metrics import accuracy_score

import warnings
# Suppress every warning raised from here on.
# NOTE(review): this also hides deprecation/future warnings emitted by the
# libraries used below, so API removals will only surface as hard errors.
warnings.filterwarnings('ignore')

# <p style="background-color:#80ccff; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  80px 5px; padding-top:8px; padding-bottom:8px;">Input</p>

In [None]:
# Read the wine CSV from the relative '../input' directory into `wine`
# and preview its first five rows.
wine = pd.read_csv(
    filepath_or_buffer='../input/wine-customer-segmentation/Wine.csv',
)
wine.head(n=5)

# <p style="background-color:#80ccff; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  80px 5px; padding-top:8px; padding-bottom:8px;">EDA</p>

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
wine.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the numeric columns.
wine.describe()

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of `wine`.
plt.figure(figsize=(15, 15))

# Diverging colour map so negative and positive correlations get distinct hues.
cmap = sns.diverging_palette(h_neg=0, h_pos=230, s=90, l=60, as_cmap=True)

sns.heatmap(
    data=wine.corr(),
    cmap=cmap,
    annot=True,
    fmt='.2f',
    linewidths=.8,
    square=True,
    cbar_kws={'shrink': .8},
)
plt.show()

In [None]:
# Bar chart of the number of rows in each customer segment, with the count
# written above every bar.
plt.figure(figsize=(12,6))
# Per-segment row counts; not used by the plotting code below.
contagem = wine['Customer_Segment'].value_counts()

ax = sns.countplot(data=wine, x='Customer_Segment')

for bar in ax.patches:
    height = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width() / 2,  # horizontal centre of the bar
        height + 0.9,                       # small gap above the bar top
        f'{height:.0f}',                    # counts are whole numbers; avoid "59.0"
        ha='center',                        # centre the text on the bar instead of starting at its centre
        fontsize=16,
    )

ax.set_xlabel('N° of Segment', fontsize = 15)
ax.set_ylabel('Count', fontsize = 15);

In [None]:
# Histogram with a fitted normal curve for every column of `wine`,
# one panel per column on a 3-column grid.
columns = wine.columns
length = len(columns)

n_cols = 3
# Ceiling division: matplotlib requires an integer row count, and the grid
# must be large enough to hold one panel per column.
n_rows = (length + n_cols - 1) // n_cols

# Build the whole grid in one call so no extra full-figure axes is left
# sitting behind the panels.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20,30))
fig.subplots_adjust(wspace=.2, hspace=.5)

for ax, column in zip(axes.flat, columns):
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases.
    sns.distplot(wine[column], kde=False, fit=stats.norm, color='skyblue', ax=ax)
    ax.set_title(column)

# Hide the trailing panels of the grid that received no column.
for ax in axes.flat[length:]:
    ax.set_visible(False)

In [None]:
# Box plot of each of the first 13 columns of `wine`,
# one panel per column on a 3-column grid.
columns = wine.columns[:13]
length = len(columns)

n_cols = 3
# Ceiling division: matplotlib requires an integer row count, and the grid
# must be large enough to hold one panel per column.
n_rows = (length + n_cols - 1) // n_cols

# Build the whole grid in one call so no extra full-figure axes is left
# sitting behind the panels.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20,30))
fig.subplots_adjust(wspace=.2, hspace=.5)

for ax, column in zip(axes.flat, columns):
    sns.boxplot(y=wine[column], color='skyblue', ax=ax)
    ax.set_title(column)

# Hide the trailing panels of the grid that received no column.
for ax in axes.flat[length:]:
    ax.set_visible(False)

# <p style="background-color:#80ccff; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  80px 5px; padding-top:8px; padding-bottom:8px;">Separating by Segment</p>

In [None]:
# One sub-frame per customer segment: rows whose Customer_Segment is 1, 2 and 3.
wine_1, wine_2, wine_3 = (
    wine[wine['Customer_Segment'] == segment_id] for segment_id in (1, 2, 3)
)

In [None]:
# Segment 1: histogram with a fitted normal curve for each of the first
# 13 columns, one panel per column on a 3-column grid.
columns = wine_1.columns[:13]
length = len(columns)

n_cols = 3
# Ceiling division: matplotlib requires an integer row count, and the grid
# must be large enough to hold one panel per column.
n_rows = (length + n_cols - 1) // n_cols

# Build the whole grid in one call so no extra full-figure axes is left
# sitting behind the panels.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20,30))
fig.subplots_adjust(wspace=.2, hspace=.5)

for ax, column in zip(axes.flat, columns):
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases.
    sns.distplot(wine_1[column], kde=False, fit=stats.norm, color="orange", ax=ax)
    ax.set_title(column)

# Hide the trailing panels of the grid that received no column.
for ax in axes.flat[length:]:
    ax.set_visible(False)

In [None]:
# Segment 1: box plot of each of the first 13 columns,
# one panel per column on a 3-column grid.
columns = wine_1.columns[:13]
length = len(columns)

n_cols = 3
# Ceiling division: matplotlib requires an integer row count, and the grid
# must be large enough to hold one panel per column.
n_rows = (length + n_cols - 1) // n_cols

# Build the whole grid in one call so no extra full-figure axes is left
# sitting behind the panels.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20,30))
fig.subplots_adjust(wspace=.2, hspace=.5)

for ax, column in zip(axes.flat, columns):
    sns.boxplot(y=wine_1[column], color='orange', ax=ax)
    ax.set_title(column)

# Hide the trailing panels of the grid that received no column.
for ax in axes.flat[length:]:
    ax.set_visible(False)

In [None]:
# Segment 2: histogram with a fitted normal curve for each of the first
# 13 columns, one panel per column on a 3-column grid.
columns = wine_2.columns[:13]
length = len(columns)

n_cols = 3
# Ceiling division: matplotlib requires an integer row count, and the grid
# must be large enough to hold one panel per column.
n_rows = (length + n_cols - 1) // n_cols

# Build the whole grid in one call so no extra full-figure axes is left
# sitting behind the panels.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20,30))
fig.subplots_adjust(wspace=.2, hspace=.5)

for ax, column in zip(axes.flat, columns):
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases.
    sns.distplot(wine_2[column], kde=False, fit=stats.norm, color="green", ax=ax)
    ax.set_title(column)

# Hide the trailing panels of the grid that received no column.
for ax in axes.flat[length:]:
    ax.set_visible(False)

In [None]:
# Segment 2: box plot of each of the first 13 columns,
# one panel per column on a 3-column grid.
columns = wine_2.columns[:13]
length = len(columns)

n_cols = 3
# Ceiling division: matplotlib requires an integer row count, and the grid
# must be large enough to hold one panel per column.
n_rows = (length + n_cols - 1) // n_cols

# Build the whole grid in one call so no extra full-figure axes is left
# sitting behind the panels.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20,30))
fig.subplots_adjust(wspace=.2, hspace=.5)

for ax, column in zip(axes.flat, columns):
    sns.boxplot(y=wine_2[column], color='green', ax=ax)
    ax.set_title(column)

# Hide the trailing panels of the grid that received no column.
for ax in axes.flat[length:]:
    ax.set_visible(False)

In [None]:
# Segment 3: histogram with a fitted normal curve for each of the first
# 13 columns, one panel per column on a 3-column grid.
columns = wine_3.columns[:13]
length = len(columns)

n_cols = 3
# Ceiling division: matplotlib requires an integer row count, and the grid
# must be large enough to hold one panel per column.
n_rows = (length + n_cols - 1) // n_cols

# Build the whole grid in one call so no extra full-figure axes is left
# sitting behind the panels.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20,30))
fig.subplots_adjust(wspace=.2, hspace=.5)

for ax, column in zip(axes.flat, columns):
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases.
    sns.distplot(wine_3[column], kde=False, fit=stats.norm, color="purple", ax=ax)
    ax.set_title(column)

# Hide the trailing panels of the grid that received no column.
for ax in axes.flat[length:]:
    ax.set_visible(False)

In [None]:
# Segment 3: box plot of each of the first 13 columns,
# one panel per column on a 3-column grid.
columns = wine_3.columns[:13]
length = len(columns)

n_cols = 3
# Ceiling division: matplotlib requires an integer row count, and the grid
# must be large enough to hold one panel per column.
n_rows = (length + n_cols - 1) // n_cols

# Build the whole grid in one call so no extra full-figure axes is left
# sitting behind the panels.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20,30))
fig.subplots_adjust(wspace=.2, hspace=.5)

for ax, column in zip(axes.flat, columns):
    sns.boxplot(y=wine_3[column], color='purple', ax=ax)
    ax.set_title(column)

# Hide the trailing panels of the grid that received no column.
for ax in axes.flat[length:]:
    ax.set_visible(False)

In [None]:
# Overlay the distributions of the three customer segments for each of the
# first 13 columns, one panel per column on a 3-column grid, so the segments
# can be compared feature by feature.
columns = wine_3.columns[:13]
length = len(columns)

n_cols = 3
# Ceiling division: matplotlib requires an integer row count, and the grid
# must be large enough to hold one panel per column.
n_rows = (length + n_cols - 1) // n_cols

# Build the whole grid in one call so no extra full-figure axes is left
# sitting behind the panels.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20,30))
fig.subplots_adjust(wspace=.2, hspace=.5)

# (frame, colour, legend label) for every segment drawn in each panel.
segments = (
    (wine_1, 'orange', 'Segment 1'),
    (wine_2, 'green', 'Segment 2'),
    (wine_3, 'purple', 'Segment 3'),
)

for ax, column in zip(axes.flat, columns):
    for segment, color, label in segments:
        # Label each plot explicitly so the legend entry is tied to the right
        # artist rather than to whichever artists happen to come first.
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases.
        sns.distplot(x=segment[column], color=color, label=label, ax=ax)
    ax.legend()
    ax.set_title(column)

# Hide the trailing panels of the grid that received no column.
for ax in axes.flat[length:]:
    ax.set_visible(False)

# <p style="background-color:#80ccff; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  80px 5px; padding-top:8px; padding-bottom:8px;">Model</p>

In [None]:
# Features are every column except the label; the target is Customer_Segment.
x = wine.drop(columns=['Customer_Segment'])
y = wine['Customer_Segment']

# Hold out 20% of the rows for testing, stratified on the label so each
# segment keeps its proportion in both parts. The fixed seed makes the
# partition identical on every run, so scores are comparable between runs.
x_train, x_test, y_train , y_test = train_test_split(
    x, y, test_size=.2, stratify=y, random_state=42
)

# Column names of the results table, and the (initially empty) table itself.
colunas = ['Modelo', 'Acuracy']
resultado = pd.DataFrame(columns=colunas)

# Candidate classifiers as (display name, estimator) pairs, each constructed
# with its default arguments.
models = [
    ('GradientBoostingClassifier', GradientBoostingClassifier()),
    ('AdaBoostClassifier', AdaBoostClassifier()),
    ('ExtraTreesClassifier', ExtraTreesClassifier()),
    ('BaggingClassifier', BaggingClassifier()),
    ('RandomForestClassifier', RandomForestClassifier()),
    ('DecisionTreeClassifier', DecisionTreeClassifier()),
    ('ExtraTreeClassifier', ExtraTreeClassifier()),
    ("XGBClassifier", XGBClassifier()),
]

# Fit every candidate model on the training split, score it on the held-out
# split, and collect the accuracies into a table sorted best-first.

# Recent xgboost releases only accept class labels that start at 0, so the
# models are trained on labels shifted by the smallest label and their
# predictions are shifted back before scoring.
# Assumes the segment labels are consecutive integers — TODO confirm.
label_offset = y_train.min()

rows = []
for name, model in models:
    model.fit(x_train, y_train - label_offset)
    y_pred = model.predict(x_test) + label_offset  # back in the original label space
    acc = accuracy_score(y_test, y_pred)
    rows.append([name, acc])

# Build the table once from the collected rows; DataFrame.append no longer
# exists in pandas 2.x, and growing a frame row by row copies it each time.
resultado = pd.DataFrame(rows, columns=colunas)
resultado.sort_values(by=['Acuracy'], ascending=False, inplace=True)
resultado