# Name: Riham Essam
# Track: Data Science and Business Analytics
# Task 3: Exploratory Data Analysis - Retail

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import warnings
from plotnine import *
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors

warnings.filterwarnings('ignore')

In [None]:
# Load the Superstore data from a CSV file located in the working directory
store = pd.read_csv('SampleSuperstore.csv')
# Preview the first rows of the loaded frame
store.head()

In [None]:
# Dimensions of the frame as (rows, columns)
store.shape

In [None]:
# Summary statistics of the frame
store.describe()

In [None]:
# Data type of every column
store.dtypes

In [None]:
# Concise overview of the frame printed by pandas
store.info()

In [None]:
# Column labels of the frame
store.columns

In [None]:
# Number of missing values in each column
store.isnull().sum()

In [None]:
# Count fully duplicated rows
store.duplicated().sum()

In [None]:
# Remove duplicated rows, modifying `store` in place
store.drop_duplicates(inplace = True)

In [None]:
# Count duplicated rows again after dropping them
store.duplicated().sum()

In [None]:
# Show the (rows, columns) shape after dropping duplicates
store.shape

In [None]:
# Count the distinct values in each column
store.nunique()  # Country holds the same value across the whole dataset.

In [None]:
# Drop columns considered unnecessary for the analysis, in place.
# NOTE(review): 'Discount' is removed together with 'Country' and
# 'Postal Code' -- confirm it is really not needed for the profit analysis.
store.drop(['Discount','Country','Postal Code'], axis = 1, inplace = True) 

In [None]:
# Preview the frame after the columns were dropped
store.head()

In [None]:
# (rows, columns) after the columns were dropped
store.shape

In [None]:
# Summary statistics, transposed so each described column becomes a row
store.describe().T

In [None]:
# Pairwise correlation of the numeric columns only. The frame still holds
# text columns (e.g. 'Ship Mode', 'Segment'), and DataFrame.corr() raises on
# them with pandas >= 2.0, so they are filtered out explicitly first.
store.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the correlation between the numeric features
plt.figure(figsize=(6,6))
# Only numeric columns can be correlated; selecting them explicitly keeps
# this cell working with pandas >= 2.0, where corr() no longer silently
# skips text columns.
hm = sns.heatmap(store.select_dtypes(include='number').corr(), annot = True,cmap="YlGnBu")
hm.set(title = "Correlation matrix of supermarket sales data\n")

plt.show()

In [None]:
# Correlation between the numeric features, rendered as a colour-graded table.
# Text columns are excluded first because corr() raises on them with
# pandas >= 2.0.
corr = store.select_dtypes(include='number').corr()
corr.style.background_gradient(cmap='coolwarm')

# Data Visualization

In [None]:
# Bar chart of the number of rows recorded for each state
plt.figure(figsize=(16, 8))
sns.countplot(data=store, x='State')
plt.title("State")
# Rotate the tick labels so the state names stay readable
plt.xticks(rotation=80)
plt.show()

In [None]:
#pie chart to represent percentage of Ship Mode
def pie_chart(dataframe, column='Ship Mode'):
    """Draw a pie chart from a frame of category counts.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose index holds the category labels and which has a column
        holding the count of each category.
    column : str, optional
        Name of the column holding the counts. When the frame has no such
        column, the first column is used instead. pandas >= 2.0 names the
        result of ``value_counts()`` ``'count'`` rather than after the
        original column, so the fallback keeps both versions working.
    """
    labels = dataframe.index.values
    if column in dataframe.columns:
        sizes = dataframe[column].values
    else:
        sizes = dataframe.iloc[:, 0].values

    fig1, ax1 = plt.subplots()
    ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
    ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    plt.show()

# Tabulate how often each ship mode occurs and plot the shares as a pie chart
df_Ship_Mode_count = store['Ship Mode'].value_counts().to_frame()
pie_chart(df_Ship_Mode_count)

In [None]:
#pie chart to represent percentage of Segment
def pie_chart(dataframe, column='Segment'):
    """Draw a pie chart from a frame of category counts.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose index holds the category labels and which has a column
        holding the count of each category.
    column : str, optional
        Name of the column holding the counts. When the frame has no such
        column, the first column is used instead. pandas >= 2.0 names the
        result of ``value_counts()`` ``'count'`` rather than after the
        original column, so the fallback keeps both versions working.
    """
    labels = dataframe.index.values
    if column in dataframe.columns:
        sizes = dataframe[column].values
    else:
        sizes = dataframe.iloc[:, 0].values

    fig1, ax1 = plt.subplots()
    ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
    ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    plt.show()

# Tabulate how often each segment occurs and plot the shares as a pie chart
df_Segment_count = store['Segment'].value_counts().to_frame()
pie_chart(df_Segment_count)

In [None]:
# Preview the first rows of the cleaned frame
store.head()

In [None]:
# Work on a random sample of the dataset to keep the plots below light.
# - random_state pins the sample so the figures are reproducible between runs.
# - the sample size is capped at the number of available rows, because
#   sample() raises when asked for more rows than exist (without replacement).
storeDF = store.sample(n = min(3500, len(store)), random_state = 42)
storeDF.head()

In [None]:
# (rows, columns) of the sampled frame
storeDF.shape

In [None]:
# Region Count
# The data vector is passed by keyword: recent seaborn releases accept only
# `data` positionally, so a bare positional Series is no longer read as x.
sns.countplot(x=store['Region'])
# The axis shows regions (the original label said 'Branch')
plt.xlabel('Region')
plt.ylabel('Count')
plt.title('Which Region is the most busy?')

plt.show()

In [None]:
# Ship Mode Count
# The data vector is passed by keyword: recent seaborn releases accept only
# `data` positionally, so a bare positional Series is no longer read as x.
sns.countplot(x=store['Ship Mode'])

plt.xlabel('Ship Mode')
plt.ylabel('Count')
plt.title('Which Ship Mode is the most busy?')

plt.show()

In [None]:
# Box plot of Profit per Ship Mode, split by Segment.
# factorplot was renamed to catplot and later removed from seaborn, and the
# plotted variables must be given by keyword.
with sns.axes_style(style='ticks'):
    g = sns.catplot(x="Ship Mode", y="Profit", hue="Segment", data=storeDF, kind="box")
    g.set_axis_labels("Ship Mode", "Profit")

In [None]:
# Hexbin joint plot of Profit against Quantity.
# x and y are passed by keyword, as required by recent seaborn releases.
with sns.axes_style('white'):
    sns.jointplot(x="Profit", y="Quantity", data=storeDF, kind='hex')

In [None]:
# Plot the sampled frame with one subplot per plotted column (subplots=True)
storeDF.plot(subplots=True, figsize=(8, 8))

In [None]:
# Pairwise relationships between the numeric columns, coloured by Segment.
# `size` was renamed to `height` in seaborn; the old name is no longer accepted.
sns.pairplot(storeDF, hue='Segment', height=2.5)

In [None]:
# Joint plot of Profit against Quantity with a fitted regression line.
# x and y are passed by keyword, as required by recent seaborn releases.
sns.jointplot(x="Profit", y="Quantity", data=storeDF, kind='reg')

In [None]:
# Violin plot of the Profit distribution within each Segment.
# x and y are passed by keyword, as required by recent seaborn releases.
sns.violinplot(x="Segment", y="Profit", data=storeDF,
               palette=["lightblue", "red","pink"])

In [None]:
# Overlay the Profit density of every ship mode on one set of axes.
# - one loop replaces four copy-pasted calls
# - `fill=True` replaces the deprecated `shade=True`
# - the legend is drawn explicitly so the per-mode labels are visible
for ship_mode in ('Second Class', 'Standard Class', 'First Class', 'Same Day'):
    sns.kdeplot(x=storeDF.Profit[storeDF['Ship Mode'] == ship_mode], label=ship_mode, fill=True)

plt.legend()
plt.xlabel('Profit')

In [None]:
# Horizontal bar chart of Profit per Sub-Category, built layer by layer
ProfitPlt = ggplot(storeDF, aes(x='Sub-Category', y='Profit', fill='Sub-Category'))
ProfitPlt = ProfitPlt + geom_col()
ProfitPlt = ProfitPlt + coord_flip()  # put the sub-categories on the vertical axis
ProfitPlt = ProfitPlt + scale_fill_brewer(type='div', palette="Spectral")
ProfitPlt = ProfitPlt + theme_classic()

display(ProfitPlt)

In [None]:
# Overlay the Sales density of every ship mode on one set of axes.
# - one loop replaces four copy-pasted calls
# - `fill=True` replaces the deprecated `shade=True`
# - the legend is drawn explicitly so the per-mode labels are visible
for ship_mode in ('Second Class', 'Standard Class', 'First Class', 'Same Day'):
    sns.kdeplot(x=storeDF.Sales[storeDF['Ship Mode'] == ship_mode], label=ship_mode, fill=True)

plt.legend()
plt.xlabel('Sales')

In [None]:
# Histogram of Sales. distplot is deprecated in seaborn; histplot is its
# replacement and draws no KDE by default (the original passed kde=False).
sns.histplot(data=storeDF, x='Sales')
# Dashed reference line at zero
plt.axvline(0, color="k", linestyle="--")

In [None]:
# Histogram of Profit. distplot is deprecated in seaborn; histplot is its
# replacement and draws no KDE by default (the original passed kde=False).
sns.histplot(data=storeDF, x='Profit')
# Dashed reference line at zero separates losses from gains
plt.axvline(0, color="k", linestyle="--")

In [None]:
# Create the encoder used to turn category labels into integer codes
labelEncoder = LabelEncoder()
# Categorical columns to transform into numbers
columnNames = ['Ship Mode','Segment','City','State','Region','Category','Sub-Category']
# Loop over the columns to encode directly instead of scanning every frame
# column against the list by index. The encoder is re-fitted for each
# column, so after the loop it only remembers the classes of the last one.
for colName in columnNames:
    if colName in storeDF.columns:
        storeDF[colName] = labelEncoder.fit_transform(storeDF[colName])
storeDF.head()

In [None]:
# Distributions of the encoded Segment and State codes on shared axes.
# distplot is deprecated in seaborn; histplot with a density-normalised
# histogram plus KDE reproduces what distplot drew by default.
sns.histplot(data=storeDF, x='Segment', kde=True, stat='density')
sns.histplot(data=storeDF, x='State', kde=True, stat='density')

In [None]:
# sns.jointplot shows the joint distribution and the marginal distributions
# together. The frame and the two column names are passed by keyword, as
# required by recent seaborn releases.
sns.jointplot(data=storeDF, x='Segment', y='State', kind='kde')

In [None]:
# Kernel density estimate of the City column
sns.kdeplot(storeDF['City'])

In [None]:
# Bivariate kernel density estimate of State against Region.
# Both vectors are passed by keyword: recent seaborn releases accept only
# `data` positionally, so two positional vectors raise a TypeError.
sns.kdeplot(x=storeDF['State'], y=storeDF['Region'])

# Model

In [None]:
# Nearest-neighbours distance curve
neighb = NearestNeighbors(n_neighbors=3)
nbrs = neighb.fit(storeDF)
# Query the fitted model with the same rows it was fitted on
distances, indices = nbrs.kneighbors(storeDF)
# Sort every distance column in ascending order and keep the second column
distances = np.sort(distances, axis=0)[:, 1]
# Plot the sorted distances on a small figure
plt.rcParams['figure.figsize'] = (5, 3)
plt.plot(distances)
plt.show()