CodeClause : Data Science
Name : Riham Essam
Project Name : Sales Analysis 

In [None]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from scipy.stats import norm
import warnings 
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
import scipy.cluster.hierarchy as shc
from scipy.cluster.hierarchy import dendrogram,linkage
warnings.filterwarnings('ignore')

In [None]:
# Read the sales records from the CSV file into the notebook-wide frame `SA`
# and preview the first rows.
SA = pd.read_csv("SalesAnalysis.csv")
SA.head()

In [None]:
# Display the column labels of the loaded frame.
SA.columns

In [None]:
# Display the dtype pandas inferred for each column.
SA.dtypes

In [None]:
# Discard the columns that the rest of the analysis does not use.
unused_columns = ['Invoice ID', 'Time', 'gross margin percentage']
SA = SA.drop(columns=unused_columns)
SA.head()

In [None]:
SA.isna().sum()  # number of missing values in each column

In [None]:
# Count rows that are exact duplicates of an earlier row.
SA.duplicated().sum()

In [None]:
# Print the per-column non-null counts, dtypes and memory usage summary.
SA.info()

In [None]:
# Pairwise correlation of the numeric columns only. At this point SA still
# holds text columns, and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0, so the numeric subset is selected explicitly.
SA.select_dtypes(include='number').corr()

In [None]:
# Summary statistics of the frame's columns.
SA.describe()

In [None]:
# (rows, columns) of the frame after the column drop above.
SA.shape

In [None]:
# Feature scaling: normalise the selected numeric columns into the range 0..1.
# MinMaxScaler rescales every column independently, so transforming them in a
# single call yields the same values as fitting one column at a time.
stand = MinMaxScaler()
colList = ['Quantity', 'Tax 5%', 'Unit price', 'Total', 'gross income']
# Only touch the columns that actually exist in SA (missing ones are skipped).
cols_to_scale = [colName for colName in colList if colName in SA.columns]
if cols_to_scale:
    SA[cols_to_scale] = stand.fit_transform(SA[cols_to_scale])
# NOTE(review): 'cogs' and 'Rating' are left on their original scale - confirm
# this is intended, since the clustering cells later use every column.

SA.head()

Visualization

In [None]:
# Heatmap of the correlation between the numeric features.
# SA still contains text columns here; DataFrame.corr() raises on them in
# pandas >= 2.0, so the correlation is computed on the numeric subset only.
numeric_corr = SA.select_dtypes(include='number').corr()
plt.figure(figsize=(12,12))
hm = sns.heatmap(numeric_corr, annot = True,cmap="YlGnBu")
hm.set(title = "Correlation matrix of supermarket sales data\n")

plt.show()

In [None]:
# Correlation between the numeric features, rendered as a colour-graded table.
# Non-numeric columns are excluded explicitly because DataFrame.corr() raises
# on them in pandas >= 2.0.
corr = SA.select_dtypes(include='number').corr()
corr.style.background_gradient(cmap='coolwarm')

# Observations recorded by the author:
# - 'Tax 5%', 'gross income' and 'cogs' (cost of goods sold) are perfectly
#   correlated with one another (corr = 1).
# - Each of those three has a correlation of about 0.71 with 'Quantity'.

In [None]:
# Scatter of Quantity against cogs with a fitted regression line.
plt.figure(figsize=(18, 6), dpi=100)
sns.set(style='whitegrid')  # set before the axes are created so the grid applies
ax = sns.regplot(data=SA, x='Quantity', y='cogs', color='blue')
ax.set_xlabel('Quantity')
ax.set_ylabel('Cost of Goods Sale')
ax.set_title('Quantity v Cost of Goods Sale', fontsize=18)
plt.show()

In [None]:
# Scatter of Unit price against gross income: black points, red regression line.
point_style = {"color": "black"}
line_style = {"color": "red"}
plt.figure(figsize=(18, 6), dpi=100)
ax = sns.regplot(data=SA, x="Unit price", y="gross income",
                 scatter_kws=point_style, line_kws=line_style)
ax.set_xlabel('Unit Price')
ax.set_ylabel('Gross Income')
ax.set_title('Unit Price vs Gross Income', fontsize=18)
plt.show()

In [None]:
# Pie chart showing each category's share of the rows (used below for Gender).
def pie_chart(dataframe):
    """Draw a pie chart from a frame of category counts.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Its index supplies the wedge labels. The wedge sizes are read from the
        'Gender' column when present, otherwise from the first column.
    """
    labels = dataframe.index.values
    # A frame built from Series.value_counts() names its single column after
    # the source column on older pandas but 'count' on pandas >= 2.0, so fall
    # back to the first column instead of failing with a KeyError.
    if 'Gender' in dataframe.columns:
        sizes = dataframe['Gender'].values
    else:
        sizes = dataframe.iloc[:, 0].values

    fig1, ax1 = plt.subplots()
    ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
    ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    plt.show()

# Despite its name, this frame holds the number of rows per Gender value.
df_city_count = SA['Gender'].value_counts().to_frame()
pie_chart(df_city_count)

In [None]:
# Number of transactions per payment method.
# Column and frame are passed by keyword: recent seaborn releases treat the
# first positional argument as `data`, so a bare Series is no longer read as x.
sns.countplot(x='Payment', data=SA)
plt.xlabel('Payment Method')
plt.ylabel('Count')
plt.title('Which Payment Method is most used?')

plt.show()

In [None]:
# Number of transactions per branch.
# Column and frame are passed by keyword: recent seaborn releases treat the
# first positional argument as `data`, so a bare Series is no longer read as x.
sns.countplot(x='Branch', data=SA)
plt.xlabel('Branch')
plt.ylabel('Count')
plt.title('Which Branch is the most busy?')

plt.show()

In [None]:
# Number of transactions per city, plotted and printed.
# Column and frame are passed by keyword: recent seaborn releases treat the
# first positional argument as `data`, so a bare Series is no longer read as x.
sns.countplot(x='City', data=SA)
plt.xlabel('City')
plt.ylabel('Count')
plt.title('Which City is most busy?')

# value_counts() orders its result by frequency, not by a fixed city order, so
# unpacking it into hard-coded city names can attach a count to the wrong city.
# Print each count next to the label it actually belongs to instead.
city_counts = SA['City'].value_counts()
for city, count in city_counts.items():
    print(city, '=', count)
print('\n')

plt.show()

In [None]:
# Histogram of the Rating column with a fitted normal curve (no KDE).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases - confirm
# the installed version still provides it before upgrading.
rating = SA.loc[:, 'Rating']
plt.figure(dpi=100)
sns.distplot(rating, kde=False, fit=norm)
plt.show()

In [None]:
# Number of transactions per product line.
plt.figure(dpi=120)
# Column and frame are passed by keyword: recent seaborn releases treat the
# first positional argument as `data`, so a bare Series is no longer read as x.
sns.countplot(x='Product line', data=SA)
plt.xlabel('Product line')
plt.ylabel('Count')
plt.xticks(fontsize=5)  # small font so the long category names do not overlap
plt.title('Which Product line has the most sales?')


plt.show()

In [None]:
# Transactions per product line, split by city (horizontal bars).
plt.figure(dpi=125)
ax = sns.countplot(data=SA, y='Product line', hue="City")
ax.set_xlabel('Count')
ax.set_ylabel('Product Line')
plt.show()

In [None]:
# Transactions per product line, split by gender (horizontal bars).
plt.figure(dpi=125)
ax = sns.countplot(data=SA, y='Product line', hue="Gender")
ax.set_xlabel('Count')
ax.set_ylabel('Product Line')
plt.show()

# Conclusion (author's reading of the chart): males outnumber females in
# Health and beauty, Electronic accessories, and "Home and ..." (the original
# note is cut off here - presumably Home and lifestyle; verify on the chart).
# Females outnumber males in the remaining product lines.

In [None]:
# Histograms of every column except 'Date'.
not_date = SA.columns != 'Date'
SalesAnalysis = SA.loc[:, not_date]
SalesAnalysis.hist(figsize=(12, 12))
plt.show()

In [None]:
# Transactions per product line, split by branch (horizontal bars).
plt.figure(dpi=125)
ax = sns.countplot(data=SA, y='Product line', hue="Branch")
ax.set_xlabel('Count')
ax.set_ylabel('Product Line')
plt.show()

# Conclusions (author's reading of the chart):
# - Branch A leads in Home and lifestyle
# - Branch B leads in Sports and travel
# - Branch C leads in Food and beverages

In [None]:
# Distribution of the Quantity column on a wide canvas.
plt.figure(figsize=(18, 6))
sns.distplot(a=SA['Quantity'])

In [None]:
# Pairwise relationships between the columns, coloured by branch.
sns.pairplot(data=SA, hue="Branch")
plt.show()

In [None]:
# Cash transactions only: count per gender, split by branch.
is_cash = SA['Payment'] == "Cash"
cash = SA.loc[is_cash]
sns.countplot(data=cash, x="Gender", hue="Branch")

In [None]:
# Cash transactions only: count per gender, split by product line.
is_cash = SA['Payment'] == "Cash"
cash = SA.loc[is_cash]
sns.countplot(data=cash, x="Gender", hue="Product line")

Label Encoding

In [None]:
# Replace every categorical column with integer codes, reusing one encoder
# that is re-fitted on each column in turn.
#
# Mappings noted in the original notebook:
#   Gender        : Female = 0, Male = 1
#   Branch        : A = 0, B = 1, C = 2
#   City          : Mandalay = 0, Naypyitaw = 1, Yangon = 2
#   Product line  : Electronic accessories = 0, Fashion accessories = 1,
#                   Food and beverages = 2, Health and beauty = 3,
#                   Home and lifestyle = 4, Sports and travel = 5
#   Payment       : Cash = 0, Credit card = 1, Ewallet = 2
#   Customer type : Member = 0, Normal = 1
labelEncoder = LabelEncoder()
categorical_columns = ["Gender", "Branch", "City", "Product line", "Payment", "Customer type"]
for column in categorical_columns:
    SA[column] = labelEncoder.fit_transform(SA[column])

In [None]:
# Frame the models are fitted on: every column of SA except 'Date'.
SalesAnalysis = SA.loc[:, SA.columns != 'Date']

Models

In [None]:
# K-means clustering (4 clusters) of the encoded sales data.
# A fixed random_state makes the centroid initialisation - and therefore the
# cluster labels and the plot below - reproducible from run to run.
kmeans = KMeans(n_clusters=4, random_state=42).fit_predict(SalesAnalysis)  # cluster label per row
SalesAnalysis = np.array(SalesAnalysis)  # plain array so columns can be indexed by position
labels = np.unique(kmeans)
# One scatter call per cluster: keep only the rows assigned to cluster i and
# plot their first two columns against each other.
for i in labels:
    in_cluster = kmeans == i
    plt.scatter(SalesAnalysis[in_cluster, 0], SalesAnalysis[in_cluster, 1], label=i)
plt.title("kmeans diagram")
plt.legend(title="cluster")  # the per-cluster labels were set but never displayed
plt.show()

In [None]:
# Nearest-neighbours distance curve.
neighb = NearestNeighbors(n_neighbors=3)
nbrs = neighb.fit(SalesAnalysis)
distances, indices = nbrs.kneighbors(SalesAnalysis)  # 3 nearest neighbours of every row
# Sort every neighbour-rank column in ascending order, then keep column 1
# (the second-closest match; column 0 is normally each row's zero distance to itself).
distances = np.sort(distances, axis=0)
distances = distances[:, 1]
# Size this figure only: writing to plt.rcParams would silently change the
# default size of every figure drawn afterwards.
plt.figure(figsize=(5, 3))
plt.plot(distances)
plt.xlabel('Rows sorted by distance')
plt.ylabel('Distance to nearest neighbour')
plt.show()

In [None]:
# Agglomerative (hierarchical) clustering, shown as a dendrogram.
plt.figure(figsize=(16, 7))
plt.title("SalesAnalysist Clusters")
ward_links = linkage(SalesAnalysis, method='ward')  # Ward-linkage merge tree
cd = dendrogram(ward_links)