In [None]:
# importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')





In [None]:
# Load the Superstore sample data into the notebook-wide DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific path — the notebook only
# runs where this exact file exists.
DATA_PATH = r'D:\internship\SampleSuperstore.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple
dataset_shape = df.shape
print("Shape of the dataset:", dataset_shape)

# Preview the first five rows as the cell's displayed output
df.head(5)


In [None]:
# Show the dtype pandas assigned to each column
column_dtypes = df.dtypes
print("Data types of columns:\n", column_dtypes)


In [None]:
# Count the null entries in every column
missing_per_column = df.isnull().sum()
print("\nMissing values in the dataset:\n", missing_per_column)

In [None]:
# Count the rows that df.duplicated() flags as duplicates; the total is shown
# as this cell's output.
df.duplicated().sum()

In [None]:
# The preceding cell reports the number of duplicate rows (17 according to the
# original notes). Remove them so they are not counted twice in the analysis.
df = df.drop_duplicates()


In [None]:
# List the column labels of the de-duplicated DataFrame
df.columns

In [None]:

# Display the summary statistics produced by DataFrame.describe()
df.describe()

In [None]:
# Relative frequency of each 'Ship Mode' value (normalize=True requests
# proportions rather than raw counts)
df['Ship Mode'].value_counts(normalize=True)

In [None]:
# Box plot of Profit for each product Category
sns.boxplot(data=df, x='Category', y='Profit')

In [None]:
# Count outliers per numerical column with the 1.5 * IQR rule: a value is
# flagged when it falls below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
num_cols = ['Sales', 'Quantity', 'Discount', 'Profit']

for col in num_cols:
    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    is_outlier = (values < lower_bound) | (values > upper_bound)
    outliers = df[is_outlier]
    # Print the column name followed by how many rows were flagged
    print(col, len(outliers))

In [None]:
# Bar chart of Sales per Category, split into one bar per Sub-Category.
# NOTE(review): sns.barplot aggregates y with its default estimator — confirm
# whether an average or a total is what "Sales" should mean on this chart.
bar_fig, bar_ax = plt.subplots(figsize=(15, 10))
sns.barplot(data=df, x='Category', y='Sales', hue='Sub-Category', ax=bar_ax)
bar_ax.set_title('Sales by Category and Sub-Category')
bar_ax.set_xlabel('Category')
bar_ax.set_ylabel('Sales')
plt.show()

In [None]:
# Draw a 20-bin histogram for every column DataFrame.hist() plots, on one
# 20x10 inch figure, then render it.
df.hist(figsize=(20, 10), bins=20)
plt.show()

In [None]:

# Number of rows in each product Category.
sns.countplot(x="Category", data=df)
plt.show()

# Number of rows in each Sub-Category. The tick labels are rotated AFTER the
# plot is drawn: calling plt.xticks() beforehand acts on the ticks of a still
# empty figure, so the rotation would not reliably reach these labels.
sns.countplot(x="Sub-Category", data=df)
plt.xticks(rotation=90)
plt.show()

In [None]:
# One box plot per numerical column, stacked vertically in a single figure
num_cols = ['Sales', 'Quantity', 'Discount', 'Profit']
fig, ax = plt.subplots(nrows=len(num_cols), figsize=(5,10))

# Pair each subplot with the column it displays
for panel, col in zip(ax, num_cols):
    panel.boxplot(df[col])
    panel.set_title(col)

# Keep the stacked titles from overlapping the neighbouring panels
plt.tight_layout()
plt.show()

In [None]:
# Scatterplot of Profit against Sales
sns.scatterplot(x="Sales", y="Profit", data=df)
plt.show()

# Correlation matrix of the numeric columns only. The DataFrame also holds text
# columns (e.g. 'Category', 'Segment'), and on pandas >= 2.0 DataFrame.corr()
# raises when it meets them instead of silently dropping them, so they are
# filtered out explicitly here.
numeric_df = df.select_dtypes(include="number")
sns.heatmap(numeric_df.corr(), annot=True)
plt.show()

In [None]:
# Pie chart of how the rows are distributed across customer segments,
# with each slice labelled by its percentage share.
segment_counts = df['Segment'].value_counts()
plt.pie(
    segment_counts,
    labels=segment_counts.index,
    autopct='%1.1f%%',
)
plt.show()

In [None]:
# Pair plot (seaborn pairplot) restricted to the four numeric measures
# Sales, Profit, Quantity and Discount
sns.pairplot(df, vars=['Sales', 'Profit', 'Quantity', 'Discount'])



In [None]:
# Histogram of Sales with a KDE curve overlaid, on its own 2x2 inch figure
hist_fig, hist_ax = plt.subplots(figsize=(2, 2))
sns.histplot(data=df, x='Sales', kde=True, ax=hist_ax)
hist_ax.set_title('Distribution of Sales')
plt.show()

# Box plot of Sales on a second 2x2 inch figure
box_fig, box_ax = plt.subplots(figsize=(2, 2))
sns.boxplot(data=df, x='Sales', ax=box_ax)
box_ax.set_title('Box plot of Sales')
plt.show()

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Additive seasonal decomposition of the Sales column using a period of 12
# observations, followed by a plot of the result.
# NOTE(review): df['Sales'] is passed in plain row order and nothing visible in
# this notebook gives it a date index — confirm the rows are chronologically
# ordered and that 12 rows is a meaningful season length before interpreting
# the output.
result = seasonal_decompose(df['Sales'], model='additive', period=12)
result.plot()
plt.show()

Finding the top profitable products (grouped by Sub-Category)

In [None]:
# Total the numeric columns per Sub-Category and rank by total Profit, highest
# first. numeric_only=True keeps text columns out of the sum: on pandas >= 2.0
# a bare .sum() no longer drops them and concatenates their strings instead.
top_products = (
    df.groupby("Sub-Category")
    .sum(numeric_only=True)
    .sort_values("Profit", ascending=False)
)
print(top_products.head(10))

Finding the top profitable cities

In [None]:
# Total the numeric columns per City and rank by total Profit, highest first.
# numeric_only=True keeps text columns out of the sum: on pandas >= 2.0 a bare
# .sum() no longer drops them and concatenates their strings instead.
top_cities = (
    df.groupby("City")
    .sum(numeric_only=True)
    .sort_values("Profit", ascending=False)
)
print(top_cities.head(10))

Finding the top profitable states

In [None]:
# Total the numeric columns per State and rank by total Profit, highest first.
# numeric_only=True keeps text columns out of the sum: on pandas >= 2.0 a bare
# .sum() no longer drops them and concatenates their strings instead.
top_states = (
    df.groupby("State")
    .sum(numeric_only=True)
    .sort_values("Profit", ascending=False)
)
print(top_states.head(10))


Finding the top profitable segments

In [None]:
# Total the numeric columns per Segment and rank by total Profit, highest first.
# numeric_only=True keeps text columns out of the sum: on pandas >= 2.0 a bare
# .sum() no longer drops them and concatenates their strings instead.
top_segments = (
    df.groupby("Segment")
    .sum(numeric_only=True)
    .sort_values("Profit", ascending=False)
)
print(top_segments.head(10))

Finding the top profitable regions

In [None]:

# Total the numeric columns per Region and rank by total Profit, highest first.
# numeric_only=True keeps text columns out of the sum: on pandas >= 2.0 a bare
# .sum() no longer drops them and concatenates their strings instead.
top_regions = (
    df.groupby("Region")
    .sum(numeric_only=True)
    .sort_values("Profit", ascending=False)
)
print(top_regions.head(10))

Finding the top profitable categories

In [None]:

# Total the numeric columns per Category and rank by total Profit, highest
# first. numeric_only=True keeps text columns out of the sum: on pandas >= 2.0
# a bare .sum() no longer drops them and concatenates their strings instead.
top_categories = (
    df.groupby("Category")
    .sum(numeric_only=True)
    .sort_values("Profit", ascending=False)
)
print(top_categories.head(10))

Finding the top loss-making products (grouped by Sub-Category)

In [None]:

# Total the numeric columns per Sub-Category and rank by total Profit, lowest
# first, so the biggest loss-makers lead. numeric_only=True keeps text columns
# out of the sum: on pandas >= 2.0 a bare .sum() no longer drops them and
# concatenates their strings instead.
loss_products = (
    df.groupby("Sub-Category")
    .sum(numeric_only=True)
    .sort_values("Profit", ascending=True)
)
print(loss_products.head(10))

Finding the top loss-making cities

In [None]:

# Total the numeric columns per City and rank by total Profit, lowest first,
# so the biggest loss-makers lead. numeric_only=True keeps text columns out of
# the sum: on pandas >= 2.0 a bare .sum() no longer drops them and concatenates
# their strings instead.
loss_cities = (
    df.groupby("City")
    .sum(numeric_only=True)
    .sort_values("Profit", ascending=True)
)
print(loss_cities.head(10))

Finding the top loss-making states

In [None]:

# Total the numeric columns per State and rank by total Profit, lowest first,
# so the biggest loss-makers lead. numeric_only=True keeps text columns out of
# the sum: on pandas >= 2.0 a bare .sum() no longer drops them and concatenates
# their strings instead.
loss_states = (
    df.groupby("State")
    .sum(numeric_only=True)
    .sort_values("Profit", ascending=True)
)
print(loss_states.head(10))

Finding the top loss-making segments

In [None]:

# Total the numeric columns per Segment and rank by total Profit, lowest first,
# so the weakest segments lead. numeric_only=True keeps text columns out of the
# sum: on pandas >= 2.0 a bare .sum() no longer drops them and concatenates
# their strings instead.
loss_segments = (
    df.groupby("Segment")
    .sum(numeric_only=True)
    .sort_values("Profit", ascending=True)
)
print(loss_segments.head(10))

Finding the top loss-making regions

In [None]:

# Total the numeric columns per Region and rank by total Profit, lowest first,
# so the weakest regions lead. numeric_only=True keeps text columns out of the
# sum: on pandas >= 2.0 a bare .sum() no longer drops them and concatenates
# their strings instead.
loss_regions = (
    df.groupby("Region")
    .sum(numeric_only=True)
    .sort_values("Profit", ascending=True)
)
print(loss_regions.head(10))

Finding the top loss-making categories

In [None]:

# Total the numeric columns per Category and rank by total Profit, lowest
# first, so the weakest categories lead. numeric_only=True keeps text columns
# out of the sum: on pandas >= 2.0 a bare .sum() no longer drops them and
# concatenates their strings instead.
loss_categories = (
    df.groupby("Category")
    .sum(numeric_only=True)
    .sort_values("Profit", ascending=True)
)
print(loss_categories.head(10))

Conclusion

Based on the exploratory data analysis, we identified several weak areas where targeted work could increase profit. These areas include:

#### Improving the sales and profit of the office supplies category
#### Addressing the negative profit margins in the central and eastern regions
#### Reducing the losses in the furniture and technology categories
#### Addressing the negative profit margins in certain states and cities
#### Analyzing the customer segments and identifying areas for improvement in sales and profit