In [None]:
!pip install pandas matplotlib seaborn

In [18]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
# set_theme() is seaborn's preferred interface; sns.set() is only an alias for
# it and is documented as a candidate for removal.
sns.set_theme(style="whitegrid")


In [None]:
# Read the training data from train.csv (relative to the working directory)
# and preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

In [None]:
# Inspect the schema and the per-column missing-value counts before cleaning.
df.info()
# Only the last expression of a cell is rendered automatically, so the null
# counts must be displayed explicitly or they are computed and thrown away.
# (display is provided as a builtin by IPython/Jupyter.)
display(df.isnull().sum())

# Drop rows containing any missing value and parse 'date' into datetimes.
# dropna()/assign() return new frames, so nothing is mutated in place and the
# cell yields the same result no matter how many times it is re-run.
df = df.dropna().assign(date=lambda frame: pd.to_datetime(frame['date']))

In [5]:
# Central-tendency summary of the 'sales' column. Each statistic is computed
# lazily, right before it is printed, one line per statistic.
sales_column = df['sales']
for label, compute in (
    ("Mean:", sales_column.mean),
    ("Median:", sales_column.median),
    # mode() returns a Series of the most frequent value(s); take the first.
    ("Mode:", lambda: sales_column.mode().iloc[0]),
):
    print(label, compute())

Mean: 357.77574911261877
Median: 11.0
Mode: 0.0


In [None]:
#Sales Over Time Visualization
# Aggregate to one value per date before plotting. When sns.lineplot receives
# the raw frame it computes the per-date mean itself AND bootstraps a
# confidence band across all rows, which is very slow on a large frame.
# The line drawn here is the same per-date mean; only the band is dropped.
daily_mean_sales = df.groupby('date')['sales'].mean()

plt.figure(figsize=(12,6))
sns.lineplot(x=daily_mean_sales.index, y=daily_mean_sales.values)
plt.title('Sales Over Time')
plt.xlabel('Date')
plt.ylabel('Mean Sales')
plt.show()

In [None]:
#Sales by Store
# Total sales per store; groupby sorts by key, so bars appear in
# store-number order.
plt.figure(figsize=(12,6))
df.groupby('store_nbr')['sales'].sum().plot(kind='bar')
plt.title('Total Sales by Store')
# Label both axes so the figure can be read on its own.
plt.xlabel('Store Number')
plt.ylabel('Total Sales')
plt.show()

In [None]:
# Total sales per calendar month (1-12), plotted as a line.
# The month number is stored on df as a 'month' column.
df['month'] = df['date'].dt.month
monthly_sales = df['sales'].groupby(df['month']).sum()

plt.figure(figsize=(10, 6))
sns.lineplot(
    x=monthly_sales.index,
    y=monthly_sales.values,
).set(
    title='Monthly Sales Trend',
    xlabel='Month',
    ylabel='Total Sales',
)
plt.show()

In [None]:
#Check Relationship between Numerical Columns
# include='number' selects every numeric dtype. Listing only float64/int64
# silently drops int32, float32 and nullable numeric columns.
numeric_df = df.select_dtypes(include='number')

# Pairwise correlations of the numeric columns, annotated with coefficients.
plt.figure(figsize=(8,6))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [12]:
# Outlier Detection using Z-score
# numpy (np) and scipy.stats (stats) come from the notebook's import cell.

# A row is flagged when its sales value lies more than this many standard
# deviations from the mean of the column.
Z_SCORE_THRESHOLD = 3

# Absolute z-score of every sales value, then a boolean mask over the rows.
z_scores = np.abs(stats.zscore(df['sales']))
outliers = df[z_scores > Z_SCORE_THRESHOLD]
print("Number of outliers:", len(outliers))

Number of outliers: 65073


In [None]:
# Distribution of the 'sales' column: 50-bin histogram with a KDE overlay.
plt.figure(figsize=(8, 6))
histogram_axes = sns.histplot(df['sales'], bins=50, kde=True)
histogram_axes.set_title('Sales Distribution')
plt.show()

In [None]:
#Store-Level Analysis
# Rank stores by total sales, highest first.
store_sales = df.groupby('store_nbr')['sales'].sum().sort_values(ascending=False)
plt.figure(figsize=(12,6))
store_sales.plot(kind='bar')
# The title says the bars are ranked, which distinguishes this chart from a
# plain per-store total in store-number order.
plt.title('Total Sales by Store (Highest to Lowest)')
plt.xlabel('Store Number')
plt.ylabel('Total Sales')
plt.show()