In [None]:
import charset_normalizer
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

### Load the dataset

In [None]:
# Sniff the file encoding from the first 10,000 bytes of the raw CSV.
with open("sales_data.csv", 'rb') as data:
    head_bytes = data.read(10000)
result = charset_normalizer.detect(head_bytes)
print(result)

In [None]:
# Read the raw file with the encoding detected in the previous cell instead of
# a hard-coded 'ascii', so the load keeps working if the source file changes.
# The detector can report no encoding (None); fall back to UTF-8 in that case.
detected_encoding = result.get('encoding') or 'utf-8'
dfp = pd.read_csv('sales_data.csv', encoding=detected_encoding)

# Re-save as UTF-8 and reload, so the working frame comes from a UTF-8 copy.
dfp.to_csv('sales_data_utf.csv', encoding='utf-8', index=False)
df = pd.read_csv('sales_data_utf.csv', encoding='utf-8')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

### Inspect the dataset

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Collect the column labels into a plain list and print them.
column_names = df.columns.tolist()
print(column_names)

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Hard-coded percentage: (158 - 141) / 158 * 100, rounded to 2 decimals.
# NOTE(review): presumably 158 is the row count and 141 a non-null count read
# off the df.info() output — confirm, and consider deriving it from df.
round(((158-141)/158)*100,2)

In [None]:
# Same calculation with 142 in place of 141.
round(((158-142)/158)*100,2)

In [None]:
# Summary statistics as produced by DataFrame.describe().
df.describe()

## Data Cleaning

### Handle missing values

In [None]:
df.isna().mean() * 100

In [None]:
df.fillna(df.select_dtypes(include='number').mean(), inplace=True)

In [None]:
df.isna().mean() * 100

In [None]:
df.fillna(df.select_dtypes(include=['object', 'string']).mode().loc[0], inplace=True)

In [None]:
df.isna().mean() * 100

In [None]:
df['Sales'] = df['Sales'].fillna(df['Sales'].mean()).round()

In [None]:
df['Discount'] = df['Discount'].fillna(df['Discount'].mean())

### Correct data types

In [None]:
def try_parsing_date(text, formats=('%Y-%m-%d', '%d/%m/%Y', '%B %d, %y', '%Y.%m.%d')):
    """Parse a date written in one of several known layouts.

    Each format is tried in order with ``pd.to_datetime``; the first one that
    parses without error wins.

    Parameters
    ----------
    text
        Value to parse (a single cell of the raw date column).
    formats
        Iterable of strftime-style format strings to try, in order. The
        default is the set of four layouts this notebook handles.

    Returns
    -------
    The value returned by ``pd.to_datetime`` for the first matching format,
    or ``pd.NaT`` when no format matches.
    """
    for fmt in formats:
        try:
            return pd.to_datetime(text, format=fmt)
        except (ValueError, TypeError):
            # This layout does not fit the value; try the next one.
            continue
    return pd.NaT

In [None]:
df['Date'] = df['Date'].apply(try_parsing_date)

In [None]:
df['year'] = df['sales_date'].dt.year

In [None]:
df['month'] = df['sales_date'].dt.month

In [None]:
df['day'] = df['sales_date'].dt.day

In [None]:
df.head()

In [None]:
df = df.set_index('sales_date')

In [None]:
df['Category'].unique()

In [None]:
df['category'].replace({'Clothingg':'Clothing', 'beauty':'Beauty', 'toys':'Toys', 'Electroncs':'Electronics', 'Home':'Home Appliances'}, inplace=True)

In [None]:
df['Category'].unique()

In [None]:
df['Region'].unique()

In [None]:
df.dtypes

In [None]:
dtype_mapping = {'Product_ID':str, 'Category':'category', 'Region':'category', 'Sales':int, 'Discount':float}

In [None]:
df = df.astype(dtype_mapping)

In [None]:
df.info()

### Remove duplicates

In [None]:
duplicates = df.duplicated()
print(f"Duplicated count: {duplicates.sum()}")

In [None]:
print(f"Number of rows before removing duplicates: {df.shape[0]}")
df.drop_duplicates(inplace=True)
print(f"Number of rows after removing duplicates: {df.shape[0]}")

## Creating Dummy Variables and Preventing Redundancy

In [None]:
df_dummies_region = pd.get_dummies(df['region'], columns=['region'],prefix='region', prefix_sep='_', dtype='int64' , drop_first=True)

In [None]:
df_dummies_region.head()

In [None]:
new_df = df.join(df_dummies_region)

In [None]:
new_df.head()

## Data Visualization with Matplotlib and Seaborn

### Distribution of Numerical Variables

In [None]:
# 2x2 grid: histograms with KDE on the top row, box plots on the bottom row,
# one column of the grid per numeric measure.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
numeric_columns = ['Sales', 'Discount']
for position, column in enumerate(numeric_columns):
    sns.histplot(df[column], kde=True, ax=axes[0, position])
for position, column in enumerate(numeric_columns):
    sns.boxplot(df[column], ax=axes[1, position])
plt.show()

### Categorical Variable Analysis

In [None]:
# Two stacked panels: row counts per Category (top) and per Region (bottom).
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12, 10))

for panel, column in zip(axes, ['Category', 'Region']):
    sns.countplot(data=df, x=column, ax=panel)

plt.show()

### Correlation Analysis

In [None]:
# Pairwise correlation of the two numeric measures, drawn as an annotated
# heatmap with the colour scale pinned to [-1, 1].
data = df.loc[:, ['Sales', 'Discount']].corr()
plot = sns.heatmap(
    data=data,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    xticklabels=True,
    yticklabels=True,
)
plot.set_title('Correlations Between Variables')
plt.show()

## Hypothesis Testing

In [None]:
# Shapiro-Wilk normality test, run separately for each numeric measure.
# By default shapiro() treats its whole input as a single sample, so passing
# the full numeric frame pooled every numeric column (including the derived
# year/month/day) into one sample and tested that mixture instead.
alpha = 0.05
for column in ['Sales', 'Discount']:
    stat, p_value = stats.shapiro(df[column])
    print('%s: Statistics=%.3f, p=%.3f' % (column, stat, p_value))
    if p_value > alpha:
        print('Sample looks Gaussian (fail to reject H0)')
    else:
        print('Sample does not look Gaussian (reject H0)')

## Data Scaling

In [None]:
# Min-max scale the two numeric measures and wrap the result in a DataFrame
# that reuses the original column labels.
features_to_scale = df.loc[:, ['Sales', 'Discount']]
normalization_scaler = MinMaxScaler()
normalization_data = normalization_scaler.fit_transform(features_to_scale)

normalization_df = pd.DataFrame(data=normalization_data, columns=features_to_scale.columns)

print("Normalization:")
normalization_df.head()

In [None]:
# Standardize the same two numeric measures that are min-max scaled in the
# previous cell. No other cell in this notebook references 'price' or
# 'units_sold'; the numeric measures used throughout are 'Sales' and 'Discount'.
scaled_columns = ['Sales', 'Discount']
standardization_scaler = StandardScaler()
standardization_data = standardization_scaler.fit_transform(df.loc[:, scaled_columns])
standardization_df = pd.DataFrame(standardization_data, columns=scaled_columns)

print("Standardization:")
standardization_df.head()

### Verify the Transformation:

In [None]:
# Summary statistics of the min-max scaled columns. With the scaler's default
# range the per-column min and max are expected to be 0 and 1 — check here.
normalization_df.describe()

## Dimensionality Reduction with PCA

### Prepare the Data:

In [None]:
# Fit a fresh min-max scaler on the two numeric measures; the resulting array
# is the input to the PCA cells that follow.
pca_features = df[['Sales', 'Discount']]
normalization_scaler = MinMaxScaler()
normalization_data = normalization_scaler.fit_transform(pca_features)

print("Normalization:")
# normalization_df is not rebuilt in this cell; this shows the existing frame.
normalization_df.head()

In [None]:
# Reduce the two scaled features to a single principal component.
pca = PCA(n_components=1)

# Fit on the min-max normalized array and project it in one step.
principal_components = pca.fit_transform(normalization_data)

# Wrap the projected values in a labelled DataFrame for display.
principal_df = pd.DataFrame(principal_components, columns=['Principal Component 1'])

# Preview the first few projected rows.
print(principal_df.head())

### Apply PCA:

In [None]:
# One-component PCA on normalization_data (same fit as the preceding cell),
# stored as a one-column DataFrame and previewed.
pca = PCA(n_components=1)
principal_components = pca.fit_transform(normalization_data)
principal_df = pd.DataFrame(principal_components, columns=['Principal Component 1'])
print(principal_df.head())

In [None]:
# Transpose the fitted components so each input feature becomes a row, then
# label rows with the feature names and the column with the component name.
loadings = pca.components_.T
feature_names = df[['Sales', 'Discount']].columns
loading_df = pd.DataFrame(
    data=loadings,
    index=feature_names,
    columns=['Principal Component 1'],
)
print(loading_df)

### Visualize PCA Components:

In [None]:
# Annotated heatmap of the component loadings, colour scale centred on zero.
plt.figure(figsize=(10, 6))
heatmap_ax = sns.heatmap(data=loading_df, annot=True, cmap='coolwarm', center=0)
heatmap_ax.set_title('PCA Component Loadings')
plt.show()