# 1. Import libraries and load the dataset

In [None]:
import os

import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Location of the raw store metadata CSV. The default is the original,
# machine-specific absolute path; set the STORES_CSV_PATH environment variable
# to point at the file when running the notebook on another machine.
STORES_CSV_PATH = os.environ.get(
    "STORES_CSV_PATH",
    r'D:\Topic_13_Project\Topic_13_Retail_Store_Sales_Time_Series\data\raw\stores.csv',
)
df_store = pd.read_csv(STORES_CSV_PATH)

# 2. Display basic information about the dataset

In [None]:
# Quick overview of the store table: sample rows, dimensions, schema, statistics.
print("First 5 Rows of Data Frame:\n", df_store.head(5))
print("Data Frame Shape:\n", df_store.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be passed to print(): doing so emits the report *before* the label
# and then prints a stray "None".
print("Data Frame Info:")
df_store.info()
print("Data Frame Statistics:\n", df_store.describe())

First 5 Rows of Data Frame:
    store_nbr           city                           state type  cluster  \
0          1          Quito                       Pichincha    D       13   
1          2          Quito                       Pichincha    D       13   
2          3          Quito                       Pichincha    D        8   
3          4          Quito                       Pichincha    D        9   
4          5  Santo Domingo  Santo Domingo de los Tsachilas    D        4   

   type_encoded  
0             3  
1             3  
2             3  
3             3  
4             3  
Data Frame Shape:
 (54, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   store_nbr     54 non-null     int64 
 1   city          54 non-null     object
 2   state         54 non-null     object
 3   type          54 non-null     object
 4   cluster       

# 3. Missing Values

In [None]:
# Per-column count of null cells, and that count as a percentage of all rows.
missing_counts = df_store.isna().sum()
missing_percentage = missing_counts.div(len(df_store)).mul(100)

# Combine both measures into a single table indexed by column name.
missing_df_store = pd.concat(
    [
        missing_counts.rename('Missing Values'),
        missing_percentage.rename('Percentage'),
    ],
    axis=1,
)

print("Missing Values Summary:\n", missing_df_store)

# Narrow the table to columns that actually contain nulls, worst offenders first.
has_missing = missing_df_store['Missing Values'] > 0
missing_df_store = missing_df_store.loc[has_missing].sort_values(
    by='Missing Values', ascending=False
)

Missing Values Summary:
            Missing Values  Percentage
store_nbr               0         0.0
city                    0         0.0
state                   0         0.0
type                    0         0.0
cluster                 0         0.0


## 4. Store Type Distribution Analysis

Analyzing how stores are distributed across different types and what this means for business strategy.

In [None]:
# Store Type Distribution: number of stores per type and each type's share.
type_counts = df_store['type'].value_counts().sort_index()
type_percentages = (type_counts / len(df_store)) * 100

# Presentation table with the share formatted to one decimal place.
store_type_dist = pd.DataFrame({
    'Store Type': type_counts.index,
    'Count': type_counts.values,
    'Percentage': [f'{pct:.1f}%' for pct in type_percentages.values]
})

print("Store Type Distribution:")
print(store_type_dist.to_string(index=False))

# A single type holding more than this share of stores counts as dominant.
DOMINANCE_THRESHOLD_PCT = 35

# The chart emoji is written as an escape so it survives re-encoding of the
# notebook file (the literal character had previously been garbled).
print("\n\U0001F4CA Store Type Insights:")
print(f"- Most common type: Type {type_counts.idxmax()} with {type_counts.max()} stores ({type_percentages.max():.1f}%)")
print(f"- Least common type: Type {type_counts.idxmin()} with {type_counts.min()} stores ({type_percentages.min():.1f}%)")
# Derive the balance statement from the data instead of asserting it unconditionally.
if type_percentages.max() > DOMINANCE_THRESHOLD_PCT:
    print(f"- Distribution is SKEWED: Type {type_counts.idxmax()} dominates (>{DOMINANCE_THRESHOLD_PCT}% of stores)")
else:
    print(f"- Distribution is RELATIVELY BALANCED across types (no single type dominates >{DOMINANCE_THRESHOLD_PCT}%)")

## 5. Geographic Concentration Analysis

Analyzing how stores are distributed geographically across cities and states.

# 4. Encoding

In [None]:
# Use Label Encoding for the 'type' column (because it's alphabetically ordered A, B, C, D, E).
# LabelEncoder assigns integer codes in sorted order of the distinct labels.
le = LabelEncoder()
# Read from df_store: the previous reference to `df` named a variable that is
# never defined in this notebook and raised NameError on a fresh kernel.
df_store['type_encoded'] = le.fit_transform(df_store['type'])

# 5. Ensure the data type is correct.

In [None]:
# Cast the identifier columns to int. The source frame is df_store; the earlier
# reference to `df` named an undefined variable and raised NameError.
df_store['store_nbr'] = df_store['store_nbr'].astype(int)
df_store['cluster'] = df_store['cluster'].astype(int)

# 6. Export the cleaned data to a new file.

In [None]:
# Destination for the cleaned store table (relative to the working directory).
output_file = 'stores_cleaned.csv'

# Write as UTF-8 CSV; the positional row index is left out of the file.
df_store.to_csv(
    output_file,
    encoding='utf-8',
    index=False,
)
