In [1]:
import pandas as pd

# Load the raw Walmart sales data from a CSV in the working directory
df = pd.read_csv('Walmart_Sales.csv')

# Preview the first rows to get a feel for the columns and values
print("Initial Dataset Preview:")
print(df.head())

# Summarise column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
print("\nDataset Info:")
df.info()

Initial Dataset Preview:
   Store        Date  Weekly_Sales  Holiday_Flag  Temperature  Fuel_Price  \
0      1  05-02-2010    1643690.90             0        42.31       2.572   
1      1  12-02-2010    1641957.44             1        38.51       2.548   
2      1  19-02-2010    1611968.17             0        39.93       2.514   
3      1  26-02-2010    1409727.59             0        46.63       2.561   
4      1  05-03-2010    1554806.68             0        46.50       2.625   

          CPI  Unemployment  
0  211.096358         8.106  
1  211.242170         8.106  
2  211.289143         8.106  
3  211.319643         8.106  
4  211.350143         8.106  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         6435 non-null   int64  
 1   Date          6435 non-null   object 
 2   Weekly_Sales  6435 non-null   floa

In [3]:
# Tally the null entries in every column
null_counts = df.isna().sum()

print("\nMissing Values:")
print(null_counts)


Missing Values:
Store           0
Date            0
Weekly_Sales    0
Holiday_Flag    0
Temperature     0
Fuel_Price      0
CPI             0
Unemployment    0
dtype: int64


In [5]:
# Count rows that are exact repeats of an earlier row
duplicates_before = df.duplicated().sum()
print("\nNumber of Duplicate Rows:", duplicates_before)

# Keep only the first occurrence of each row
df = df.drop_duplicates()

# Re-count to confirm nothing is left over
duplicates_after = df.duplicated().sum()
print("Number of Duplicate Rows After Removal:", duplicates_after)


Number of Duplicate Rows: 0
Number of Duplicate Rows After Removal: 0


In [7]:
# Normalise the headers: lowercase everything, then swap spaces for underscores
lowercased_names = df.columns.str.lower()
normalized_names = lowercased_names.str.replace(' ', '_')
df.columns = normalized_names

# Show the resulting header index
print("\nUpdated Column Names:")
print(df.columns)


Updated Column Names:
Index(['store', 'date', 'weekly_sales', 'holiday_flag', 'temperature',
       'fuel_price', 'cpi', 'unemployment'],
      dtype='object')


In [9]:
# Parse the day-month-year strings in 'date' into real datetimes
parsed_dates = pd.to_datetime(df['date'], format='%d-%m-%Y')
df['date'] = parsed_dates

# Inspect the converted column: its dtype first, then a few sample values
date_column = df['date']
print("\nDate Column Data Type:")
print(date_column.dtype)
print("\nSample Dates:")
print(date_column.head())


Date Column Data Type:
datetime64[ns]

Sample Dates:
0   2010-02-05
1   2010-02-12
2   2010-02-19
3   2010-02-26
4   2010-03-05
Name: date, dtype: datetime64[ns]


In [11]:
# Show the dtypes as they stand before any casting
print("\nCurrent Data Types:")
print(df.dtypes)

# 'store' and 'holiday_flag' hold discrete labels, so cast each to category
for discrete_column in ('store', 'holiday_flag'):
    df[discrete_column] = df[discrete_column].astype('category')

# Show the dtypes again to confirm the casts took effect
print("\nUpdated Data Types:")
print(df.dtypes)


Current Data Types:
store                    int64
date            datetime64[ns]
weekly_sales           float64
holiday_flag             int64
temperature            float64
fuel_price             float64
cpi                    float64
unemployment           float64
dtype: object

Updated Data Types:
store                 category
date            datetime64[ns]
weekly_sales           float64
holiday_flag          category
temperature            float64
fuel_price             float64
cpi                    float64
unemployment           float64
dtype: object


In [13]:
# Summary statistics for the numeric columns only.
# The selection is pinned with include=['number'] so the table always matches
# its heading: describe()'s default column selection is version-dependent, and
# newer pandas releases also summarise datetime columns such as 'date'.
print("\nSummary Statistics for Numerical Columns:")
print(df.describe(include=['number']))

# Cardinality of the store identifier and the distinct holiday flag values
print("\nUnique Values in 'store':", df['store'].nunique())
print("Unique Values in 'holiday_flag':", df['holiday_flag'].unique())


Summary Statistics for Numerical Columns:
       weekly_sales  temperature   fuel_price          cpi  unemployment
count  6.435000e+03  6435.000000  6435.000000  6435.000000   6435.000000
mean   1.046965e+06    60.663782     3.358607   171.578394      7.999151
std    5.643666e+05    18.444933     0.459020    39.356712      1.875885
min    2.099862e+05    -2.060000     2.472000   126.064000      3.879000
25%    5.533501e+05    47.460000     2.933000   131.735000      6.891000
50%    9.607460e+05    62.670000     3.445000   182.616521      7.874000
75%    1.420159e+06    74.940000     3.735000   212.743293      8.622000
max    3.818686e+06   100.140000     4.468000   227.232807     14.313000

Unique Values in 'store': 45
Unique Values in 'holiday_flag': [0, 1]
Categories (2, int64): [0, 1]


In [15]:
# Write the cleaned frame to disk, leaving the row index out of the file
cleaned_csv_name = 'Walmart_Sales_Cleaned.csv'
df.to_csv(cleaned_csv_name, index=False)
print(f"\nCleaned dataset saved as '{cleaned_csv_name}'")


Cleaned dataset saved as 'Walmart_Sales_Cleaned.csv'
