In [1]:
import pandas as pd

In [2]:
# Load the raw regression dataset from the project-relative Data/ folder
# and preview the first rows to sanity-check the parse.
dataset_path = "Data/realistic_linear_regression_dataset.csv"
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,ProductCategory,Region,CustomerSegment,IsPromotionApplied,ProductionCost,MarketingSpend,SeasonalDemandIndex,CompetitorPrice,CustomerRating,EconomicIndex,StoreCount,SalesRevenue
0,Furniture,East,High Income,Yes,536.051521,189.277811,1.159611,220.831351,4.035607,146.225757,52,2293.143707
1,Toys,West,High Income,No,352.701361,255.921497,1.545366,467.217175,4.106804,104.261304,35,1640.454368
2,Electronics,South,High Income,No,618.989105,277.399353,1.671902,363.623261,4.021775,77.220752,44,2173.086023
3,Furniture,West,Middle Income,Yes,339.959644,153.557699,1.408244,209.853621,4.14889,128.277455,15,1672.608857
4,Furniture,West,Middle Income,Yes,477.951385,155.814478,2.177301,274.85995,4.913782,111.309643,61,2443.222482


In [3]:
# (rows, columns) of the loaded frame; a bare last-line expression is
# displayed by the notebook, so no print() wrapper is needed.
df.shape

(2000, 12)


In [4]:
# Column labels of the loaded frame, shown via the notebook's own display
# of the last expression rather than print().
df.columns

Index(['ProductCategory', 'Region', 'CustomerSegment', 'IsPromotionApplied',
       'ProductionCost', 'MarketingSpend', 'SeasonalDemandIndex',
       'CompetitorPrice', 'CustomerRating', 'EconomicIndex', 'StoreCount',
       'SalesRevenue'],
      dtype='object')


In [5]:
# DataFrame.info() writes its report directly to stdout and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ProductCategory      2000 non-null   object 
 1   Region               2000 non-null   object 
 2   CustomerSegment      2000 non-null   object 
 3   IsPromotionApplied   2000 non-null   object 
 4   ProductionCost       2000 non-null   float64
 5   MarketingSpend       2000 non-null   float64
 6   SeasonalDemandIndex  2000 non-null   float64
 7   CompetitorPrice      2000 non-null   float64
 8   CustomerRating       2000 non-null   float64
 9   EconomicIndex        2000 non-null   float64
 10  StoreCount           2000 non-null   int64  
 11  SalesRevenue         2000 non-null   float64
dtypes: float64(7), int64(1), object(4)
memory usage: 187.6+ KB
None


In [6]:
# Missing-value count per column, displayed as the cell's last expression.
df.isnull().sum()

ProductCategory        0
Region                 0
CustomerSegment        0
IsPromotionApplied     0
ProductionCost         0
MarketingSpend         0
SeasonalDemandIndex    0
CompetitorPrice        0
CustomerRating         0
EconomicIndex          0
StoreCount             0
SalesRevenue           0
dtype: int64


In [7]:
# data overview
#This dataset contains historical business performance data across product categories, regions, and customer segments.
#It includes variables related to marketing spend, promotions, production cost, competitive pricing, seasonality, customer ratings, and economic conditions. 
#The data is used to analyze how these factors influence sales revenue and support data-driven business decisions.

In [8]:
# column-level-interpretation
# Column Name          Data Type     Business Meaning                              Analytical Role                                  
 
# ProductCategory      Categorical   Category of the product being sold            Captures demand differences across product types 
# Region               Categorical   Geographic region of sales                    Accounts for regional market variation           
# CustomerSegment      Categorical   Customer income tier (e.g., High Income)      Reflects differences in purchasing behavior      
# IsPromotionApplied   Yes/No flag   Indicates whether a promotion was active      Measures promotional impact on revenue           
# ProductionCost       Numeric       Cost incurred to produce the product          Influences pricing and profit margins            
# MarketingSpend       Numeric       Amount spent on marketing activities          Key investment driver for revenue                
# SeasonalDemandIndex  Numeric       Index representing seasonal demand strength   Captures cyclic sales patterns                   
# CompetitorPrice      Numeric       Average competitor pricing                    Represents external pricing pressure             
# CustomerRating       Numeric       Average customer satisfaction rating          Proxy for perceived product quality              
# EconomicIndex        Numeric       Indicator of overall economic conditions      Captures macro-economic effects                  
# StoreCount           Numeric       Number of stores selling the product          Reflects distribution scale                     
# SalesRevenue         Numeric       Total revenue generated                       Target variable                        


In [9]:
# target variable identification
# The target variable for this analysis is SalesRevenue, which represents the total revenue generated from sales under specific business and market conditions.

# SalesRevenue is a continuous numerical variable, making it suitable for regression-based modeling techniques. 
# From a business perspective, it serves as a primary performance indicator used to evaluate the effectiveness of marketing investments, promotional strategies, pricing competition, and operational scale.

# All remaining variables in the dataset are treated as explanatory features that potentially influence sales revenue. 
# The objective of the modeling process is to quantify the direction and magnitude of these influences while maintaining interpretability for business decision-making.

In [10]:
# data types and format validation
# The dataset primarily consists of numeric and categorical variables that align with their intended business meaning. The target variable, SalesRevenue, is stored as a numeric type, confirming its suitability for regression analysis.

# Categorical variables such as product category, region, and customer segment are currently stored as object types and will require appropriate encoding during the preprocessing stage.

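# The binary variable IsPromotionApplied is stored as Yes/No text (object type) rather than 0/1, so it will also require encoding during preprocessing; its values are checked for consistent representation later in this notebook. No critical data type inconsistencies were observed at this stage.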

In [11]:
# No missing values were observed across the dataset, indicating a high level of data completeness suitable for regression analysis.

In [12]:
# Summary statistics, transposed so each described column becomes one row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ProductionCost,2000.0,499.969606,98.840116,129.120339,434.452593,500.600031,566.588504,865.77018
MarketingSpend,2000.0,201.27342,49.95376,46.777157,167.001091,201.052487,235.146602,379.973339
SeasonalDemandIndex,2000.0,1.484039,0.503743,-0.527025,1.15043,1.485411,1.826129,3.046715
CompetitorPrice,2000.0,299.586427,81.524787,46.020085,241.267538,301.346897,355.093381,552.44303
CustomerRating,2000.0,3.999551,0.497141,2.449741,3.656696,3.985916,4.338487,5.550593
EconomicIndex,2000.0,100.220798,19.176983,36.004789,87.297706,99.951705,112.597996,164.320403
StoreCount,2000.0,48.6745,28.940548,1.0,23.0,49.0,74.0,99.0
SalesRevenue,2000.0,2072.85745,346.007903,1094.518587,1808.106571,2068.173855,2341.223175,3115.114292


In [13]:
# Count rows that break basic range rules; every count is expected to be 0.
non_negative_columns = ["SalesRevenue", "MarketingSpend", "ProductionCost"]
invalid_checks = {
    f"Negative {column}": df[column].lt(0).sum()
    for column in non_negative_columns
}
# StoreCount is tested with <= 0, so negatives are counted along with zeros.
invalid_checks["Zero StoreCount"] = df["StoreCount"].le(0).sum()

invalid_checks


{'Negative SalesRevenue': 0,
 'Negative MarketingSpend': 0,
 'Negative ProductionCost': 0,
 'Zero StoreCount': 0}

In [14]:
# Customer rating should usually fall within a known scale (e.g., 1–5).
# NOTE(review): the recorded output for this cell shows a maximum of about 5.55,
# i.e. above 5 — confirm the intended rating scale before modeling.
df["CustomerRating"].describe()


count    2000.000000
mean        3.999551
std         0.497141
min         2.449741
25%         3.656696
50%         3.985916
75%         4.338487
max         5.550593
Name: CustomerRating, dtype: float64

In [15]:
# Seasonal demand index should not be negative; count the rows that violate this.
df["SeasonalDemandIndex"].lt(0).sum()

3

In [16]:
# During the range validation process, a small number of records (3 observations) were found to contain negative values in the SeasonalDemandIndex variable. Since seasonal demand indices are expected to represent relative demand intensity, negative values are not logically valid.

# These anomalies are likely the result of data generation noise or data entry inconsistencies. No corrective action is taken at this stage, and the handling of these records will be addressed during the data preprocessing phase.

In [17]:
# Tally each promotion flag value; dropna=False keeps any missing values
# visible as their own row instead of silently dropping them.
promotion_flag = df["IsPromotionApplied"]
promotion_flag.value_counts(dropna=False)


IsPromotionApplied
No     1028
Yes     972
Name: count, dtype: int64

In [18]:
# Data Granularity
# Each row in the dataset represents a single business observation corresponding to a specific combination of product category, region, and customer segment under given market conditions. The values reflect aggregated sales performance influenced by promotional activity, marketing investment, seasonal demand, competitive pricing, and broader economic factors.

# Assumptions

# The recorded values accurately represent historical business conditions at the time of observation.

# The relationship between explanatory variables and sales revenue is assumed to be approximately linear for interpretability purposes.

# External factors not captured in the dataset (such as sudden market disruptions or policy changes) are assumed to have a limited impact on the observed sales revenue.

# Constraints and Limitations

# The dataset does not include direct pricing or profit margin information, limiting margin-level analysis.

# Customer behavior is represented indirectly through aggregated features rather than individual-level data.

# The analysis focuses on association and explanation rather than causal inference.