In [None]:
# Part 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Part 2: Load the dataset
# The CSV is read from an absolute path under /content; change `csv_path`
# if your copy of the file lives somewhere else.
csv_path = '/content/Walmart_customer_purchases_with_competitors.csv'
df = pd.read_csv(csv_path)

# Sanity check: report the (rows, columns) shape, then show the leading rows.
print("Initial dataframe shape:", df.shape)
print(df.head())

# Part 3: Keep only relevant columns that exist
# "Purchase_Amount" is the regression target; "Payment_Method" and "City"
# are the categorical predictors that are one-hot encoded further down.
cols_needed = ["Purchase_Amount", "Payment_Method", "City"]
existing_cols = [col for col in cols_needed if col in df.columns]

if not existing_cols:
    raise ValueError("None of the required columns exist in the dataframe.")

# The target is mandatory. Without this check a missing target would only
# surface as an opaque KeyError from the dropna(subset=...) call below.
if "Purchase_Amount" not in existing_cols:
    raise ValueError(
        "Required target column 'Purchase_Amount' is missing from the dataframe."
    )

# At least one predictor must be present; otherwise the feature matrix built
# later would have zero columns and the model could not be fitted.
feature_cols = [col for col in existing_cols if col != "Purchase_Amount"]
if not feature_cols:
    raise ValueError(
        "None of the feature columns ('Payment_Method', 'City') exist in the dataframe."
    )

# Work on an independent copy so the column assignments below do not write
# into a slice of the originally loaded frame.
df = df[existing_cols].copy()

# Drop rows with missing target
df = df.dropna(subset=["Purchase_Amount"])

# Fill missing categorical values with an explicit "Unknown" level so those
# rows are kept rather than discarded.
for col in feature_cols:
    df[col] = df[col].fillna("Unknown")

# Name of the column the model will learn to predict.
target = "Purchase_Amount"

# Part 4: One-hot encode whichever categorical columns are present.
# drop_first=True omits the first level of each encoded column.
cat_cols = [name for name in ("Payment_Method", "City") if name in df.columns]
df_model = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Part 5: Separate the target (y) from the feature matrix (X).
y = df_model[target]
X = df_model.drop(columns=[target])

# Stop early if preprocessing left no rows to train on.
if len(X) == 0:
    raise ValueError("No data available after preprocessing. Check your input dataframe.")

# Part 6: Train/test split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
_split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = _split_parts

# Report the size of each feature partition.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

# Part 7: Fit a plain linear regression on the training partition.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# Part 8: Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

# Optional: Inspect the fitted coefficients.
# Pair each feature column with its coefficient, then order the table from
# the largest coefficient to the smallest.
coeff_df = pd.DataFrame({"Feature": X_train.columns, "Coefficient": model.coef_})
coeff_df = coeff_df.sort_values("Coefficient", ascending=False)

print(coeff_df)

Initial dataframe shape: (50000, 20)
            Product_Name   Brand     Category  Market_Price  Purchase_Amount  \
0  PRD00001   Smartphone   Apple  Electronics        309.93           253.26   
1  PRD00002      T-Shirt  Adidas     Clothing         86.83            73.19   
2  PRD00003      Perfume    Dove       Beauty        150.37           125.62   
3  PRD00004   Smartwatch    Sony  Electronics        538.88           450.32   
4  PRD00005   Smartphone   Apple  Electronics        406.85           369.28   

  Discount_Applied  Rating                           Customer_ID  Age  Gender  \
0               No       1  84607c1f-910c-44d5-b89f-e1ee06dd34c0   49  Female   
1              Yes       1  f2a81712-a73e-4424-8b39-4c615a0bd4ea   36   Other   
2              Yes       1  da9be287-8b0e-4688-bccd-1a2cdd7567c6   52    Male   
3               No       2  50ec6932-3ac7-492f-9e55-4b148212f302   47  Female   
4              Yes       2  8fdc3098-fc75-4b0f-983c-d8d8168c6362   43   Other