In [12]:
# Cell 1 – Load & Clean
import numpy as np
import pandas as pd

# Identifier/garbage columns to discard. Names that are not present in the
# file are skipped silently because of errors='ignore' below.
cols_to_drop = ['User_ID', 'Cust_name', 'Product_ID', 'unnamed1', 'Status']

# Load the CSV, drop the columns listed above, then keep only the rows whose
# target ('Amount') is present and renumber the index from 0.
df = (
    pd.read_csv('Shoplytics_Nexus_Data.csv', encoding='unicode_escape')
    .drop(columns=cols_to_drop, errors='ignore')
    .dropna(subset=['Amount'])
    .reset_index(drop=True)
)

# Quick check: report the cleaned shape and preview the first rows.
print("Shape after cleaning:", df.shape)
df.head()

Shape after cleaning: (11239, 10)


Unnamed: 0,Gender,Age Group,Age,Marital_Status,State,Zone,Occupation,Product_Category,Orders,Amount
0,F,26-35,28,0,Maharashtra,Western,Healthcare,Auto,1,23952.0
1,F,26-35,35,1,Andhra Pradesh,Southern,Govt,Auto,3,23934.0
2,F,26-35,35,1,Uttar Pradesh,Central,Automobile,Auto,3,23924.0
3,M,0-17,16,0,Karnataka,Southern,Construction,Auto,2,23912.0
4,M,26-35,28,1,Gujarat,Western,Food Processing,Auto,2,23877.0


In [13]:
# Cell 2 – Define Features & Target
TARGET = 'Amount'

# The target is the single TARGET column; the features are every other column
# of the cleaned frame. `df` itself is left unmodified.
y = df[TARGET]
X = df.drop(columns=[TARGET])

print("Feature columns:", list(X.columns))
print("Target:", TARGET)


Feature columns: ['Gender', 'Age Group', 'Age', 'Marital_Status', 'State', 'Zone', 'Occupation', 'Product_Category', 'Orders']
Target: Amount


In [14]:
# Cell 3 – Impute Missing Values (if any)
from sklearn.impute import SimpleImputer

# NOTE(review): both imputers are fit on the full feature matrix, i.e. before
# the train/test split performed in Cell 5, so test rows contribute to the
# imputation statistics. Consider fitting on the training rows only.

# Partition the feature columns by dtype.
# `num_cols` stays a pandas Index; `cat_cols` is a plain list and is reused by
# the one-hot encoding cell.
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()

# One imputer per column family: median for numbers, mode for categories.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# 5a. Numeric columns: median imputation
X[num_cols] = num_imputer.fit_transform(X[num_cols])

# 5b. Categorical columns: most-frequent imputation
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

# Verify that no missing values remain in any column.
print("Missing per column:\n", X.isna().sum())


Missing per column:
 Gender              0
Age Group           0
Age                 0
Marital_Status      0
State               0
Zone                0
Occupation          0
Product_Category    0
Orders              0
dtype: int64


In [15]:
# Cell 4 – One-Hot Encode Categoricals
from sklearn.preprocessing import OneHotEncoder

# NOTE(review): this cell overwrites `X`, so it is not re-runnable on its own
# (a second run no longer finds `cat_cols` in `X`). The encoder is also fit on
# all rows, before the train/test split in Cell 5.

# 6. Encoder producing a dense array, with the first level of every
#    categorical column dropped.
encoder = OneHotEncoder(drop='first', sparse_output=False)

# 7. Learn the categories and encode in a single pass.
enc_array = encoder.fit_transform(X[cat_cols])

# 8. Wrap the dummies in a DataFrame that shares X's index so the
#    column-wise concatenation below aligns row for row.
dummy_names = encoder.get_feature_names_out(cat_cols)
enc_df = pd.DataFrame(enc_array, index=X.index, columns=dummy_names)

# 9. Replace the original categorical columns with their dummy columns.
non_categorical = X.drop(columns=cat_cols)
X = pd.concat([non_categorical, enc_df], axis=1)

print("Final feature matrix shape:", X.shape)


Final feature matrix shape: (11239, 60)


In [16]:
# Cell 5 – Train/Test Split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the shuffle
# reproducible between runs.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train size:", X_train.shape, "Test size:", X_test.shape)


Train size: (8991, 60) Test size: (2248, 60)


In [8]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# NOTE(review): this cell repeats the encoding already performed in
# "Cell 4 – One-Hot Encode Categoricals" and carries an older execution count,
# so it looks like a leftover from an earlier session. When the notebook is
# run top to bottom, X is already fully numeric here. The guard below then
# makes the cell a no-op instead of replacing the fitted `encoder` and
# `cat_cols` with empty ones. If it does encode, X_train/X_test from the split
# cell no longer match X and the split must be re-run.

# 1. Identify categorical columns that have not been encoded yet
pending_cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()

if pending_cat_cols:
    cat_cols = pending_cat_cols

    # 2. One-hot encoder; `sparse_output` is the newer name of the former
    #    `sparse` argument
    encoder = OneHotEncoder(drop='first', sparse_output=False)

    # 3. Fit & transform to a NumPy array
    enc_array = encoder.fit_transform(X[cat_cols])

    # 4. Build a DataFrame of the new dummy columns (preserve X's index)
    enc_df = pd.DataFrame(
        enc_array,
        columns=encoder.get_feature_names_out(cat_cols),
        index=X.index
    )

    # 5. Drop the original text columns and concatenate
    X = pd.concat([X.drop(columns=cat_cols), enc_df], axis=1)
else:
    # Nothing to encode: keep X, cat_cols and the fitted encoder untouched.
    print("No categorical columns left to encode; X is unchanged.")

print("New feature matrix shape:", X.shape)

New feature matrix shape: (11251, 2410)


In [17]:
# Cell 6 – Fit & Evaluate Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# 10. Train an ordinary least-squares model on the training split
#     (fit() returns the estimator itself, so it can be chained).
model = LinearRegression().fit(X_train, y_train)

# 11. Predict the target for the held-out rows.
y_pred = model.predict(X_test)

# 12. Evaluate on the test split: R² and root-mean-squared error
#     (RMSE is the square root of the MSE).
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Test RMSE: {rmse:.2f}")
print(f"Test R²:   {r2:.3f}")


Test RMSE: 3195.66
Test R²:   0.625


In [18]:
# Cell 7 – Save Your Model
import joblib

# NOTE(review): only the fitted regressor is written to disk. The imputers and
# the one-hot encoder fitted in earlier cells are not saved, so a consumer of
# this file must reproduce the same preprocessing before calling predict().

# Serialize the trained estimator into the current working directory.
joblib.dump(value=model, filename='linear_sales_model.joblib')
print("Model saved to linear_sales_model.joblib")


Model saved to linear_sales_model.joblib
