# In [1]:  (notebook cell marker kept from the export)
# Step 1: Import libraries
import pandas as pd

# Step 2: Read the CSV that sits in the current working directory
df = pd.read_csv("online_shoppers_intention.csv")

# Step 3: Preview the leading five records (5 is also head()'s default)
df.head(n=5)


# Summarise the table: its dimensions first, then the dtype of every column
dataset_shape = df.shape
column_dtypes = df.dtypes
print("Shape of dataset:", dataset_shape)
print("\nData types:\n", column_dtypes)

# Count the null entries per column (isna() is the same check as isnull())
missing_per_column = df.isna().sum()
print("\nMissing values:\n", missing_per_column)


from sklearn.preprocessing import LabelEncoder, StandardScaler

# 1. Integer-encode the categorical columns on a copy, so `df` stays as loaded
df_encoded = df.copy()

# A dedicated encoder instance for each of 'Month' and 'VisitorType'
le_month = LabelEncoder()
le_visitor = LabelEncoder()

# NOTE(review): LabelEncoder numbers classes in sorted order, so the 'Month'
# codes follow alphabetical rather than calendar order - confirm this is
# acceptable for whatever consumes the column.
for column_name, encoder in (('Month', le_month), ('VisitorType', le_visitor)):
    df_encoded[column_name] = encoder.fit_transform(df_encoded[column_name])

# 'Weekend' (boolean -> 0/1) and 'Revenue' (int, for classification later)
for flag_column in ('Weekend', 'Revenue'):
    df_encoded[flag_column] = df_encoded[flag_column].astype(int)

# 2. Standardize the numeric behavior columns
features_to_scale = [
    'Administrative',
    'Administrative_Duration',
    'Informational',
    'Informational_Duration',
    'ProductRelated',
    'ProductRelated_Duration',
    'BounceRates',
    'ExitRates',
    'PageValues',
    'SpecialDay',
]

# NOTE(review): the scaler is fitted on every row, before any train/test split
# performed further down in this file.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_encoded[features_to_scale])
df_encoded[features_to_scale] = scaled_values

# Final preview
df_encoded.head()



from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Select behavior features only. These are exactly the columns standardized
# earlier, so reuse `features_to_scale` rather than keeping a second
# hand-written copy of the same list that could drift out of step.
cluster_features = df_encoded[features_to_scale]



# Settings shared by every KMeans fit below.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in release 1.4 (and warns about it in 1.2/1.3), so leaving it unset
# makes the clustering depend on the installed version.
kmeans_params = {'n_init': 10, 'random_state': 42}

# Elbow method to find optimal k: fit one model per candidate k and keep its
# inertia (within-cluster sum of squared distances).
K_range = range(1, 11)
inertia = [
    KMeans(n_clusters=k, **kmeans_params).fit(cluster_features).inertia_
    for k in K_range
]

# Plot the elbow
plt.figure(figsize=(8, 5))
plt.plot(K_range, inertia, 'bo-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method For Optimal k')
plt.grid(True)
plt.show()


# Apply KMeans with 3 clusters.
# NOTE(review): k=3 is hard-coded; presumably it was read off the elbow plot
# above - confirm it still matches the curve after any data change.
kmeans = KMeans(n_clusters=3, **kmeans_params)
df_encoded['ShopperType'] = kmeans.fit_predict(cluster_features)

# Preview the new column: number of rows per cluster id, in id order
df_encoded[['ShopperType']].value_counts().sort_index()


# Cluster profiles based on behavior: the mean of every clustering feature
# within each cluster. The column list is taken from `cluster_features` so the
# profile always covers exactly the columns the clusters were fitted on.
profile_columns = list(cluster_features.columns)
cluster_profile = df_encoded.groupby('ShopperType')[profile_columns].mean()

cluster_profile


# Map cluster numbers to human-readable shopper types.
# NOTE(review): KMeans cluster ids are arbitrary integers, so which id
# deserves which name has to be judged from `cluster_profile`; nothing here
# verifies the pairing - re-check it whenever the clustering is refitted.
shopper_labels = {
    0: 'Bargain Hunter',
    1: 'Planned Spender',
    2: 'Impulse Buyer'
}

# Series.map() leaves NaN for any value missing from the dict, so a change in
# the number of clusters would otherwise go unnoticed. Fail loudly instead.
unlabelled_clusters = set(df_encoded['ShopperType'].unique()) - set(shopper_labels)
if unlabelled_clusters:
    raise ValueError(
        f"No shopper label defined for cluster id(s): {sorted(unlabelled_clusters)}"
    )

df_encoded['ShopperProfile'] = df_encoded['ShopperType'].map(shopper_labels)

# Preview
df_encoded[['ShopperType', 'ShopperProfile']].head()



# Bucket a product-interaction value into a coarse Low / Medium / High level
def product_category_group(val: float, *, low_cutoff: float = -0.5,
                           high_cutoff: float = 0.5) -> str:
    """Return 'Low', 'Medium' or 'High' for a single numeric value.

    In this file the function is applied to the 'ProductRelated' column after
    it has been standardized, so the default cut-offs are expressed in
    standard-deviation units rather than raw page counts.

    Args:
        val: The value to classify.
        low_cutoff: Values less than or equal to this are 'Low'.
        high_cutoff: Values above ``low_cutoff`` and less than or equal to
            this are 'Medium'; anything larger is 'High'.

    Returns:
        One of the strings 'Low', 'Medium' or 'High'.

    Raises:
        ValueError: If ``low_cutoff`` is greater than ``high_cutoff``.

    Note:
        NaN compares false against both cut-offs and therefore falls through
        to 'High', exactly as it did with the fixed thresholds.
    """
    if low_cutoff > high_cutoff:
        raise ValueError("low_cutoff must not exceed high_cutoff")

    if val <= low_cutoff:
        return 'Low'
    if val <= high_cutoff:
        return 'Medium'
    return 'High'

# Label every row by passing its 'ProductRelated' value through
# product_category_group, one element at a time
product_related_values = df_encoded['ProductRelated']
df_encoded['ProductCategoryLevel'] = product_related_values.map(product_category_group)

# Check balance: how many rows landed in each level
df_encoded['ProductCategoryLevel'].value_counts()


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Label to predict: the Low/Medium/High level assigned to each row
target = df_encoded['ProductCategoryLevel']

# Predictors: every column except the two cluster outputs, the label itself
# and 'ProductRelated', the column the label was computed from
excluded_columns = ['ShopperType', 'ShopperProfile', 'ProductCategoryLevel', 'ProductRelated']
features = df_encoded.drop(columns=excluded_columns)

# Hold out 20% of the rows, stratified on the label, with a fixed seed
split_parts = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)
X_train, X_test, y_train, y_test = split_parts

# Fit a 100-tree random forest (fit() returns the estimator itself)
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows
y_pred = rf.predict(X_test)

# Build both evaluation summaries first, then print them
report_text = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print("Classification Report:\n", report_text)
print("Confusion Matrix:\n", matrix)




import matplotlib.pyplot as plt

# Pie chart of how the rows are split across the named shopper profiles
profile_counts = df_encoded['ShopperProfile'].value_counts()
pie_options = {
    'autopct': '%1.1f%%',
    'colors': ['gold', 'lightgreen', 'lightcoral'],
    'title': 'Shopper Types Distribution',
    'ylabel': '',
}
profile_counts.plot.pie(**pie_options)
plt.show()



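# Recorded cell output: NameError: name 'df_encoded' is not defined
# NOTE(review): presumably the plotting cell was run in a session where the
# preprocessing cells above had not been executed yet - re-run them in order.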