In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
from google.colab import drive

# Make Google Drive visible to this Colab runtime, then point at the dataset
# stored there.
drive.mount('/content/drive')
file_path = '/content/drive/My Drive/model/shopping_trends_updated.csv'

# Load the CSV; the file's first column is used as the row index.
df = pd.read_csv(file_path, index_col=0)

# Sanity check before any encoding: show the column index and the first rows.
for heading, snapshot in (
    ("Columns in the DataFrame:", df.columns),
    ("Sample data before mapping:\n", df.head()),
):
    print(heading, snapshot)


Mounted at /content/drive
Columns in the DataFrame: Index(['Age', 'Gender', 'Item Purchased', 'Category', 'Purchase Amount (USD)',
       'Location', 'Size', 'Color', 'Season', 'Review Rating',
       'Subscription Status', 'Shipping Type', 'Discount Applied',
       'Promo Code Used', 'Previous Purchases', 'Payment Method',
       'Frequency of Purchases'],
      dtype='object')
Sample data before mapping:
              Age Gender Item Purchased  Category  Purchase Amount (USD)  \
Customer ID                                                               
1             55   Male         Blouse  Clothing                     53   
2             19   Male        Sweater  Clothing                     64   
3             50   Male          Jeans  Clothing                     73   
4             21   Male        Sandals  Footwear                     90   
5             45   Male         Blouse  Clothing                     49   

                  Location Size      Color  Season  Review Rat

In [None]:
# data cleaning
# Numeric codes for every categorical column that gets encoded below.
# Keys are the raw labels found in the CSV; values are the codes written back
# into the DataFrame.
mappings = {
    'Category': {'Clothing': 1, 'Footwear': 2, 'Outerwear': 3, 'Accessories': 4},
    'Size': {'S': 0, 'M': 1, 'L': 2, 'XL': 3},
    'Season': {'Spring': 1, 'Summer': 2, 'Fall': 3, 'Winter': 4},
    'Subscription Status': {'Yes': 1, 'No': 0},
    'Discount Applied': {'Yes': 1, 'No': 0},
    'Promo Code Used': {'Yes': 1, 'No': 0},
    'Gender': {'Male': 1, 'Female': 0},
    # NOTE(review): values look like "purchases per week" (Weekly = 1,
    # Annually = 1/52) -- confirm that 'Bi-Weekly' is meant as every two weeks.
    'Frequency of Purchases': {'Fortnightly': 0.5, 'Weekly': 1, 'Annually': 1/52, 'Quarterly': 1/13, 'Bi-Weekly': 0.5, 'Monthly': 1/4, 'Every 3 Months': 1/13},
    # 'Next Day Air' previously shared code 2 with 'Free Shipping' while code 3
    # was unused, which merged two distinct shipping types into one.
    'Shipping Type': {'Express': 1, 'Free Shipping': 2, 'Next Day Air': 3, 'Standard': 4, '2-Day Shipping': 5, 'Store Pickup': 6},
    # Colors are deliberately collapsed into two groups (codes 1 and 2).
    'Color': {'Gray': 1, 'Turquoise': 1, 'White': 1, 'Silver': 1, 'Pink': 1, 'Lavender': 1, 'Cyan': 1, 'Beige': 1, 'Yellow': 1, 'Magenta': 1,'Maroon': 2, 'Charcoal': 2, 'Purple': 2, 'Olive': 2, 'Gold': 2, 'Violet': 2, 'Teal': 2, 'Black': 2, 'Green': 2, 'Peach': 2, 'Red': 2, 'Brown': 2, 'Orange': 2, 'Indigo': 2, 'Blue': 2}
}

# Apply the mappings to the DataFrame
for column, mapping in mappings.items():
    if column not in df.columns:
        continue

    # Re-running this cell must not destroy the data: once a column holds the
    # numeric codes, .map() would find no matching keys and turn it into NaN.
    if df[column].dropna().isin(list(mapping.values())).all():
        print(f"Column already encoded, skipping: {column}")
        continue

    print(f"Applying mapping to column: {column}")
    encoded = df[column].map(mapping)

    # Surface labels that the mapping does not cover instead of letting them
    # silently become NaN.
    unmapped = df.loc[encoded.isna() & df[column].notna(), column].unique()
    if len(unmapped) > 0:
        print(f"WARNING: unmapped values in column {column}: {list(unmapped)}")

    df[column] = encoded

# Print the DataFrame after mapping to verify
print("Sample data after mapping:\n", df.head())

# Bin the 'Age' column into three groups: [0, 35], (35, 60], (60, inf).
# The open-ended last edge keeps the bins strictly increasing even when the
# oldest customer is 60 or younger (using df['Age'].max() as the edge would
# make pd.cut raise in that case).
age_bins = [0, 35, 60, float('inf')]
age_labels = [0, 1, 2]
df['Age Group'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, include_lowest=True)

# Bin the 'Review Rating' column into three groups: [0, 3.5], (3.5, 4.5],
# (4.5, inf). Same open-ended upper edge as above, for the same reason.
rating_bins = [0, 3.5, 4.5, float('inf')]
rating_labels = [3, 4, 5]
df['Rating Group'] = pd.cut(df['Review Rating'], bins=rating_bins, labels=rating_labels, include_lowest=True)

# Print the first few rows of the DataFrame to verify changes
print(df.head())

# Check for any missing values after mapping and binning
print(df.isnull().sum())


Applying mapping to column: Category
Applying mapping to column: Size
Applying mapping to column: Season
Applying mapping to column: Subscription Status
Applying mapping to column: Discount Applied
Applying mapping to column: Promo Code Used
Applying mapping to column: Gender
Applying mapping to column: Frequency of Purchases
Applying mapping to column: Shipping Type
Applying mapping to column: Color
Sample data after mapping:
              Age  Gender Item Purchased  Category  Purchase Amount (USD)  \
Customer ID                                                                
1             55       1         Blouse         1                     53   
2             19       1        Sweater         1                     64   
3             50       1          Jeans         1                     73   
4             21       1        Sandals         2                     90   
5             45       1         Blouse         1                     49   

                  Location  Size  C

In [None]:
from sklearn.model_selection import train_test_split

# Columns used as model inputs; 'Category' is the label being predicted.
feature_columns = [
    'Season',
    'Size',
    'Gender',
    'Age Group',
    'Frequency of Purchases',
    'Promo Code Used',
    'Rating Group',
    'Purchase Amount (USD)',
    'Subscription Status',
    'Shipping Type',
]

# Define feature variables (X) and target variable (y)
X = df[feature_columns]
y = df['Category']

# Split the dataset into training and testing sets (70% / 30%).
# The target classes are imbalanced, so stratify on y to keep the same class
# proportions in both partitions; random_state fixes the shuffle so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Print shapes to verify the split
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

print("X_train sample:\n", X_train.head())

X_train shape: (2730, 10)
X_test shape: (1170, 10)
y_train shape: (2730,)
y_test shape: (1170,)
X_train sample:
              Season  Size  Gender Age Group  Frequency of Purchases  \
Customer ID                                                           
1135              2     1       1         2                0.076923   
1152              1     1       1         1                1.000000   
2407              4     1       1         1                1.000000   
1054              1     1       1         1                0.500000   
3240              3     1       0         0                0.250000   

             Promo Code Used Rating Group  Purchase Amount (USD)  \
Customer ID                                                        
1135                       1            5                     81   
1152                       1            4                     86   
2407                       0            5                     57   
1054                       1            3        

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the multinomial logistic regression model.
# The lbfgs solver already fits a multinomial model for a multiclass target,
# so the deprecated `multi_class` argument is not passed.
model = LogisticRegression(solver='lbfgs', max_iter=1000)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# NOTE(review): in the recorded run every test row was predicted as class 1
# (the majority class). Consider feature scaling, one-hot encoding of the
# nominal columns, or class_weight='balanced' -- to be confirmed by experiment.

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Print the classification report.
# zero_division=0 reports precision as 0 for classes that were never
# predicted, instead of emitting an undefined-metric warning per average.
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Print the confusion matrix.
# Passing the fitted class labels pins the row/column order and keeps a
# row/column for every class even if one is absent from the test set.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=model.classes_))


Accuracy: 0.4478632478632479
Classification Report:
              precision    recall  f1-score   support

           1       0.45      1.00      0.62       524
           2       0.00      0.00      0.00       185
           3       0.00      0.00      0.00        94
           4       0.00      0.00      0.00       367

    accuracy                           0.45      1170
   macro avg       0.11      0.25      0.15      1170
weighted avg       0.20      0.45      0.28      1170

Confusion Matrix:
[[524   0   0   0]
 [185   0   0   0]
 [ 94   0   0   0]
 [367   0   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
