# Online Shoppers Intention Analysis

## Project Overview
This project analyzes the Online Shoppers Purchasing Intention Dataset to predict whether a visitor will make a purchase ('Revenue').

**Goals:**
1. Data Preprocessing (Cleaning, Encoding, Scaling)
2. Exploratory Data Analysis (EDA)
3. Model Building (Logistic Regression, Random Forest)
4. Model Evaluation & Comparison

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Display settings: do not truncate the column list when a DataFrame is shown
pd.options.display.max_columns = None

## 1. Data Loading and Cleaning

In [None]:
# Read the raw CSV and report its size before any cleaning
df = pd.read_csv('online_shoppers_intention.csv')
print(f"Initial Shape: {df.shape}")

# Report how many rows are exact duplicates, then remove duplicate rows
# and any row that contains a missing value
print(f"Duplicates: {df.duplicated().sum()}")
df = df.drop_duplicates().dropna()
print(f"Shape after cleaning: {df.shape}")

## 2. Exploratory Data Analysis (EDA)
Visualizing key feature distributions and correlations.

In [None]:
# Correlation heatmap over the float64/int64 columns only
numeric_df = df.select_dtypes(include=['float64', 'int64'])
correlations = numeric_df.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Box plot of PageValues grouped by the Revenue label
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='Revenue', y='PageValues', ax=ax)
ax.set_title('PageValues vs Revenue')
plt.show()

## 3. Data Preprocessing
Encoding categorical variables and scaling numerical features.

In [None]:
# Encoding: cast the boolean flags to 0/1 and one-hot encode the two
# string-valued columns (drop_first avoids a redundant dummy per feature)
df['Weekend'] = df['Weekend'].astype(int)
df['Revenue'] = df['Revenue'].astype(int)
df = pd.get_dummies(df, columns=['Month', 'VisitorType'], drop_first=True)

# Columns that are standardized; all remaining columns are passed through as-is
numerical_cols = ['Administrative', 'Administrative_Duration', 'Informational',
                  'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
                  'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']

# Splitting: done BEFORE scaling so the held-out rows cannot influence the
# scaler's statistics.
# NOTE(review): if Revenue is imbalanced, consider passing stratify=y here.
X = df.drop('Revenue', axis=1)
y = df['Revenue']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling: fit on the training rows only, then apply the same transformation
# to the test rows. Fitting on the full DataFrame would leak test-set means
# and variances into the features the models are trained on.
# Explicit copies keep the assignments below from touching `X`/`df` and avoid
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

print(f"Training Set: {X_train.shape}")
print(f"Testing Set: {X_test.shape}")

## 4. Model Building and Evaluation
Comparing Logistic Regression and Random Forest on the held-out test set.

In [None]:
def evaluate_model(model, name):
    """Fit `model` on the training split and report its test-split performance.

    Reads the module-level `X_train`, `y_train`, `X_test` and `y_test`.
    Prints the accuracy, F1-score and classification report, then shows the
    confusion matrix as a heatmap.

    Args:
        model: Estimator exposing `fit(X, y)` and `predict(X)`.
        name: Label used in the printed header and in the plot title.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Text summary
    print(f"--- {name} ---")
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy:.4f}")
    f1 = f1_score(y_test, predictions)
    print(f"F1-Score: {f1:.4f}")
    report = classification_report(y_test, predictions)
    print(report)

    # Confusion matrix rendered as an annotated heatmap of integer counts
    matrix = confusion_matrix(y_test, predictions)
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix: {name}')
    plt.show()

# Evaluate each candidate classifier in turn: Logistic Regression, then Random Forest
candidates = [
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
]
for label, estimator in candidates:
    evaluate_model(estimator, label)