In [None]:
import pandas as pd

# --- Load the training split -------------------------------------------------
# NOTE: this is an absolute local path; adjust it if the CSV lives elsewhere.
train_path = "D:/Fourth_project/train_data.csv"
df_train = pd.read_csv(train_path)

# Dimensions as (rows, columns).
print("Shape of training data:", df_train.shape)

# Rich preview of the leading records (head() defaults to 5 rows).
print("\nFirst 5 rows:")
display(df_train.head())

# info() writes dtypes and non-null counts straight to stdout.
print("\nColumn info:")
df_train.info()

# Per-column count of missing entries (isna is the primary name of isnull).
missing_per_column = df_train.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)

# Cardinality of every column; low counts hint at categorical features.
print("\nUnique values per column:")
for column_name in df_train.columns:
    distinct_count = df_train[column_name].nunique()
    print(f"{column_name}: {distinct_count} unique values")

# describe() summarises the numeric columns only (pandas default).
print("\nNumerical column statistics:")
display(df_train.describe())

# Optional: class balance of the label.
# Replace 'target_column' with the real classification target and uncomment.
# print("\nTarget distribution:")
# print(df_train['target_column'].value_counts())


In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Never truncate columns when a wide frame is rendered.
pd.options.display.max_columns = None

# Step 2: Load Data
# Both CSVs are read from absolute local paths; edit these if the files move.
train_path = "D:/Fourth_project/train_data.csv"
test_path = "D:/Fourth_project/test_data.csv"

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Report (rows, columns) for each split.
for shape_label, split_frame in (("Train shape:", df_train), ("Test shape:", df_test)):
    print(shape_label, split_frame.shape)

# Quick look at the first rows of the training split.
train_preview = df_train.head()
display(train_preview)


In [None]:
# Step 3: Basic Inspection
# Structural overview of df_train: dtypes, missing data, cardinality, statistics.

print("\n=== Data Info ===")
df_train.info()  # prints directly; nothing to capture

print("\n=== Missing Values ===")
null_counts = df_train.isna().sum()
print(null_counts)

print("\n=== Unique Values per Column ===")
for column_label in df_train.columns:
    n_distinct = df_train[column_label].nunique()
    print(f"{column_label}: {n_distinct} unique values")

print("\n=== Summary Statistics for Numeric Columns ===")
summary_stats = df_train.describe()
display(summary_stats)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

# Load both splits from disk (absolute local paths).
train_df = pd.read_csv("D:/Fourth_project/train_data.csv")
test_df = pd.read_csv("D:/Fourth_project/test_data.csv")

# 'page' is treated as the label; every remaining column is a candidate feature.
y_train = train_df["page"]
X_train = train_df.drop(columns="page")

# The test CSV must also carry a 'page' column for the lines below to work.
y_test = test_df["page"]
X_test = test_df.drop(columns="page")


In [None]:
# Names of the engineered per-session columns, in the order they are added.
_SESSION_FEATURES = ["click_count", "is_bounce", "is_last_page"]


def _add_session_features(frame):
    """Add the three per-session columns to `frame` in place.

    click_count  -- number of rows that share the row's session_id
    is_bounce    -- 1 when click_count equals 1, otherwise 0
    is_last_page -- 1 when the row's 'order' equals the largest 'order'
                    within its session, otherwise 0
    """
    session_sizes = frame.groupby('session_id')['session_id'].transform('count')
    frame['click_count'] = session_sizes

    frame['is_bounce'] = (frame['click_count'] == 1).astype(int)

    last_order = frame.groupby('session_id')['order'].transform('max')
    frame['is_last_page'] = (frame['order'] == last_order).astype(int)


# Each split is processed separately, so features come only from its own rows.
for split_frame in (train_df, test_df):
    _add_session_features(split_frame)

# Copy the engineered columns onto the feature matrices (aligned on index).
X_train = X_train.assign(**{name: train_df[name] for name in _SESSION_FEATURES})
X_test = X_test.assign(**{name: test_df[name] for name in _SESSION_FEATURES})


In [None]:
# 4. Define columns for preprocessing
# =========================

# Columns that will be standard-scaled.
numeric_cols = [
    "year", "month", "day", "order", "session_id", "price",
    "click_count", "is_bounce", "is_last_page",
]

# Columns that will be one-hot encoded. 'page2_clothing_model' is left out of
# both lists on purpose: the original author flagged it as a leakage column.
categorical_cols = [
    "page1_main_category", "country", "colour",
    "location", "model_photography", "price_2",
]

# Per-group transformers; categories unseen during fit are ignored at transform.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Route each column group to its transformer.
column_transformers = [
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Basic info
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
# info() prints its own report and returns None, so wrapping it in print()
# would emit a stray "None" line after the report.
train_df.info()
print(train_df.describe())

# Target distribution
plt.figure(figsize=(6,4))
sns.countplot(x='page', data=train_df)
plt.title('Target Variable Distribution (page)')
plt.show()

# Numeric feature distributions.
# Select every numeric dtype rather than only int64/float64: columns built
# with .astype(int) are int32 on platforms where the default integer is
# 32-bit, and would otherwise be silently left out of the plots below.
num_cols = X_train.select_dtypes(include='number').columns.tolist()
train_df[num_cols].hist(figsize=(15,10), bins=20)
plt.suptitle('Numeric Feature Distributions')
plt.show()

# Correlation heatmap of the numeric features together with the target.
plt.figure(figsize=(10,8))
sns.heatmap(train_df[num_cols + ['page']].corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()


In [None]:
from imblearn.pipeline import Pipeline  # <-- use this
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

# Model inputs: the engineered feature frames without 'page2_clothing_model'.
X_train_model = X_train.drop(columns="page2_clothing_model")
X_test_model = X_test.drop(columns="page2_clothing_model")

# Preprocess -> oversample with SMOTE -> class-weighted logistic regression.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("classifier", LogisticRegression(max_iter=1000, class_weight='balanced')),
]
clf = Pipeline(steps=pipeline_steps)

# Train on the training split.
clf.fit(X_train_model, y_train)

# Score both splits with the fitted pipeline.
y_train_pred = clf.predict(X_train_model)
y_test_pred = clf.predict(X_test_model)

from sklearn.metrics import accuracy_score, classification_report
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print(classification_report(y_test, y_test_pred))



In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Read data
train_df = pd.read_csv("D:/Fourth_project/train_data.csv")
test_df = pd.read_csv("D:/Fourth_project/test_data.csv")

X_train = train_df.drop("page", axis=1)
y_train = train_df["page"]
X_test = test_df.drop("page", axis=1)
y_test = test_df["page"]

# Column types
# NOTE(review): 'session_id' is scaled as a numeric feature here; confirm
# that an identifier is really meant to be a model input.
numeric_cols = ["year", "month", "day", "order", "session_id", "price"]
categorical_cols = ["page1_main_category", "country", "colour",
                    "location", "model_photography", "price_2"]

# Preprocessing: scale numeric columns, one-hot encode categorical ones.
# Columns not listed above are dropped by ColumnTransformer's default
# remainder="drop".
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)

# Seed for the stochastic estimators so repeated runs give the same scores.
RANDOM_STATE = 42


def _fit_and_report(model_name, pipeline, param_grid,
                    X_fit, y_fit, X_eval, y_eval):
    """Grid-search `pipeline`, print its best params and accuracies, return the search.

    Parameters
    ----------
    model_name : str
        Human-readable name used in the "Best ... params:" line.
    pipeline : estimator
        Pipeline whose hyper-parameters are searched.
    param_grid : dict
        Grid passed to GridSearchCV.
    X_fit, y_fit
        Data used for the 2-fold cross-validated search and the train score.
    X_eval, y_eval
        Held-out data used for the test score.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search = GridSearchCV(pipeline, param_grid, cv=2, n_jobs=-1)
    search.fit(X_fit, y_fit)

    print(f"Best {model_name} params:", search.best_params_)
    print("Train Accuracy:", search.score(X_fit, y_fit))
    print("Test Accuracy:", search.score(X_eval, y_eval))
    return search


# Example: Logistic Regression pipeline
lr_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=500))
])

lr_param_grid = {
    "classifier__C": [0.1, 1],  # smaller grid for speed
    "classifier__solver": ["lbfgs"]
}

lr_search = _fit_and_report("Logistic Regression", lr_pipeline, lr_param_grid,
                            X_train, y_train, X_test, y_test)

# Decision Tree pipeline (seeded: tie-breaking between splits is random)
dt_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", DecisionTreeClassifier(random_state=RANDOM_STATE))
])

dt_param_grid = {
    "classifier__max_depth": [5, 10],  # smaller depth
    "classifier__min_samples_split": [5, 10]
}

dt_search = _fit_and_report("Decision Tree", dt_pipeline, dt_param_grid,
                            X_train, y_train, X_test, y_test)

# Random Forest pipeline (seeded: bootstrap and feature sampling are random)
rf_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=50,  # smaller forest
                                          random_state=RANDOM_STATE))
])

rf_param_grid = {
    "classifier__max_depth": [5, 10],
    "classifier__min_samples_split": [5, 10]
}

rf_search = _fit_and_report("Random Forest", rf_pipeline, rf_param_grid,
                            X_train, y_train, X_test, y_test)


In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# Load both splits again so this cell does not depend on earlier feature work.
train_df = pd.read_csv("D:/Fourth_project/train_data.csv")
test_df = pd.read_csv("D:/Fourth_project/test_data.csv")

# Label is 'page'; everything else is a candidate feature.
y_train = train_df["page"]
X_train = train_df.drop(columns="page")
y_test = test_df["page"]
X_test = test_df.drop(columns="page")

# Columns routed to scaling vs. one-hot encoding.
numeric_cols = ["year", "month", "day", "order", "session_id", "price"]
categorical_cols = [
    "page1_main_category", "country", "colour",
    "location", "model_photography", "price_2",
]

# Preprocessing: categories unseen during fit are ignored at transform time.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
column_transformers = [
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# The labels are shifted down by one before being handed to XGBoost.
# NOTE(review): presumably 'page' starts at 1 and is contiguous - confirm.
LABEL_OFFSET = 1
y_train_xgb = y_train - LABEL_OFFSET
y_test_xgb = y_test - LABEL_OFFSET

# Preprocess -> SMOTE oversampling -> XGBoost classifier.
xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
xgb_pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("classifier", xgb_classifier),
])

# Small grid: 2 x 2 x 2 combinations, each evaluated with 2-fold CV.
param_grid = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [3, 5],
    "classifier__learning_rate": [0.1, 0.2],
}

from sklearn.model_selection import GridSearchCV
xgb_search = GridSearchCV(xgb_pipeline, param_grid, cv=2, n_jobs=-1)
xgb_search.fit(X_train, y_train_xgb)

# Undo the label shift so predictions are comparable with the original 'page'.
y_train_pred = xgb_search.predict(X_train) + LABEL_OFFSET
y_test_pred = xgb_search.predict(X_test) + LABEL_OFFSET

# Evaluation against the unshifted labels.
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Best XGBoost params:", xgb_search.best_params_)
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))


In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans

# -----------------------------
# Load or train your model here
# For demo, we'll train a simple classifier on uploaded CSV
# -----------------------------
# A fitted estimator is a resource, not data: st.cache_resource hands back the
# same object on every rerun instead of pickling and copying it the way
# st.cache_data does.
@st.cache_resource
def train_model(df):
    """Fit a preprocessing + logistic-regression pipeline on `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the target column "page" plus every column named in
        `numeric_cols` and `categorical_cols` below. Any other column is
        dropped by the ColumnTransformer (default remainder="drop").

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline; call `.predict(X)` on frames with the same
        feature columns.

    Raises
    ------
    KeyError
        If "page" is missing from `df`.
    """
    X = df.drop("page", axis=1)
    y = df["page"]

    numeric_cols = ["year", "month", "day", "order", "session_id", "price"]
    categorical_cols = ["page1_main_category", "country", "colour",
                        "location", "model_photography", "price_2"]

    numeric_transformer = StandardScaler()
    # Categories not seen during fit are encoded as all-zero rows at predict time.
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_cols),
            ("cat", categorical_transformer, categorical_cols)
        ]
    )

    clf = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=500))
    ])
    clf.fit(X, y)
    return clf

# -----------------------------
# Streamlit App
# -----------------------------
st.title("E-Commerce User Behavior Predictor")

# No model exists until a CSV containing a 'page' column has been uploaded.
# The manual-prediction section checks this sentinel instead of assuming that
# `model` is bound, which previously raised NameError for CSVs without 'page'.
model = None

# Upload CSV
uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
if uploaded_file:
    df = pd.read_csv(uploaded_file)
    st.write("Uploaded Data:")
    st.dataframe(df.head())

    if "page" in df.columns:
        model = train_model(df)
        st.success("Model trained on uploaded data!")

        # Predict using entire dataset
        if st.button("Predict on uploaded data"):
            X = df.drop("page", axis=1)
            preds = model.predict(X)
            df["Predicted Page"] = preds
            st.write(df.head())
            st.write("Prediction Distribution:")
            st.bar_chart(df["Predicted Page"].value_counts())

    # Optional Clustering
    if st.checkbox("Show K-Means Clustering (2 clusters)"):
        cluster_cols = ["order", "session_id", "price"]  # numeric columns for clustering
        # Guard against uploads lacking a clustering column; indexing would
        # otherwise abort the whole script run with a KeyError.
        missing_cluster_cols = [c for c in cluster_cols if c not in df.columns]
        if missing_cluster_cols:
            st.warning(f"Cannot cluster: missing column(s) {missing_cluster_cols}")
        else:
            kmeans = KMeans(n_clusters=2, random_state=42)
            df['Cluster'] = kmeans.fit_predict(df[cluster_cols])
            st.write(df.head())
            st.bar_chart(df['Cluster'].value_counts())

    # Visualizations
    if st.checkbox("Show Feature Visualizations"):
        num_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
        st.write("Histograms for numeric features:")
        for col in num_cols:
            fig, ax = plt.subplots()
            sns.histplot(df[col], bins=20, kde=False, ax=ax)
            ax.set_title(f"Distribution of {col}")
            st.pyplot(fig)
            # pyplot keeps every figure alive until it is closed; release it
            # so repeated reruns do not accumulate open figures.
            plt.close(fig)

        cat_cols = df.select_dtypes(include=["object"]).columns.tolist()
        st.write("Bar charts for categorical features:")
        for col in cat_cols:
            fig, ax = plt.subplots()
            df[col].value_counts().plot(kind='bar', ax=ax)
            ax.set_title(f"Distribution of {col}")
            st.pyplot(fig)
            plt.close(fig)

# Manual Input Prediction
st.header("Manual Input Prediction")
st.write("Enter feature values to predict page:")
try:
    year = st.number_input("Year", value=2008)
    month = st.number_input("Month", value=5)
    day = st.number_input("Day", value=14)
    order = st.number_input("Order", value=1)
    session_id = st.number_input("Session ID", value=1)
    price = st.number_input("Price", value=50)
    page1_main_category = st.number_input("Page1 Main Category", value=1)
    country = st.number_input("Country", value=29)
    colour = st.number_input("Colour", value=5)
    location = st.number_input("Location", value=3)
    model_photography = st.number_input("Model Photography", value=1)
    price_2 = st.number_input("Price 2", value=1)

    # Single-row frame whose column names match the features used in training.
    input_df = pd.DataFrame([{
        "year": year,
        "month": month,
        "day": day,
        "order": order,
        "session_id": session_id,
        "price": price,
        "page1_main_category": page1_main_category,
        "country": country,
        "colour": colour,
        "location": location,
        "model_photography": model_photography,
        "price_2": price_2
    }])

    if st.button("Predict Page for Manual Input"):
        if model is not None:
            pred = model.predict(input_df)
            st.success(f"Predicted Page: {pred[0]}")
        elif uploaded_file:
            # A file was uploaded but could not be used for training.
            st.warning("The uploaded CSV has no 'page' column, so no model was trained.")
        else:
            st.warning("Upload CSV first to train the model!")

except Exception as e:
    # Best-effort: show the problem in the app instead of a raw traceback.
    st.error(f"Error in manual input: {e}")
