In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Read the raw activity CSV from disk into a DataFrame
data_path = 'activity_data.csv'
data = pd.read_csv(filepath_or_buffer=data_path)

# Preprocess the data
def preprocess_data(data):
    """Encode, split and scale the raw activity table for modelling.

    The input frame is copied before any column is rewritten, so the
    caller's DataFrame is left exactly as it was passed in and the
    function can safely be called again on the same object.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table. Must contain the columns 'Month', 'VisitorType',
        'Weekend' and 'Revenue'. Every other column is handed to the
        scaler as a feature (assumed to be numeric already -- TODO confirm).

    Returns
    -------
    X : array-like
        Whatever ``StandardScaler().fit_transform`` returns for the
        feature columns (all columns except 'Revenue').
    y : pandas.Series
        The 'Revenue' column cast to int.
    """
    # Work on a copy: the column assignments below would otherwise
    # overwrite the caller's frame in place.
    frame = data.copy()

    # Convert categorical columns to integer codes (fresh encoder per column)
    for column in ['Month', 'VisitorType']:
        frame[column] = LabelEncoder().fit_transform(frame[column])

    # Convert boolean columns to numeric
    frame['Weekend'] = frame['Weekend'].astype(int)
    frame['Revenue'] = frame['Revenue'].astype(int)

    # Split data into features and target
    X = frame.drop('Revenue', axis=1)
    y = frame['Revenue']

    # Scale the features
    # NOTE(review): the scaler is fitted on every row passed in. If the
    # caller splits into train/test afterwards, test-row statistics leak
    # into the scaling; consider fitting on the training rows only.
    X = StandardScaler().fit_transform(X)

    return X, y

# Build the scaled feature matrix and target, then hold out 20% of the rows
# for testing (fixed seed so the split is repeatable).
X, y = preprocess_data(data)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [2]:
from sklearn.linear_model import LogisticRegression

# Fit the logistic-regression classifier (iteration cap raised to 3000);
# fit() returns the estimator itself, so construction and training chain.
logistic_model = LogisticRegression(max_iter=3000).fit(X_train, y_train)

# Predict labels for the held-out test rows
logistic_predictions = logistic_model.predict(X_test)


In [3]:
from sklearn.tree import DecisionTreeClassifier

# Fit a single decision tree with a fixed seed; fit() returns the estimator
# itself, so construction and training chain.
tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test rows
tree_predictions = tree_model.predict(X_test)

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with a fixed seed; fit() returns the estimator itself,
# so construction and training chain.
forest_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test rows
forest_predictions = forest_model.predict(X_test)

In [5]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define an evaluation function
def evaluate_model(y_true, y_pred, model_name):
    """Print accuracy, confusion matrix and classification report for one model.

    Parameters
    ----------
    y_true : array-like
        Reference labels, forwarded unchanged to each metric function.
    y_pred : array-like
        Predicted labels, forwarded unchanged to each metric function.
    model_name : str
        Label interpolated into the header line of the printed report.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print(f"Evaluation of {model_name}:")

    # Each heading is paired with the metric printed right after it; the
    # metrics are evaluated one at a time, in this order, just before printing.
    sections = (
        ("Accuracy:", accuracy_score),
        ("Confusion Matrix:\n", confusion_matrix),
        ("Classification Report:\n", classification_report),
    )
    for heading, metric in sections:
        print(heading, metric(y_true, y_pred))

    # Trailing blank lines separate consecutive reports
    print("\n")

# Report every fitted model against the same held-out labels, in the order
# Logistic Regression -> Decision Tree -> Random Forest.
for predictions, label in (
    (logistic_predictions, "Logistic Regression"),
    (tree_predictions, "Decision Tree"),
    (forest_predictions, "Random Forest"),
):
    evaluate_model(y_test, predictions, label)

Evaluation of Logistic Regression:
Accuracy: 0.8690186536901865
Confusion Matrix:
 [[2002   53]
 [ 270  141]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.97      0.93      2055
           1       0.73      0.34      0.47       411

    accuracy                           0.87      2466
   macro avg       0.80      0.66      0.70      2466
weighted avg       0.86      0.87      0.85      2466



Evaluation of Decision Tree:
Accuracy: 0.8572587185725872
Confusion Matrix:
 [[1879  176]
 [ 176  235]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.91      0.91      2055
           1       0.57      0.57      0.57       411

    accuracy                           0.86      2466
   macro avg       0.74      0.74      0.74      2466
weighted avg       0.86      0.86      0.86      2466



Evaluation of Random Forest:
Accuracy: 0.8957826439578265
Confusion Matrix:
 [[1

In [6]:
# import matplotlib.pyplot as plt
# import numpy as np
# # Feature importance for Random Forest
# importances = best_forest.feature_importances_
# indices = np.argsort(importances)[::-1]
# features = X.columns

# # Plot feature importances
# plt.figure(figsize=(10, 6))
# plt.title("Feature Importances")
# plt.bar(range(X.shape[1]), importances[indices], align="center")
# plt.xticks(range(X.shape[1]), features[indices], rotation=90)
# plt.show()

In [7]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder, StandardScaler

# # Load the data
# data_path = 'activity_data.csv'
# data = pd.read_csv(data_path)

# # Preprocess the data
# def preprocess_data(data):
#     # Convert categorical columns to numeric
#     label_encoders = {}
#     for column in ['Month', 'VisitorType']:
#         label_encoders[column] = LabelEncoder()
#         data[column] = label_encoders[column].fit_transform(data[column])
    
#     # Convert boolean columns to numeric
#     data['Weekend'] = data['Weekend'].astype(int)
#     data['Revenue'] = data['Revenue'].astype(int)
    
#     # Split data into features and target
#     X = data.drop('Revenue', axis=1)
#     y = data['Revenue']
    
#     # Scale the features
#     scaler = StandardScaler()
#     X_scaled = scaler.fit_transform(X)
    
#     return X, X_scaled, y

# X, X_scaled, y = preprocess_data(data)
# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [8]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.tree import DecisionTreeClassifier

# # Define parameter grid
# param_grid_tree = {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 10, 20]}

# # Decision Tree model
# tree = DecisionTreeClassifier(random_state=42)

# # Hyperparameter tuning using GridSearchCV
# grid_tree = GridSearchCV(tree, param_grid_tree, cv=5, scoring='accuracy')
# grid_tree.fit(X_train, y_train)

# # Best Decision Tree model
# best_tree = grid_tree.best_estimator_

# # Make predictions
# tree_predictions = best_tree.predict(X_test)

In [9]:
# from sklearn.ensemble import RandomForestClassifier

# # Define parameter grid
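# # NOTE: 'auto' was removed from max_features in scikit-learn 1.3 (for classifiers it meant 'sqrt'); drop it before re-enabling this cell
# param_grid_forest = {'n_estimators': [100, 200, 300], 'max_features': ['auto', 'sqrt', 'log2']}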

# # Random Forest model
# forest = RandomForestClassifier(random_state=42)

# # Hyperparameter tuning using GridSearchCV
# grid_forest = GridSearchCV(forest, param_grid_forest, cv=5, scoring='accuracy')
# grid_forest.fit(X_train, y_train)

# # Best Random Forest model
# best_forest = grid_forest.best_estimator_

# # Make predictions
# forest_predictions = best_forest.predict(X_test)

In [10]:
# from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# # Define an evaluation function
# def evaluate_model(y_true, y_pred, model_name):
#     print(f"Evaluation of {model_name}:")
#     print("Accuracy:", accuracy_score(y_true, y_pred))
#     print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
#     print("Classification Report:\n", classification_report(y_true, y_pred))
#     print("\n")

# # Evaluate Logistic Regression model
# evaluate_model(y_test, logistic_predictions, "Tuned Logistic Regression")

# # Evaluate Decision Tree model
# evaluate_model(y_test, tree_predictions, "Tuned Decision Tree")

# # Evaluate Random Forest model
# # evaluate_model(y_test, forest_predictions, "Tuned Random Forest")
