In [13]:
import re
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [14]:
# Load the raw server log export; rows pandas cannot parse are skipped
# (on_bad_lines='skip') rather than aborting the read.
file = "data/raw_logs/final_server_logs11.csv"
df = pd.read_csv(
    file,
    on_bad_lines='skip',
)

In [15]:
# Impute missing values.
#
# 'Response Time (ms)' is deliberately skipped: the outlier-handling cell
# converts it to numeric and fills gaps with the column median. Filling it
# here first (0 for a numeric column) would turn missing measurements into
# fake 0 ms requests, make the median fill a no-op, and skew the quartiles
# and cap computed from that column.
deferred_columns = {'Response Time (ms)'}

for column in df.columns:
    if column in deferred_columns:
        continue
    if pd.api.types.is_numeric_dtype(df[column]):
        # Numerical columns: a missing value is treated as zero
        df[column] = df[column].fillna(0)
    else:
        # Text-like columns (object / string dtype): use the 'None' placeholder.
        # Checking "is numeric" instead of "dtype == 'object'" keeps string-typed
        # columns out of the numeric branch.
        df[column] = df[column].fillna('None')

In [16]:
# Fixing outliers in the Response Time column
rt_col = 'Response Time (ms)'

# Coerce to numeric (unparseable entries become NaN), then fill gaps with the median
response_ms = pd.to_numeric(df[rt_col], errors='coerce')
response_ms = response_ms.fillna(response_ms.median())

# Interquartile range and the 1.5 * IQR fences
Q1 = response_ms.quantile(0.25)
Q3 = response_ms.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# 95th percentile, used as the replacement for high outliers
cap_value = response_ms.quantile(0.95)


def _replace_outlier(value):
    """Return cap_value above the upper fence, Q1 below the lower fence, else the value."""
    if value > upper_bound:
        return cap_value
    if value < lower_bound:
        return Q1
    return value


df[rt_col] = response_ms.apply(_replace_outlier)

In [17]:
# Parse the timestamp once, then derive calendar features from it
timestamps = pd.to_datetime(df['Timestamp'])
df['Timestamp'] = timestamps

for part in ('hour', 'day_of_week', 'month', 'year'):
    df[part] = getattr(timestamps.dt, part)

# day_of_week is 0 = Monday ... 6 = Sunday, so 5 and 6 are the weekend
df['is_weekend'] = (df['day_of_week'] >= 5).astype('int64')

In [18]:
# Isolate referrer names: keep the lowercase label between "www." and ".com";
# every value that does not match that pattern is labelled "direct".
referrer_pattern = r"https?://www\.([a-z]+)\.com"
referrer_names = df['Referrer'].str.extract(referrer_pattern, expand=False)
df['Referrer'] = referrer_names.fillna("direct")

In [19]:
# Expose Price under the name Revenue (this adds a column; Price itself is kept)
df = df.assign(Revenue=df["Price"])


In [20]:
# Fill remaining gaps in the sales-related columns:
# text columns get the "None" placeholder, Revenue gets 0.
sales_cols = ['Sales Agent', 'Product']
df[sales_cols] = df[sales_cols].fillna("None")

df['Revenue'] = df['Revenue'].fillna(0)

In [21]:
def categorize_url(method, path):
    """Return a coarse request-type label for a URL path.

    Args:
        method: HTTP method of the request. Accepted for call-site
            compatibility; the current rules do not inspect it.
        path: URL path. It is converted with ``str()`` and lower-cased, so
            matching is case-insensitive and non-string values are tolerated.

    Returns:
        One of "Demo Request", "Product Purchase", "Product Feedback",
        "Product View", "Sales Conversion", "Marketing Content", "Support",
        "Company Info", "Static Asset", "Homepage" or "Other".

    Rules are checked in order and the first match wins; all but the
    homepage rule are substring matches.
    """
    normalized = str(path).lower()

    # Product pages take precedence over every other rule
    if '/product/' in normalized:
        product_actions = (
            ('schedule-demo', "Demo Request"),
            ('request.php', "Product Purchase"),
            ('feedback.php', "Product Feedback"),
        )
        for marker, label in product_actions:
            if marker in normalized:
                return label
        return "Product View"

    # (substrings, label) pairs, evaluated top to bottom
    substring_rules = (
        (('/buy-', '/checkout', '/request-quote'), "Sales Conversion"),
        (('/promo-', '/special-offers', '/newsletter'), "Marketing Content"),
        (('/customer-support', '/faq', '/bug-tickets'), "Support"),
        (('/about-us', '/contact-sales'), "Company Info"),
        (('.jpg', '.png', '.css', '.js', '/images/'), "Static Asset"),
    )
    for markers, label in substring_rules:
        if any(marker in normalized for marker in markers):
            return label

    # Homepage requires an exact match on the whole path
    if normalized in ("/", "/index.html", "/home"):
        return "Homepage"

    return "Other"

# Label every row with its request type (expects 'Method' and 'URL' columns in df)
df['Request Type'] = [
    categorize_url(method, url)
    for method, url in zip(df['Method'], df['URL'])
]

In [22]:
# Rows per IP address: one entry per distinct IP, value = how many log rows it has
ip_counts = df["IP Address"].value_counts()

# Number of distinct IP addresses
unique_ips = df["IP Address"].nunique()

# Attach each row's per-IP visit count to the DataFrame
df["visit_count"] = df["IP Address"].map(ip_counts)

# An IP seen exactly once is "New"; everything else is "Returning"
df["visitor_type"] = [
    "New" if count == 1 else "Returning"
    for count in df["visit_count"]
]

# Distinct IPs in each visitor group
is_new = df["visitor_type"] == "New"
new_visitors = df.loc[is_new, "IP Address"].nunique()
returning_visitors = df.loc[~is_new, "IP Address"].nunique()

# Print results
summary_lines = (
    ("Total Unique IPs (counted only once):", unique_ips),
    ("New Visitors:", new_visitors),
    ("Returning Visitors:", returning_visitors),
    ("Total Visits (All IPs):", ip_counts.sum()),
)
for label, value in summary_lines:
    print(label, value)


Total Unique IPs (counted only once): 39104
New Visitors: 36649
Returning Visitors: 2455
Total Visits (All IPs): 41666


In [23]:
# Display the column index of the processed frame (shown as the cell's output)
df.columns

Index(['Timestamp', 'IP Address', 'Session ID', 'Country', 'Method', 'URL',
       'Status Code', 'Response Time (ms)', 'Sales Agent', 'Referrer',
       'Product', 'Price', 'IP_Session', 'viewed_pricing_after_demo',
       'pages_after_demo', 'sessions_after_demo', 'time_to_purchase', 'hour',
       'day_of_week', 'month', 'year', 'is_weekend', 'Revenue', 'Request Type',
       'visit_count', 'visitor_type'],
      dtype='object')

In [24]:
# Write the cleaned frame to disk without the index column.
# The target directory is created first (no-op if it already exists) so the
# export does not fail on a checkout where data/cleaned_logs is missing.
file='data/cleaned_logs/cleaned10.csv'
Path(file).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(file, index=False)

print(f"CSV file {file} has been saved successfully.")

CSV file data/cleaned_logs/cleaned10.csv has been saved successfully.
