In [None]:
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "# Exploratory analysis and preprocessing of the dynamic pricing dataset.\n",
    "# This cell reads the raw CSV, prints summaries, draws plots, compares two\n",
    "# categorical encodings, and writes the one-hot encoded frame back to disk.\n",
    "\n",
    "# Paths and the target column name, defined once so the code and the\n",
    "# messages printed below cannot drift apart.\n",
    "RAW_DATA_PATH = '../data/dynamic_pricing_dataset.csv'\n",
    "PREPROCESSED_DATA_PATH = '../data/preprocessed_data.csv'\n",
    "TARGET_COL = 'Dynamic_Premium'\n",
    "\n",
    "# Load data\n",
    "df = pd.read_csv(RAW_DATA_PATH)\n",
    "\n",
    "# Display basic information\n",
    "print(\"Dataset Info:\")\n",
    "df.info()\n",
    "\n",
    "# Check for missing values\n",
    "print(\"\\nMissing Values:\")\n",
    "print(df.isnull().sum())\n",
    "\n",
    "# Display summary statistics\n",
    "print(\"\\nSummary Statistics:\")\n",
    "print(df.describe())\n",
    "\n",
    "# Display categorical columns info\n",
    "categorical_cols = ['Vehicle_Type', 'Safety_Feature', 'Weather_Condition', 'Policy_Type']\n",
    "print(\"\\nCategorical Columns Value Counts:\")\n",
    "for col in categorical_cols:\n",
    "    print(f\"\\n{col}:\")\n",
    "    print(df[col].value_counts())\n",
    "\n",
    "# Correlation heatmap for numerical columns.\n",
    "# The selection gets its own name so it is not confused with the shorter\n",
    "# `numerical_cols` list used by the outlier analysis further down.\n",
    "heatmap_cols = df.select_dtypes(include=['int64', 'float64']).columns\n",
    "plt.figure(figsize=(12, 8))\n",
    "sns.heatmap(df[heatmap_cols].corr(), annot=True, cmap='coolwarm')\n",
    "plt.title('Correlation Heatmap - Numerical Features')\n",
    "plt.show()\n",
    "\n",
    "# Distribution of target variable\n",
    "plt.figure(figsize=(10, 6))\n",
    "sns.histplot(df[TARGET_COL])\n",
    "plt.title('Distribution of Dynamic Premium')\n",
    "plt.show()\n",
    "\n",
    "# Box plots of the target for each categorical variable\n",
    "for col in categorical_cols:\n",
    "    plt.figure(figsize=(12, 6))\n",
    "    sns.boxplot(x=col, y=TARGET_COL, data=df)\n",
    "    plt.xticks(rotation=45)\n",
    "    plt.title(f'Dynamic Premium by {col}')\n",
    "    plt.show()\n",
    "\n",
    "# Preprocessing Steps\n",
    "print(\"\\nPreprocessing Steps:\")\n",
    "\n",
    "# 1. Handle categorical variables\n",
    "print(\"\\n1. Categorical Variables Encoding:\")\n",
    "df_processed = df.copy()\n",
    "\n",
    "# Option 1: Label Encoding\n",
    "# Fit a separate encoder per column and keep every one of them. A single\n",
    "# shared encoder is refit on each iteration, so only the last column's\n",
    "# category-to-code mapping would still be available afterwards.\n",
    "label_encoders = {}\n",
    "df_label_encoded = df_processed.copy()\n",
    "for col in categorical_cols:\n",
    "    le = LabelEncoder()\n",
    "    df_label_encoded[col] = le.fit_transform(df_label_encoded[col])\n",
    "    label_encoders[col] = le\n",
    "print(\"\\nLabel Encoded Data Sample:\")\n",
    "print(df_label_encoded[categorical_cols].head())\n",
    "\n",
    "# Option 2: One-Hot Encoding\n",
    "df_onehot = pd.get_dummies(df_processed, columns=categorical_cols)\n",
    "print(\"\\nOne-Hot Encoded Features:\")\n",
    "print(f\"Original features: {df.shape[1]}\")\n",
    "print(f\"After one-hot encoding: {df_onehot.shape[1]}\")\n",
    "\n",
    "# 2. Check for outliers in numerical columns (1.5 * IQR rule)\n",
    "print(\"\\n2. Outlier Analysis:\")\n",
    "numerical_cols = ['Speed_km', 'Distance_Location', 'Vehicle_Year']\n",
    "for col in numerical_cols:\n",
    "    Q1 = df[col].quantile(0.25)\n",
    "    Q3 = df[col].quantile(0.75)\n",
    "    IQR = Q3 - Q1\n",
    "    lower_fence = Q1 - 1.5*IQR\n",
    "    upper_fence = Q3 + 1.5*IQR\n",
    "    # Explicit comparisons on purpose: rows with a missing value compare\n",
    "    # False on both sides and are therefore not counted as outliers.\n",
    "    outliers = df[(df[col] < lower_fence) | (df[col] > upper_fence)]\n",
    "    print(f\"\\n{col} outliers: {len(outliers)}\")\n",
    "\n",
    "# 3. Feature scaling analysis\n",
    "print(\"\\n3. Feature Scaling Analysis:\")\n",
    "print(\"Numerical features statistics:\")\n",
    "print(df[numerical_cols].describe())\n",
    "\n",
    "# 4. Feature importance with correlation to target\n",
    "print(\"\\n4. Correlation with Target Variable:\")\n",
    "# numeric_only=True restricts the correlation to numeric columns. Since\n",
    "# pandas 2.0 the default is False, and corr() raises if the frame still\n",
    "# holds a non-numeric column outside `categorical_cols`.\n",
    "# NOTE(review): the numeric_only keyword needs pandas >= 1.5 - confirm the pinned version.\n",
    "correlations = df_label_encoded.corr(numeric_only=True)[TARGET_COL].sort_values(ascending=False)\n",
    "print(correlations)\n",
    "\n",
    "# Save preprocessed data (the one-hot encoded variant)\n",
    "df_onehot.to_csv(PREPROCESSED_DATA_PATH, index=False)\n",
    "print(f\"\\nPreprocessed data saved to '{PREPROCESSED_DATA_PATH}'\")"
   ]
  }
 ]
}