In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample
from sklearn.decomposition import PCA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import plotly.express as px
import dash
from dash import dcc, html
from dash.dependencies import Input, Output

# Load the dataset
# The file is read relative to the notebook's working directory and its
# fields are split on ';' instead of pandas' default ','.
file_path = "ar41_for_ulb_mini.csv"
df = pd.read_csv(file_path, delimiter=';')


In [None]:
#Exploratory Data Analysis (EDA)
# Display the first few rows of the dataset
print("Dataset Overview:")
print(df.head())

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\nDataset Information:")
df.info()

# Summary statistics for numerical columns
print("\nSummary Statistics:")
print(df.describe())

# Check for missing values (count of nulls per column)
print("\nMissing Values:")
print(df.isnull().sum())

# Explore unique values in categorical (object-dtype) columns
print("\nUnique Values:")
for column in df.select_dtypes(include='object').columns:
    print(f"{column}: {df[column].unique()}")


In [None]:
#Visualization - Distribution and Correlation
# Visualize the distribution of numerical features
plt.figure(figsize=(10, 6))
sns.histplot(df['RS_E_InAirTemp_PC1'], bins=20, kde=True)
plt.title('Distribution of RS_E_InAirTemp_PC1')
plt.xlabel('RS_E_InAirTemp_PC1')
plt.ylabel('Frequency')
plt.show()

# Correlation heatmap for numerical features.
# Only numeric columns are correlated: at this point in the notebook
# 'timestamps_UTC' has not been converted yet, and pandas >= 2.0 raises on
# non-numeric columns instead of silently dropping them from DataFrame.corr().
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(12, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

# Pair plot for selected numerical features
numerical_features = ['lat', 'lon', 'RS_E_InAirTemp_PC1', 'RS_E_OilPress_PC1', 'RS_E_RPM_PC1']
sns.pairplot(df[numerical_features])
# y > 1 places the title just above the grid so it does not overlap the top row
plt.suptitle('Pair Plot of Selected Numerical Features', y=1.02)
plt.show()


In [None]:
#Preprocessing - Handling Missing Values, Outliers, and Data Types
# Convert 'timestamps_UTC' to datetime format
df['timestamps_UTC'] = pd.to_datetime(df['timestamps_UTC'])

# Fill missing numerical values with the mean.
# The means are computed over numeric columns only: taking df.mean() over the
# whole frame would also include the datetime column converted above and
# raises on any object-dtype column in pandas >= 2.0.
# NOTE(review): 'mapped_veh_id' is still in its original dtype here, so if it
# is numeric and has gaps it is mean-imputed too - confirm that is intended.
columns_to_impute = df.select_dtypes(include='number').columns
df[columns_to_impute] = df[columns_to_impute].fillna(df[columns_to_impute].mean())

# Convert columns to appropriate data types
df['mapped_veh_id'] = df['mapped_veh_id'].astype('category')


In [None]:
#Outlier Handling using Z-score
# Detect and remove outliers for 'RS_E_InAirTemp_PC1': rows whose value lies
# more than Z_SCORE_THRESHOLD standard deviations from the column mean.
# The z-score is computed directly with pandas because no `zscore` function is
# imported anywhere in this notebook (the original call raised NameError).
# ddof=0 (population standard deviation) matches scipy.stats.zscore's default.
Z_SCORE_THRESHOLD = 3

in_air_temp = df['RS_E_InAirTemp_PC1']
z_scores = (in_air_temp - in_air_temp.mean()) / in_air_temp.std(ddof=0)

# NaN z-scores (missing values or zero variance) compare False and are kept
outliers = z_scores.abs() > Z_SCORE_THRESHOLD

# .copy() makes the filtered frame independent of the original, so adding
# columns to it later does not trigger SettingWithCopyWarning
df = df[~outliers].copy()


In [None]:
#Feature Engineering
# Create a new feature representing the hour of the day.
# assign() returns a new frame, so this is safe even when `df` is a filtered
# slice of an earlier frame (plain column assignment would emit
# SettingWithCopyWarning in that case).
df = df.assign(hour_of_day=df['timestamps_UTC'].dt.hour)

# Impute any remaining missing values by linear interpolation.
# Only numeric columns are interpolated: linear interpolation is not
# meaningful for the categorical 'mapped_veh_id' column, and this avoids
# depending on how DataFrame.interpolate treats non-numeric dtypes.
interpolated_columns = df.select_dtypes(include='number').columns
df[interpolated_columns] = df[interpolated_columns].interpolate(method='linear')

# One-hot encoding for 'mapped_veh_id' (new columns are prefixed 'veh_id_')
df = pd.get_dummies(df, columns=['mapped_veh_id'], prefix='veh_id')

# Scaling numerical features to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full frame; if a train/test split
# follows later in the notebook, fit on the training portion only.
scaler = StandardScaler()
numerical_columns = ['lat', 'lon', 'RS_E_InAirTemp_PC1', 'RS_E_OilPress_PC1', 'RS_E_RPM_PC1']
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
