# In [None]:
# Car Evaluation Data - EDA and Model Comparison

# Step 1: Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 2: Load the data
# No header row is read from the file; the column labels below are supplied
# explicitly through `names`.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/19/car.data"
column_names = [
    'buying',
    'maint',
    'doors',
    'persons',
    'lug_boot',
    'safety',
    'class',
]
df = pd.read_csv(url, header=None, names=column_names)

# Step 3: Data Exploration
print(f"Shape of the dataset: {df.shape}")
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called as a statement; interpolating it into an f-string would print the
# report first and then the literal text "None" under the heading.
print("Data Types:")
df.info()
print(f"Missing Values:\n{df.isnull().sum()}")
print(f"Target Class Distribution:\n{df['class'].value_counts()}")

# Step 4: Data Cleaning (Encoding Categorical Features)
# Every feature here is ordinal, so each one is encoded by its rank in an
# explicit level list. LabelEncoder assigns codes in alphabetical order
# (e.g. high < low < med < vhigh), which scrambles that ranking for models that
# rely on distances or linear coefficients.
# Level lists follow the UCI Car Evaluation attribute description.
feature_levels = {
    'buying': ['low', 'med', 'high', 'vhigh'],
    'maint': ['low', 'med', 'high', 'vhigh'],
    'doors': ['2', '3', '4', '5more'],
    'persons': ['2', '4', 'more'],
    'lug_boot': ['small', 'med', 'big'],
    'safety': ['low', 'med', 'high'],
}

df_encoded = pd.DataFrame(index=df.index)
for column, levels in feature_levels.items():
    raw_values = df[column].astype(str).str.strip()
    # Fail loudly on an unknown category instead of silently encoding it as -1.
    unexpected = sorted(set(raw_values) - set(levels))
    if unexpected:
        raise ValueError(
            f"Unexpected values in column '{column}': {unexpected}"
        )
    df_encoded[column] = pd.Categorical(
        raw_values, categories=levels, ordered=True
    ).codes.astype(int)

# The target is still label-encoded. `le` ends up fitted on 'class', so
# le.inverse_transform() can map predicted codes back to the original labels.
le = LabelEncoder()
df_encoded['class'] = le.fit_transform(df['class'])

# Step 5: Visualizations
# 5.1 Correlation Heatmap (after encoding)
# Pairwise correlations between the integer-encoded columns, annotated per cell.
correlation_matrix = df_encoded.corr()
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
heatmap_ax.set_title("Correlation Heatmap")
plt.show()

# 5.2 Visualize Target vs Features (Example: Target vs 'buying')
# Plot from the raw frame rather than the encoded one so the axis ticks and the
# legend show category labels instead of opaque integer codes. The explicit
# orders present both variables from lowest to highest level.
sns.countplot(
    x='buying',
    hue='class',
    data=df,
    order=['low', 'med', 'high', 'vhigh'],
    hue_order=['unacc', 'acc', 'good', 'vgood'],
)
plt.title("Buying vs Target Class")
plt.show()

# Step 6: Split Data into Train and Test Sets
X = df_encoded.drop('class', axis=1)
y = df_encoded['class']
# Stratifying on the target keeps each class's share the same in the train and
# test partitions, so rare classes are not dropped from the test set by chance
# and the weighted metrics below are computed over every class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 7: Model Comparison
# Candidate classifiers, keyed by the display name used in the report.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
}

# (printed label, scoring function, extra keyword arguments), in report order.
metric_specs = [
    ("Accuracy:", accuracy_score, {}),
    ("Precision:", precision_score, {'average': 'weighted'}),
    ("Recall:", recall_score, {'average': 'weighted'}),
    ("F1 Score:", f1_score, {'average': 'weighted'}),
]
separator = "-" * 40

# Fit each model on the training split and report its scores on the test split.
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model: {name}")
    for label, scorer, extra_kwargs in metric_specs:
        print(label, scorer(y_test, y_pred, **extra_kwargs))
    print(separator)

# Optionally, visualize the feature importance for Random Forest
# Reads the importances from the Random Forest entry of `models`, which the
# comparison loop has already fitted.
random_forest = models['Random Forest']
importances = random_forest.feature_importances_

# One row per feature, most important first.
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': importances}
)
feature_importance = feature_importance.sort_values(
    by='importance', ascending=False
)

# Plot feature importance
importance_ax = sns.barplot(data=feature_importance, x='importance', y='feature')
importance_ax.set_title("Feature Importance (Random Forest)")
plt.show()
