In [None]:
!pip install seaborn
!pip install scikit-learn

# **Imports**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Path of the car-price CSV (relative, so resolved against the working directory).
data_url = 'dataset/CarPrice_Assignment.csv'
# Read the file into the DataFrame that the following cells work from.
data = pd.read_csv(filepath_or_buffer=data_url)

# **Exploratory Data Analysis**

In [None]:
# Preview the first rows. A bare last expression is rendered by the notebook as
# a rich table, which stays aligned and readable for wide frames (print() does not).
data.head()

In [None]:
# Count missing values per column; leave it as the cell's last expression so the
# notebook renders the result itself instead of a print() text dump.
data.isnull().sum()

In [None]:
# Summary statistics, shown through the notebook's rich table display rather
# than print() so that all columns stay aligned.
data.describe()

In [None]:
# Histogram of the 'price' column with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['price'], kde=True, ax=ax)
ax.set_title('Distribution of Car Prices')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Box Plot of Car Prices by Horsepower
# 'horsepower' is a numeric column (it is mean-imputed and used as a regression
# input further down), so putting it directly on the categorical axis draws one
# box per distinct value. Grouping it into equal-width bands first gives a
# small, ordered set of boxes that can actually be compared.
n_bands = 6
horsepower_bands = pd.cut(data['horsepower'], bins=n_bands)

plt.figure(figsize=(12, 8))
sns.boxplot(x=horsepower_bands, y=data['price'])
plt.title('Box Plot of Car Prices by Horsepower')
plt.xlabel('Horsepower (binned)')
plt.ylabel('Price')
plt.xticks(rotation=45)  # interval labels are long; tilt them so they do not overlap
plt.show()

In [None]:
# Pairwise plots for the target and the four columns later used as predictors
selected_features = [
    'price',
    'horsepower',
    'curbweight',
    'enginesize',
    'highwaympg',
]
sns.pairplot(data.loc[:, selected_features])
plt.show()

# **Machine Learning Techniques**

In [None]:
# Predictor columns and regression target
features = ['horsepower', 'curbweight', 'enginesize', 'highwaympg']

# Pull each selection out of `data` and replace any missing entries with the
# mean of that column (the mean is taken over all rows, before any split).
X = data[features].pipe(lambda frame: frame.fillna(frame.mean()))
y = data['price'].pipe(lambda series: series.fillna(series.mean()))

In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler  # scaler class used below

# Learn the scaling statistics from X, then apply them to X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
# NOTE(review): no later cell in this notebook reads X_scaled; the train/test
# split is built from the unscaled X. Confirm whether that is intended.

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Create the linear regression estimator and fit it on the training split
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predictions of the fitted model for the held-out rows
y_pred = model.predict(X=X_test)

In [None]:
# Error metrics comparing test-set targets with the model's predictions
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # square root of the MSE
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

# **Data Visualization Techniques**

In [None]:
  # Actual vs Predicted prices
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred)
# Reference line y = x: a point on it is a perfect prediction. Draw it across
# the combined range of actual and predicted values instead of from 0 to
# max(y_test), so it reaches every plotted point and does not force the axes
# to extend down to zero.
line_min = min(y_test.min(), y_pred.min())
line_max = max(y_test.max(), y_pred.max())
plt.plot([line_min, line_max], [line_min, line_max], '--r')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Car Prices')
plt.show()

In [None]:
# Histogram (with KDE) of the test-set residuals: actual minus predicted
residuals = y_test - y_pred

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(residuals, kde=True, ax=ax)
ax.set_title('Distribution of Residuals')
ax.set_xlabel('Residual')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Residuals (actual minus predicted) plotted against the fitted values
residuals = y_test - y_pred

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals)
ax.axhline(y=0, color='r', linestyle='--')  # zero-residual reference line
ax.set_xlabel('Fitted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot vs Fitted Values')
plt.show()


In [None]:
# One scatter panel per predictor, each plotted against price, on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
for ax, feature in zip(axes.flat, features):
    ax.scatter(data[feature], data['price'])
    ax.set_xlabel(feature)
    ax.set_ylabel('Price')
    ax.set_title(f'{feature} vs Price')
fig.tight_layout()
plt.show()


In [None]:
# One histogram (with KDE) per predictor on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
for ax, feature in zip(axes.flat, features):
    sns.histplot(data[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()


In [None]:
# Feature importance
# The model is fitted on unscaled columns, so each raw coefficient is a slope
# per unit of its own feature and the sizes cannot be compared across features.
# Multiplying by the training standard deviation gives the change in predicted
# price for a one-standard-deviation change in each feature, which puts all
# bars on a common scale.
importance = model.coef_ * X_train[features].std().to_numpy()
importance_df = pd.DataFrame({'Feature': features, 'Coefficient': importance})

plt.figure(figsize=(10, 6))
# Recent seaborn releases deprecate `palette` without `hue`, so assign hue
# explicitly; dodge=False keeps one full-width bar per feature.
ax = sns.barplot(x='Coefficient', y='Feature', hue='Feature', dodge=False,
                 data=importance_df, palette='viridis')
if ax.get_legend() is not None:
    ax.get_legend().remove()  # hue repeats the y axis, so a legend adds nothing
plt.title('Feature Importance')
plt.xlabel('Standardized Coefficient (price change per 1 SD)')
plt.ylabel('Feature')
plt.show()


# ***PRICE PREDICTION HERE***

In [None]:
# Price predictor Function

def predict_price(horsepower, curbweight, enginesize, highwaympg):
    """Return the fitted model's price prediction for a single car.

    The four arguments are placed in a one-row DataFrame whose columns are
    'horsepower', 'curbweight', 'enginesize' and 'highwaympg', in that order.
    The frame is passed to the notebook-level ``model`` and the first (only)
    element of the prediction is returned.
    """
    single_car = pd.DataFrame({
        'horsepower': [horsepower],
        'curbweight': [curbweight],
        'enginesize': [enginesize],
        'highwaympg': [highwaympg],
    })
    return model.predict(single_car)[0]

In [None]:
# Usage: estimate the price of one car described by the four model inputs.
# NOTE(review): confirm these example values use the same units as the training
# columns (enginesize=2.5 in particular).
example_car = {
    'horsepower': 150,
    'curbweight': 3000,
    'enginesize': 2.5,
    'highwaympg': 25,
}
predicted_price = predict_price(**example_car)
print(f'Predicted Car Price: ${predicted_price:.2f}')