In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Location of the raw car-price CSV; kept in `url` so the source is easy to swap.
url = (
    "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/"
    "master/chapter-02-car-price/data.csv"
)
# Read the whole file into a DataFrame with pandas' default parsing options.
data = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Show the first five rows of the freshly loaded frame.
data.head(n=5)

# **Features**

In [None]:
# Columns kept for the analysis; names match the raw CSV headers exactly.
selected_features = [
    'Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders',
    'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP'
]
# Take an explicit copy: later cells rename columns, fill values and add a
# column on this frame, and doing that on a frame derived by selection can
# trigger pandas' SettingWithCopyWarning.
data = data[selected_features].copy()

In [None]:
# Display the frame as the cell output (bare last expression -> rich repr).
data

# **Data preparation**

In [None]:
# Build a new frame instead of mutating in place (no `inplace=True`), so the
# cell does not modify a frame that may have been derived from another one.
data = (
    data
    # Column names: spaces -> underscores, then lower-case.
    .rename(columns=lambda name: name.replace(' ', '_').lower())
    # Fill missing values with 0
    .fillna(0)
)

In [None]:
# Rename MSRP to price.
# Re-assign the result rather than using `inplace=True`, which offers no
# benefit and mutates the existing frame.
data = data.rename(columns={'msrp': 'price'})
data

# **Question 1: Most Frequent Transmission Type**
What is the most frequent observation (mode) for the column transmission_type?

In [None]:
# `mode()` returns the most frequent value(s) as a Series; take the first entry.
transmission_modes = data['transmission_type'].mode()
most_frequent_transmission = transmission_modes.iloc[0]
print("The most frequent observation for 'transmission_type' is:", most_frequent_transmission)

# **Question 2: Correlation Matrix**

In [None]:
# Calculate the correlation matrix for the numerical features
numerical_features = data[['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']]
correlation_matrix = numerical_features.corr()

# Adjusting the matrix style for better visualisation
plt.figure(figsize=(10, 6))
sns.set(font_scale=1.2)
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation matrix of numerical features")
plt.show()

# Let's find two traits with the highest correlation.
# Keep only the entries strictly above the diagonal: this removes every
# feature's correlation with itself and the mirrored duplicate of each pair.
# Filtering on `!= 1` instead would depend on exact floating-point equality
# and would also discard a genuinely perfect correlation between two features.
above_diagonal = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
max_corr = (
    correlation_matrix.abs()
    .where(above_diagonal)   # masked-out cells become NaN
    .stack()
    .dropna()                # drop masked cells regardless of stack()'s NaN handling
    .sort_values(ascending=False)
)
feature1, feature2 = max_corr.index[0]

print(f"Two traits with the highest correlation: '{feature1}' и '{feature2}' with a correlation coefficient {correlation_matrix.loc[feature1, feature2]}")

# **Make price binary**

In [None]:
# Create a binary variable above_average
# Binary target: 1 when the price exceeds the mean price, otherwise 0.
mean_price = data['price'].mean()
data['above_average'] = data['price'].gt(mean_price).astype(int)

In [None]:
# Show the first five rows, now including the `above_average` column.
data.head(n=5)

# **Split the data**

In [None]:
# Split the data into train, validation, and test sets
X = data.drop(columns=['above_average'])
y = data['above_average']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [None]:
# Display the shapes of the resulting sets
print("Train set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_val.shape, y_val.shape)
print("Test set shape:", X_test.shape, y_test.shape)

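# **Question 3: Mutual Information**
Which categorical feature has the lowest mutual information score with `above_average` on the training set?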

In [None]:
# Define the list of categorical features
categorical_features = ['make', 'model', 'transmission_type', 'vehicle_style']

# Remove the 'price' column from X_train if it's mistakenly included
if 'price' in X_train.columns:
    X_train.drop(columns=['price'], inplace=True)

# One-hot encode the categorical features
encoder = OneHotEncoder(sparse=False, drop='first')  # You can customize drop parameter as needed
X_train_encoded = encoder.fit_transform(X_train[categorical_features])

# Calculate mutual information scores for one-hot encoded categorical features
mi_scores = mutual_info_classif(X_train_encoded, y_train, discrete_features=True, random_state=42)

# Round the scores to 2 decimals
rounded_mi_scores = [round(score, 2) for score in mi_scores]

# Find the variable with the lowest mutual information score
lowest_mi_score_index = rounded_mi_scores.index(min(rounded_mi_scores))
print(f'Response - {categorical_features[lowest_mi_score_index]}')


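# **Question 4: Logistic Regression**
What is the validation accuracy (rounded to 2 decimals) of a logistic regression trained on the one-hot encoded features?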

In [None]:
# Define the list of categorical features
categorical_features = ['make', 'model', 'transmission_type', 'vehicle_style']
# NOTE(review): only the categorical columns feed this model; the numeric
# columns (year, engine_hp, ...) are not used - confirm this is intended.

# Fit the encoder on the training rows only, so validation data does not
# influence preprocessing. Categories that appear only in the validation set
# are encoded as all-zero rows via handle_unknown='ignore'.
# The deprecated/removed `sparse=` keyword is avoided; the default sparse
# output is converted to a dense array explicitly instead.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[categorical_features]).toarray()
X_val_encoded = encoder.transform(X_val[categorical_features]).toarray()

# Logistic regression classifier
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

# Fit the model on the training data
model.fit(X_train_encoded, y_train)

# Predict on the validation data
y_pred = model.predict(X_val_encoded)

# Calculate accuracy on the validation dataset
accuracy = accuracy_score(y_val, y_pred)
print(f'Response - {round(accuracy, 2)}')


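# **Question 5: Feature Elimination**
Which feature, when excluded from the model, gives the smallest difference in validation accuracy relative to the model trained on all features?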

In [None]:
# Feature groups. Categorical columns are one-hot encoded; numeric columns are
# passed to the model unchanged.
categorical_features = ['make', 'model', 'transmission_type', 'vehicle_style']
numerical_columns = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']
all_features = categorical_features + numerical_columns

# Create a list of features to exclude one at a time
features_to_exclude = ['year', 'engine_hp', 'transmission_type', 'city_mpg']


def validation_accuracy(feature_names):
    """Train on ``X_train[feature_names]`` and return accuracy on the validation set.

    Parameters
    ----------
    feature_names : list of str
        Raw column names to use. Those listed in ``categorical_features`` are
        one-hot encoded; every other column is passed through as-is.

    Returns
    -------
    float
        Accuracy of the fitted logistic regression on ``X_val`` / ``y_val``.
    """
    categorical = [name for name in feature_names if name in categorical_features]
    preprocess = ColumnTransformer(
        # Encoder is fit on training rows only; unseen validation categories
        # become all-zero dummies.
        [('onehot', OneHotEncoder(handle_unknown='ignore'), categorical)],
        remainder='passthrough',
    )
    classifier = Pipeline([
        ('preprocess', preprocess),
        ('model', LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)),
    ])
    classifier.fit(X_train[feature_names], y_train)
    return accuracy_score(y_val, classifier.predict(X_val[feature_names]))


# Reference score with every feature present. It is computed here so that the
# comparison uses exactly the same pipeline as the reduced models.
baseline_accuracy = validation_accuracy(all_features)

# Train and evaluate models with each feature excluded.
# Features are removed by *name* from the raw frame before encoding; removing a
# column of the encoded matrix by the raw frame's positional index would drop
# an unrelated one-hot dummy instead of the intended feature.
accuracy_differences = {}
for feature in features_to_exclude:
    remaining_features = [name for name in all_features if name != feature]
    accuracy_exclude = validation_accuracy(remaining_features)
    accuracy_differences[feature] = baseline_accuracy - accuracy_exclude

# Find the feature with the smallest difference
smallest_difference_feature = min(accuracy_differences, key=accuracy_differences.get)
print(f'Response - {smallest_difference_feature}')


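# **Question 6: Ridge Regression**
Which regularization strength `alpha` gives the lowest validation RMSE for the Ridge regression model?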

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd

# Define the list of categorical features (same list as in the earlier
# questions; `city_mpg` is numeric and is not one-hot encoded here)
categorical_features = ['make', 'model', 'transmission_type', 'vehicle_style']

# Fit the encoder on the training rows only; categories seen only in the
# validation set are encoded as all-zero rows. The removed `sparse=` keyword
# is avoided by densifying the default sparse output explicitly.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[categorical_features]).toarray()
X_val_encoded = encoder.transform(X_val[categorical_features]).toarray()

# The regression target is the car price, not the binary `above_average`
# label held in y_train / y_val. The splits keep `data`'s row index, so the
# matching prices are looked up by index label.
price_train = data.loc[X_train.index, 'price']
price_val = data.loc[X_val.index, 'price']

# Apply logarithmic transformation to price
y_train_log = np.log1p(price_train)
y_val_log = np.log1p(price_val)

# Initialize a dictionary to store RMSE scores
rmse_scores = {}

# Try different alpha values for Ridge regression
alphas = [0, 0.01, 0.1, 1, 10]

for alpha in alphas:
    # Create and fit the Ridge regression model
    ridge_model = Ridge(alpha=alpha, solver='sag', random_state=42)
    ridge_model.fit(X_train_encoded, y_train_log)

    # Predict on the validation set and convert predictions back to original scale
    y_pred_log = ridge_model.predict(X_val_encoded)
    y_pred = np.expm1(y_pred_log)

    # Calculate RMSE against the actual prices (original scale)
    rmse = np.sqrt(mean_squared_error(price_val, y_pred))

    # Store RMSE in the dictionary
    rmse_scores[alpha] = round(rmse, 3)

# Print RMSE scores for all alpha values
for alpha, rmse in rmse_scores.items():
    print(f"Alpha = {alpha}: RMSE = {rmse}")

# On ties after rounding, the smallest alpha wins (dict keeps insertion order)
best_alpha = min(rmse_scores, key=rmse_scores.get)
print(f'Response - {best_alpha}')