In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

from sklearn.model_selection import train_test_split

%pip install category_encoders
import category_encoders as ce

import sklearn
import os
import re

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

## load dataset 

In [None]:
# Load Kaggle's sample submission file; the model cells further down reuse this
# frame as the template for every submission they write.
submission = pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')
# Usage pattern, kept for reference only (not executed in this cell):
#   submission['price'] = y_pred                      # attach predicted prices
#   submission.to_csv('submission.csv', index=False)  # export as .csv for scoring

In [None]:
# Load the competition's training and test sets.
train = pd.read_csv('/kaggle/input/playground-series-s4e9/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e9/test.csv')
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None".
train.info()
train.head()

In [None]:
def nullValuesCols(df):
    """Print a summary of the missing values in ``df``.

    The report contains the total number of null cells, the labels of the
    columns that hold at least one null, and one line per such column with
    its null count. Nothing is returned.
    """
    null_mask = df.isnull()
    columns_with_null = df.columns[null_mask.any()]

    print(f'Total null values in the dataset {null_mask.sum().sum()}')
    print(f'columns {columns_with_null}')

    # Build the per-column lines first, then emit them in column order.
    report_lines = [
        f' -- Null value for column {name}: {df[name].isnull().sum()}'
        for name in columns_with_null
    ]
    for line in report_lines:
        print(line)

    print('\n\n')
    
    
def uniqueVlaues(df, max_unique = 10):
    """Report the distinct values of every object-typed column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; only columns selected by
        ``select_dtypes(include=['object'])`` are reported.
    max_unique : int, default 10
        Columns with more distinct (non-null) values than this are only
        summarised with a one-line count; the others get a frequency plot
        whose title lists the distinct values.

    Returns
    -------
    None
        Output is printed / plotted.
    """
    object_cols = df.select_dtypes(include = ['object']).columns
    print('All the unique values in the dataset')
    for col in object_cols:
        # Index with the label itself (not its string form) so that
        # non-string column labels are looked up correctly.
        n_unique = df[col].nunique()
        if n_unique > max_unique:
            print(f" -- {col} has: {n_unique}, too many to list")
        else:
            fig = plt.figure(figsize=(10, 3))
            sns.histplot(df[col], kde = False)
            plt.title(f" {col} has:\n {df[col].unique()}")
            plt.xlabel(f'{col}')
            plt.ylabel('Frequency')
            plt.show()
            # Release the figure so looping over many columns does not
            # accumulate open figures.
            plt.close(fig)

        print('\n\n')
    
    
def dataExploration(df):
    """Run the exploratory reports that apply to ``df``.

    * If the frame contains any null cell, the null-value report
      (``nullValuesCols``) is printed.
    * If the frame has at least one object-typed column, the unique-value
      report (``uniqueVlaues``) is produced.

    Returns None.
    """
    total_nulls = df.isnull().sum().sum()
    if total_nulls > 0:
        nullValuesCols(df)

    if (df.dtypes == 'object').any():
        uniqueVlaues(df)

    return

# Run the null-value and categorical-column reports on the raw training data.
dataExploration(train)

- Perform ANOVA on the high-cardinality grouped (categorical) variables; if the group means differ significantly, then perform one of the encodings described in https://www.kaggle.com/code/arashnic/an-overview-of-categorical-encoding-methods 
- https://www.tutorialspoint.com/correlation-between-categorical-and-continuous-variables#:~:text=Now%2C%20if%20you%20want%20to,the%20target%20column%20is%20categorical. to check anova 

In [None]:

def fill_null_with_median_or_mode(df):
    """Impute missing values column by column and return the frame.

    Numeric columns (any integer/float width, booleans excluded) are filled
    with their median; every other column is filled with its most frequent
    value (the first one when several values tie). Columns without nulls are
    left untouched, and a non-numeric column that contains only nulls is
    skipped because it has no mode to fill with.

    The null count is printed before and after imputation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. The columns are reassigned on this same object, so
        the caller's frame is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after imputation.
    """
    print('Before filling the null values:', df.isnull().sum().sum())

    for column in df.columns:
        values = df[column]
        if not values.isnull().any():
            continue

        is_number = (pd.api.types.is_numeric_dtype(values)
                     and not pd.api.types.is_bool_dtype(values))
        if is_number:
            # An all-null numeric column has a NaN median; fillna(NaN) is a no-op.
            fill_value = values.median()
        else:
            modes = values.mode()
            if modes.empty:
                # Only nulls in this column: nothing sensible to impute.
                continue
            fill_value = modes.iloc[0]  # first mode if there are several

        df[column] = values.fillna(fill_value)

    print('After filling the null values:', df.isnull().sum().sum())

    return df


def feature_engineering(dataframe, reference_year = 2024):
    """Derive model features from a raw frame and impute missing values.

    Works on a copy, so ``dataframe`` itself is not modified.

    Features parsed from the ``engine`` text column:
      * ``horse_power`` - the decimal number directly followed by ``HP``
      * ``liters``      - the decimal number directly followed by ``L``
      * ``cylinders``   - the integer followed by `` Cylinder``
      * ``turbo``       - 1 if the text contains "turbo" in any casing
                          (e.g. "Turbo", "Turbocharged"), else 0

    Binary flags:
      * ``clean_title`` - 1 when the value is exactly ``'Yes'``, else 0
      * ``accident``    - 1 when the value is exactly
                          ``'At least 1 accident or damage reported'``, else 0

    Derived features:
      * ``car_age``               - ``reference_year - model_year``
      * ``Power_to_Weight_Ratio`` - despite the name this is horse power per
                                    litre of displacement (no weight is used)
      * ``milage_per_year``       - ``milage / car_age``; plain ``milage`` when
                                    the age is zero or negative

    ``engine`` and ``model_year`` are dropped, then remaining nulls are filled
    by ``fill_null_with_median_or_mode``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``engine``, ``clean_title``, ``accident``, ``model_year``
        and ``milage`` columns.
    reference_year : int, default 2024
        Year against which the car age is computed.

    Returns
    -------
    pandas.DataFrame
        New frame with the engineered columns.
    """
    df = dataframe.copy()
    engine = df['engine']

    ## extract important features
    df['horse_power'] = engine.str.extract(r'(\d+\.\d+)HP', expand = False).astype(float)
    df['liters'] = engine.str.extract(r'(\d+\.\d+)L', expand = False).astype(float)
    df['cylinders'] = engine.str.extract(r'(\d+) Cylinder', expand = False).astype(float)

    # Case-insensitive containment: comparing an extracted token with the exact
    # string 'Turbo' would miss "Turbocharged", "turbo", "TURBO", ...
    df['turbo'] = engine.str.contains('turbo', case = False, na = False).astype(int)

    df['clean_title'] = np.where(df['clean_title'] == 'Yes', 1, 0)
    df['accident'] = np.where(df['accident'] == 'At least 1 accident or damage reported', 1, 0)

    ## prepare new features indicating car features
    df['car_age'] = reference_year - df['model_year']

    # A zero displacement would yield +/-inf; turn that into NaN so it is
    # imputed below instead of reaching the scaler / model as infinity.
    power_per_liter = df['horse_power'] / df['liters']
    df['Power_to_Weight_Ratio'] = power_per_liter.replace([np.inf, -np.inf], np.nan)

    # Cars with no (or a negative) age keep their raw mileage, which avoids a
    # division by zero and negative per-year values.
    df['milage_per_year'] = np.where(df['car_age'] <= 0, df['milage'], df['milage'] / df['car_age'])

    df = df.drop(columns = ['engine', 'model_year'])

    df = fill_null_with_median_or_mode(df)

    return df
    
# Build the engineered frames; the raw `train` / `test` frames are left as they
# are because feature_engineering works on a copy.
# NOTE(review): each call imputes nulls from its own frame, so the test set is
# filled with test-set medians/modes rather than the training ones - confirm
# this is intended.
train_df = feature_engineering(train)
test_df = feature_engineering(test)


In [None]:
# Preview the engineered training frame.
train_df.head()


- Target Encoding - Mean Likelihood Encoding ,"The Right Way !"
Mean encoding means replacing the category with the mean target value for that category. We start by grouping each category alone, and for each group, we calculate the mean of the target in the corresponding observations. Then we assign that mean to that category. Thus, we encoded the category with the mean of the target. Here’s a detailed illustration of mean encoding.
- P-value ≤ α: The differences between some of the means are statistically significant
If the p-value is less than or equal to the significance level, you reject the null hypothesis and conclude that not all population means are equal. Use your specialized knowledge to determine whether the differences are practically significant. For more information, go to Statistical and practical significance.
- P-value > α: The differences between the means are not statistically significant
If the p-value is greater than the significance level, you do not have enough evidence to reject the null hypothesis that the population means are all equal. Verify that your test has enough power to detect a difference that is practically significant. For more information, go to Increase the power of a hypothesis test.
- Why Target Encoding in This Case?
If you find that the means of different groups (e.g., races in your example) are significantly different with respect to a continuous target variable, it suggests that the categorical variable (e.g., race) has a meaningful relationship with the target. In such cases, target encoding can help translate this relationship into a numerical form that a machine learning model can understand.

https://mode.com/blog/violin-plot-examples

In [None]:
def anovaVisualisation(df, colName):
    """Run a one-way ANOVA of a target across a categorical column and plot it.

    The rows are grouped by the categorical column and ``scipy.stats.f_oneway``
    is applied to the target values of the groups. The outcome (and the
    encoding it suggests: mean target encoding when p <= 0.05, frequency
    encoding otherwise) is used as the title of a violin plot of the target
    per category, drawn with a logarithmic y axis.

    Parameters
    ----------
    df : pandas.DataFrame or data convertible to one
        Must contain the two columns named in ``colName``.
    colName : sequence of two labels
        ``colName[0]`` is the categorical column, ``colName[1]`` the target.

    Returns
    -------
    None
        The result is shown as a plot.
    """
    feature, target = colName[0], colName[1]
    df = pd.DataFrame(df)
    group_values = [group[target] for _, group in df.groupby(feature)]

    if len(group_values) < 2:
        # f_oneway needs at least two samples; report that instead of raising.
        text = f"{feature} --> ANOVA not applicable: fewer than two groups"
    else:
        f_stat, p_value = stats.f_oneway(*group_values)
        if np.isnan(p_value):
            # Degenerate input (e.g. every group has one row): no p-value, so
            # no encoding can be recommended from the test.
            text = f"{feature} --> ANOVA F-statistic: {f_stat}, \n p-value: {p_value}\n ANOVA inconclusive"
        elif p_value <= 0.05:
            text = f"{feature} --> ANOVA F-statistic: {f_stat}, \n  p-value: {p_value}\n Perform mean target encoding"
        else:
            text = f"{feature} --> ANOVA F-statistic: {f_stat}, \n p-value: {p_value}\n Perform frequency encoding"

    # Visualization: violin plot of the target distribution across the groups
    plt.figure(figsize=(20, 8))
    sns.violinplot(data = df, x = feature, y = target)
    plt.title(f"{text}")
    plt.xlabel(feature)
    plt.ylabel(target)
    plt.yscale('log')
    _ = plt.xticks(rotation=45, ha='right')
    plt.show()

# anovaVisualisation(train[['fuel_type', 'price']], ['fuel_type', 'price'])

In [None]:
# Run the ANOVA check and violin plot against 'price' for every object-typed
# column of the engineered training frame.
object_cols = train_df.select_dtypes(include = ['object']).columns

for col in object_cols:
    anovaVisualisation(train_df[[col, 'price']], [col, 'price'])



## correlation between the features 

In [None]:
def featureCorr(df):
    """Plot the Pearson correlation heatmap of the features and ``price``.

    The object-typed columns are target-encoded against ``price`` (the encoder
    is fitted on the same rows it transforms), ``price`` is appended again,
    and the correlation matrix of the result is drawn as an annotated heatmap.
    ``id`` is excluded. Returns None.
    """
    target = df['price']
    features = df.drop(columns = ['id', 'price'])

    print("perform target encoding")
    categorical = df.select_dtypes(include = ['object']).columns

    # Fit on the features, then encode those same features.
    encoded = (
        ce.TargetEncoder(cols = categorical)
        .fit(features, target)
        .transform(features)
    )
    encoded['price'] = df['price']

    corr = encoded.corr()
    labels = corr.columns.values

    plt.figure(figsize=(15, 8))
    sns.heatmap(
        corr,
        xticklabels = labels,
        yticklabels = labels,
        cmap = "YlGnBu",
        annot = True,
        fmt = '.2g',
    )
    plt.title("pearson Correlation Heatmap", fontsize=16)
    plt.show()
    

# Draw the correlation heatmap for the engineered training frame.
featureCorr(train_df)
    

## model training section 

In [None]:
from sklearn.preprocessing import MinMaxScaler
# NOTE(review): this module-level scaler is not referenced by any cell visible
# here - encoding_scaling() creates its own local MinMaxScaler. Confirm whether
# it can be removed.
scaler = MinMaxScaler()


def encoding_scaling(datasets, _type = None):
    """Split, encode and min-max scale the training data and the test data.

    Steps:
      1. Drop ``id`` (and ``price`` for the training frame) and split the
         training rows 70/30 into train / validation (``random_state=42``).
      2. Fit a categorical encoder on the training split only and apply it to
         the training split, the validation split and the test frame.
      3. Fit a ``MinMaxScaler`` on the training split only and apply that same
         fitted scaler to all three sets, so they share one scale. Validation
         and test values may therefore fall outside [0, 1].

    Parameters
    ----------
    datasets : sequence of two pandas.DataFrame
        ``datasets[0]`` is the training frame (with ``id`` and ``price``),
        ``datasets[1]`` the test frame (with ``id``).
    _type : {'target_encoding', 'frequency_encoding', None}, default None
        ``'frequency_encoding'`` uses ``ce.CountEncoder``; any other value
        (including ``None``) uses ``ce.TargetEncoder``.

    Returns
    -------
    tuple
        ``((X_train, X_val, y_train, y_val), test)``. The feature frames keep
        the row index of the rows they were built from, so ``X_train`` /
        ``X_val`` stay index-aligned with ``y_train`` / ``y_val``.
    """
    df = datasets[0]
    test = datasets[1].drop(['id'], axis = 1)
    x = df.drop(['id', 'price'], axis = 1)
    y = df['price']

    # Hold out 30% of the training rows for validation
    X_train, X_val, y_train, y_val = train_test_split(x, y, test_size = 0.3, random_state = 42)

    object_cols = df.select_dtypes(include = ['object']).columns
    if _type == 'frequency_encoding':
        encoder = ce.CountEncoder(cols = object_cols)
    else:
        # 'target_encoding' and every unrecognised value use target encoding
        encoder = ce.TargetEncoder(cols = object_cols)

    # Fit on the training split only so validation targets do not leak in
    encoder.fit(X_train, y_train)

    X_train = encoder.transform(X_train)
    X_val = encoder.transform(X_val)
    test = encoder.transform(test)

    # Learn min/max from the training split and reuse them everywhere;
    # re-fitting on validation or test would put each set on its own scale.
    scaler = MinMaxScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns, index = X_train.index)
    X_val = pd.DataFrame(scaler.transform(X_val), columns = X_val.columns, index = X_val.index)
    test = pd.DataFrame(scaler.transform(test), columns = test.columns, index = test.index)

    return (X_train, X_val, y_train, y_val), test

# Default (_type=None) selects target encoding.
# NOTE(review): this rebinds `train` and `test` - previously the raw frames read
# from CSV - to ((X_train, X_val, y_train, y_val), scaled_test); cells above
# that expect the raw frames will not work if re-run after this one.
train, test = encoding_scaling([train_df, test_df])

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def model_prediction_regression(model, x_train, x_test, y_train, y_test):
    """Fit a regressor, print train/test metrics and plot predictions.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    x_train, y_train
        Data the model is fitted on.
    x_test, y_test
        Held-out data used for the "Testing" metrics and the scatter plot.

    Returns
    -------
    estimator
        The same ``model`` object, fitted.

    MSE, MAE, R-squared and RMSE are printed for both sets, followed by a
    scatter plot of predicted against true held-out values with the identity
    line as reference.
    """
    # Fit the model
    model.fit(x_train, y_train)

    # Predictions for both training and testing sets
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Evaluation Metrics
    mse_train = mean_squared_error(y_train, y_train_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)

    mae_train = mean_absolute_error(y_train, y_train_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)

    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)

    rmse_train = np.sqrt(mse_train)
    rmse_test = np.sqrt(mse_test)

    # Printing Evaluation Metrics
    print(f"Training Metrics for {model} model:")
    print(f"Mean Squared Error (MSE): {mse_train}")
    print(f"Mean Absolute Error (MAE): {mae_train}")
    print(f"R-Squared (R²): {r2_train}")
    print(f"Root Mean Squared Error (RMSE): {rmse_train}")

    # f-string so the model's repr is shown, matching the training header
    print(f"\nTesting Metrics for {model} model:")
    print(f"Mean Squared Error (MSE): {mse_test}")
    print(f"Mean Absolute Error (MAE): {mae_test}")
    print(f"R-Squared (R²): {r2_test}")
    print(f"Root Mean Squared Error (RMSE): {rmse_test}")

    # Visualizing predictions vs true values
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_test_pred, color="blue", label="Predicted vs Actual")
    # Identity line: points on it are perfect predictions
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2, label="Ideal Fit")
    plt.title(f"Predictions vs True Values for {model}")
    plt.xlabel("True Values")
    plt.ylabel("Predictions")
    plt.legend()
    plt.show()

    return model


In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary linear regression.
# `*train` unpacks the (X_train, X_val, y_train, y_val) tuple returned by
# encoding_scaling into the helper's positional arguments.
Linear_model = LinearRegression()
model = model_prediction_regression(Linear_model, *train)
# Predict on the scaled test features and write this model's submission file.
submission['price'] = model.predict(test)
submission.to_csv('LinearSubmission.csv', index = False)
submission.head()

In [None]:
from lightgbm import LGBMRegressor
# LightGBM regressor with RMSE as its metric; every other hyper-parameter is
# left at the library default.
lgbm_model = LGBMRegressor(metric='rmse') 
  
# Fit, report metrics and plot via the shared helper; `*train` unpacks
# (X_train, X_val, y_train, y_val).
model = model_prediction_regression(lgbm_model, *train)
# Predict on the scaled test features and write this model's submission file.
submission['price'] = model.predict(test)
submission.to_csv('LgbmSubmission.csv', index = False)
submission.head()

In [None]:
import xgboost as xg

# 'reg:squarederror' is the squared-error regression objective; it replaces the
# deprecated alias 'reg:linear', which newer XGBoost releases may reject.
# `random_state` is the scikit-learn-style seed parameter of XGBRegressor.
xgb_model = xg.XGBRegressor(objective = 'reg:squarederror', n_estimators = 10, random_state = 123)

# Fit, report metrics and plot via the shared helper; `*train` unpacks
# (X_train, X_val, y_train, y_val).
model = model_prediction_regression(xgb_model, *train)
# Predict on the scaled test features and write this model's submission file.
submission['price'] = model.predict(test)
submission.to_csv('xgbSubmission.csv', index = False)
submission.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees. The forest is stochastic (bootstrap samples,
# feature sub-sampling), so the seed is fixed to make metrics and the written
# submission reproducible across runs.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit, report metrics and plot via the shared helper; `*train` unpacks
# (X_train, X_val, y_train, y_val).
model = model_prediction_regression(rf_model, *train)
# Predict on the scaled test features and write this model's submission file.
submission['price'] = model.predict(test)
submission.to_csv('rfSubmission.csv', index = False)
submission.head()

## combine all the results 

In [None]:
# Average the per-model submission files into one ensemble submission.
col = []

# Only the per-model outputs ('<name>Submission.csv') directly inside the
# working directory are averaged. The ensemble's own output and unrelated files
# are skipped, so re-running this cell does not fold a previous
# 'All_Submission.csv' into the mean or fail on a non-CSV file.
for filename in sorted(os.listdir('/kaggle/working')):
    if not filename.endswith('Submission.csv') or filename == 'All_Submission.csv':
        continue
    path = os.path.join('/kaggle/working', filename)
    if not os.path.isfile(path):
        continue
    col.append(filename)
    # One column per model, named after its file
    submission[f'{filename}'] = pd.read_csv(path)['price']

if not col:
    # Averaging zero columns would silently write NaN prices.
    raise FileNotFoundError("No '*Submission.csv' files found in /kaggle/working to combine")

submission['price'] = submission[col].mean(axis=1)
submission[['id', 'price']].to_csv('All_Submission.csv', index = False)