In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
from scipy.stats import pearsonr, spearmanr
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Signal that the import cell above ran to completion.
print("Setup complete")

# Path of the comma-separated data file that the loader cells read.
file = "cars.data"

#### Створення dataframe з csv файлу

In [2]:
def create_numpy_arr(file1):
    """Parse a comma-delimited text source into a NumPy array.

    Parameters
    ----------
    file1 : str or file-like
        Source handed straight to ``np.genfromtxt``.

    Returns
    -------
    numpy.ndarray
        The parsed values; fields that cannot be read as numbers come back
        as ``nan`` (``np.genfromtxt`` default behaviour).
    """
    return np.genfromtxt(file1, delimiter=',')

def create_data_frame(file1):
    """Read the header-less CSV into a DataFrame with named columns.

    Parameters
    ----------
    file1 : str or file-like
        Source passed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One column per name listed below, in file order; every ``?`` field
        is read as a missing value.
    """
    column_names = [
        'symboling', 'normalized-losses', 'make',
        'fuel-type', 'aspiration', 'num-of-doors',
        'body-style', 'drive-wheels', 'engine-location',
        'wheel-base', 'length', 'width', 'height', 'curb-weight',
        'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system',
        'bore', 'stroke', 'compression-ratio',
        'horsepower', 'peak-rpm',
        'city-mpg', 'highway-mpg',
        'price',
    ]
    return pd.read_csv(file1, names=column_names, low_memory=False, na_values="?")

# Labelled DataFrame built from the data file by create_data_frame.
df = create_data_frame(file)
# The same file parsed a second time into a plain NumPy array by create_numpy_arr.
df_np = create_numpy_arr(file)

#### Видалення пустих даних

In [3]:
# Handle missing values
# First look: draw the missingno matrix for the frame as loaded.
msno.matrix(df)

# Remove every row that has no value in at least one of these four columns.
# Other columns are not checked here, so they may still contain missing values.
df = df.dropna(subset=['num-of-doors', 'stroke', 'bore', 'normalized-losses'] )
# Second look: draw the matrix again for the reduced frame.
msno.matrix(df)

#### Нормалізація даних

In [7]:
def normalize_dataset(dataset):
    """Return a copy of ``dataset`` with integer/float columns min-max scaled.

    Each integer or floating-point column is mapped to ``[0, 1]`` via
    ``(value - min) / (max - min)``. All other columns (strings, booleans,
    categoricals, ...) are copied through unchanged. The input frame is not
    modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to scale.

    Returns
    -------
    pandas.DataFrame
        Scaled copy. A column whose values are all equal becomes all ``0.0``
        (instead of ``NaN`` from a 0/0 division); missing values stay missing.
    """
    normalized_dataset = dataset.copy()
    for column in normalized_dataset.columns:
        values = normalized_dataset[column]
        # Accept every int/float width, not only the 64-bit ones; booleans
        # are deliberately excluded by these two predicates.
        is_scalable = (pd.api.types.is_integer_dtype(values)
                       or pd.api.types.is_float_dtype(values))
        if not is_scalable:
            continue

        # Min-max normalisation
        min_value = values.min()
        value_range = values.max() - min_value

        if pd.isna(value_range):
            # Empty or all-missing column: there is nothing to scale.
            continue

        if value_range == 0:
            # Constant column: dividing by the zero range would turn every
            # entry into NaN, so map the single distinct value to 0.0.
            normalized_dataset[column] = (values - min_value).astype('float64')
        else:
            normalized_dataset[column] = (values - min_value) / value_range
    return normalized_dataset

# Replace df with its min-max scaled copy; every later cell works on scaled values.
df = normalize_dataset(df)

In [8]:
def plot_histogram(dataset, attribute, num_bins=10):
    """Show a histogram of one column with equal-width bins from min to max.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the column.
    attribute : str
        Name of the column to plot; also used for the title and x label.
    num_bins : int, default 10
        Number of bins between the column's minimum and maximum.
    """
    column = dataset[attribute]
    lowest = column.min()
    step = (column.max() - lowest) / num_bins

    # Spell out the num_bins + 1 edges: lowest, lowest + step, ..., highest.
    edges = []
    for k in range(num_bins + 1):
        edges.append(lowest + k * step)

    axes = plt.gca()
    axes.hist(column, bins=edges, edgecolor='black')

    axes.set_title(f'Histogram of {attribute}')
    axes.set_xlabel(attribute)
    axes.set_ylabel('Frequency')
    axes.grid(True)

    plt.show()

def plot_attribute_relation(dataset, x_attribute, y_attribute,):
    """Show a seaborn regression plot of ``y_attribute`` against ``x_attribute``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding both columns.
    x_attribute : str
        Column drawn on the horizontal axis.
    y_attribute : str
        Column drawn on the vertical axis.
    """
    # regplot draws on the current axes and hands them back for labelling.
    axes = sns.regplot(data=dataset, x=x_attribute, y=y_attribute)

    axes.set_title(f'{y_attribute} vs {x_attribute}')
    axes.set_xlabel(x_attribute)
    axes.set_ylabel(y_attribute)
    axes.grid(True)

    plt.show()

In [9]:
# Engine size and price: one histogram each, then a regression plot with
# price on the x axis and engine-size on the y axis.
plot_histogram(df, 'engine-size', num_bins=10)
plot_histogram(df, 'price', num_bins=10)
plot_attribute_relation(df, 'price','engine-size',)

# Highway mpg and horsepower: finer histograms (16 bins), then a regression
# plot with highway-mpg on the x axis and horsepower on the y axis.
plot_histogram(df, 'highway-mpg', num_bins=16)
plot_histogram(df, 'horsepower', num_bins=16)
plot_attribute_relation(df, 'highway-mpg','horsepower')

In [10]:
def calculate_correlation(dataset, x_attribute, y_attribute):
    """Print and return the Pearson and Spearman correlation of two columns.

    Rows in which either column is missing are left out before the
    coefficients are computed, so a few missing cells do not turn the
    whole result into ``nan``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding both columns.
    x_attribute, y_attribute : str
        Names of the two numeric columns to compare.

    Returns
    -------
    tuple
        ``(pearson_corr, spearman_corr)``.
    """
    # Keep only the rows where both values are present (pairwise deletion).
    both_present = dataset[x_attribute].notna() & dataset[y_attribute].notna()
    x_values = dataset.loc[both_present, x_attribute]
    y_values = dataset.loc[both_present, y_attribute]

    # Only the coefficient is used; the p-value is discarded.
    pearson_corr, _ = pearsonr(x_values, y_values)
    spearman_corr, _ = spearmanr(x_values, y_values)

    print(f"Correlation coefficient: {x_attribute}, {y_attribute}")
    print("\tPearson:", pearson_corr)
    print("\tSpearman:", spearman_corr,'\n')

    return pearson_corr, spearman_corr

# The two pairs that were plotted earlier in the notebook.
calculate_correlation(df, 'engine-size', 'price')
calculate_correlation(df, 'horsepower', 'highway-mpg')

# Two further pairs involving horsepower.
calculate_correlation(df, 'peak-rpm', 'horsepower')
calculate_correlation(df, 'horsepower', 'curb-weight')

In [11]:
def one_hot_encode_categorical(dataset, categorical_columns1):
    """Replace the given categorical columns with one-hot indicator columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to encode. It is not modified.
    categorical_columns1 : list of str
        Names of the columns to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        The non-categorical columns of ``dataset`` followed by one indicator
        column per (column, category) pair, on the same row index as
        ``dataset``. The generated column names are also printed.
    """
    categorical_data = dataset[categorical_columns1]

    # Fit and transform in a single pass; one fit is all the encoder needs.
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_data = encoder.fit_transform(categorical_data)

    encoded_columns = encoder.get_feature_names_out(categorical_columns1)
    print(encoded_columns)

    # The encoder returns a bare array, so re-attach the caller's row index
    # explicitly; rows then line up even when the index has gaps.
    encoded_part = pd.DataFrame(encoded_data, columns=encoded_columns, index=dataset.index)

    # Drop from the frame that was passed in, not from a notebook global,
    # so the function gives correct results for any input frame.
    remaining_part = dataset.drop(columns=categorical_columns1)

    return pd.concat([remaining_part, encoded_part], axis=1)

# Columns passed to the one-hot encoder.
categorical_columns = ['fuel-type', 'drive-wheels', 'body-style', 'make', 'aspiration', 
                       'num-of-doors', 'engine-location', 'engine-type', 'num-of-cylinders', 'fuel-system']
# Encoded frame used by the correlation heatmap and pairplots.
encoded_df = one_hot_encode_categorical(df, categorical_columns)

In [12]:
# Numeric attributes whose pairwise relationships are inspected in this cell.
# The subset gets its own name so it is not confused with a train/test split.
numeric_columns = ['curb-weight', 'engine-size', 'height', 'width', 'length', 'peak-rpm']
numeric_subset = encoded_df[numeric_columns]

# Pairwise correlation of the selected attributes, drawn as an annotated heatmap.
corr = numeric_subset.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()

# pairplot draws a grid of subplots, so the caption is set on the figure
# (suptitle); plt.title would only label the last subplot. y > 1 keeps the
# caption clear of the top row.
sns.pairplot(numeric_subset, kind='hist')
plt.suptitle('Pairplot with Histograms', y=1.02)
plt.show()

sns.pairplot(numeric_subset, diag_kind='kde')
plt.suptitle('Pairplot with Kernel Density Estimates', y=1.02)
plt.show()

In [13]:
# Split the rows 50/50 into train and test; the fixed random_state keeps the split repeatable.
# NOTE(review): df was min-max scaled on all rows before this split, so the test rows
# influenced the scaling range — confirm this is acceptable for the evaluation.
df_train, df_test = train_test_split(df, test_size=0.5, random_state=42)

def train_and_evaluate_regression_model(feature, target, df_train1, df_test1):
    """Fit a linear regression on the training frame and score it on the test frame.

    Prints the test mean squared error, the fitted coefficients and the test
    R^2. When exactly one feature is used, it also shows a scatter of the
    training data with the fitted line and the metrics drawn on the plot.

    Parameters
    ----------
    feature : list of str or str
        Predictor column name(s). A single name may be given as a bare string.
    target : str
        Name of the column to predict.
    df_train1 : pandas.DataFrame
        Rows used to fit the model.
    df_test1 : pandas.DataFrame
        Rows used to compute the metrics.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        The fitted model.
    """
    # A bare string would select a 1-D Series (the estimator expects a 2-D
    # feature table) and would also bypass the single-feature plot below.
    if isinstance(feature, str):
        feature = [feature]

    X_train = df_train1[feature]
    y_train = df_train1[target]

    X_test = df_test1[feature]
    y_test = df_test1[target]

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Metrics are computed on the held-out rows only.
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("Mean Squared Error:", mse)

    print("Coefficients: \n", model.coef_)
    print("Coefficient of determination: %.2f" % r2)

    # A 2-D picture is only possible with a single predictor.
    if len(feature) == 1:
        plt.scatter(X_train, y_train, label='Data')
        plt.plot(X_test, y_pred, color='red', linewidth=2, label='Regression Line')
        # Label with the column name itself, not the repr of the one-item list.
        plt.xlabel(feature[0])
        plt.ylabel(target)
        plt.title('Linear Regression Model')
        # Metric captions are placed in axes-relative coordinates (0..1).
        plt.text(0.02, 0.8, f'MSE: {mse:.5f}', transform=plt.gca().transAxes, fontsize=12)
        plt.text(0.02, 0.7, f'r2: {r2}', transform=plt.gca().transAxes, fontsize=12)
        plt.text(0.02, 0.6, f'cof: {model.coef_[0]:.5f}', transform=plt.gca().transAxes, fontsize=12)
        plt.legend()
        plt.show()

    return model

In [14]:
# Two single-feature models of price (each also produces a plot) ...
train_and_evaluate_regression_model(['horsepower'], 'price', df_train, df_test)
train_and_evaluate_regression_model(['peak-rpm'], 'price', df_train, df_test)
# ... and one model using five features at once (metrics only, no plot).
train_and_evaluate_regression_model(['engine-size','horsepower','highway-mpg', 'curb-weight', 'symboling'], 'price', df_train, df_test)