In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Step 1: Load the data
# The CSV location is kept in one named constant so it is easy to spot and change.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine until it is pointed at a local copy of the file.
DATA_PATH = 'C:\\Users\\KMS9BAN\\Desktop\\fwdprojectstobesharedwithd49d50\\cars_price.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Step 2: Split the data into training and test sets
# 'price' is the regression target; every other column is a candidate feature.
X = data.drop('price', axis=1)
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Later cells assign into X_train / X_test, so work on independent copies rather
# than on objects derived from `data`.
X_train = X_train.copy()
X_test = X_test.copy()

# Report the shapes instead of dumping four full objects into the cell output.
print("X_train:", X_train.shape, "X_test:", X_test.shape)
print("y_train:", y_train.shape, "y_test:", y_test.shape)

In [None]:
# Handle missing values

# Replace the '?' placeholder in 'normalized-losses' with NaN and cast to float.
# NOTE(review): assigning through `.loc[:, col]` may keep the column's existing
# dtype on some pandas versions, which would keep it out of the later
# `select_dtypes(include=['float64', 'int64'])` selection -- confirm the dtype.
X_train.loc[:, 'normalized-losses'] = X_train['normalized-losses'].replace('?', np.nan).astype(float)

# Fill missing 'num-of-doors' values with the most frequent training value.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is not guaranteed to modify X_train itself.
X_train['num-of-doors'] = X_train['num-of-doors'].fillna(X_train['num-of-doors'].mode()[0])

# Show a small preview rather than printing the whole frame.
X_train.head()



In [None]:
# Encoding categorical variables using one-hot encoding
categorical_cols = ['make', 'fuel-type', 'aspiration', 'body-style', 'drive-wheels',
                    'engine-location', 'engine-type', 'num-of-cylinders', 'fuel-system']

encoder = OneHotEncoder()
encoded_values = encoder.fit_transform(X_train[categorical_cols]).toarray()

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use the new method when available and fall back to
# the old one on older installations.
get_names = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names

# Reuse X_train's index: the split keeps the original (shuffled) row labels, so a
# default 0..n-1 index would pair each encoded row with a different car when the
# frame is concatenated column-wise with the numeric features.
X_train_encoded = pd.DataFrame(
    encoded_values,
    columns=get_names(categorical_cols),
    index=X_train.index,
)
X_train_encoded

In [None]:
# Build the training matrix: one-hot columns first, then the float64/int64 columns.
# pd.concat with axis=1 aligns the two frames on their row index.
numeric_features = X_train.select_dtypes(include=['float64', 'int64'])
X_train_preprocessed = pd.concat([X_train_encoded, numeric_features], axis=1)
X_train_preprocessed

In [None]:
# Step 4: Apply different machine learning techniques
# Create an unfitted linear-regression estimator and echo its configuration.
model = LinearRegression()
print(model)

In [None]:
# Row labels that appear in the feature frame but have no entry in the target.
mismatched_rows = set(X_train_preprocessed.index).difference(y_train.index)

# Remove those rows from the features, then pick the targets by the surviving
# feature index so both share the same labels in the same order.
X_train_preprocessed.drop(list(mismatched_rows), inplace=True)
y_train = y_train.loc[X_train_preprocessed.index]

# Report both shapes so the row counts can be compared by eye.
print("X_train_preprocessed shape:", X_train_preprocessed.shape)
print("y_train shape:", y_train.shape)


In [None]:
# Check the number of samples before fitting anything: positional row-dropping
# cannot repair a features/target mismatch, so fail loudly instead.
if len(X_train_preprocessed) != len(y_train):
    raise ValueError("Inconsistent number of samples between X_train_preprocessed and y_train.")

# Handle missing values in X_train_preprocessed.
# A single mean imputation is enough; its output contains no NaNs, so a second
# pass would not change anything.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_preprocessed)

# Standardize the features (statistics are learned from the training data only).
scaler = StandardScaler()
X_train_preprocessed = scaler.fit_transform(X_train_imputed)

# Create the model
model = LinearRegression()

# Build the final model
model.fit(X_train_preprocessed, y_train)

In [None]:
# Preprocess the test data in a similar way as the training data.
# Fill values come from the training set, and the filled columns are assigned
# back: fillna(inplace=True) on a column selection is not guaranteed to modify
# X_test itself.
X_test['normalized-losses'] = X_test['normalized-losses'].fillna(X_train['normalized-losses'].mean())
X_test['num-of-doors'] = X_test['num-of-doors'].fillna(X_train['num-of-doors'].mode()[0])

# Handle unknown categories in the encoder
encoder.set_params(handle_unknown='ignore')

categorical_cols = ['make', 'fuel-type', 'aspiration', 'body-style', 'drive-wheels',
                    'engine-location', 'engine-type', 'num-of-cylinders', 'fuel-system']

# `get_feature_names` was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on older installations.
get_names = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names

# Encode the categorical features in X_test. Reusing X_test's index keeps the
# encoded rows aligned with the numeric rows in the column-wise concat below;
# a default 0..m-1 index would produce extra, NaN-padded rows.
X_test_encoded = pd.DataFrame(
    encoder.transform(X_test[categorical_cols]).toarray(),
    columns=get_names(categorical_cols),
    index=X_test.index,
)

# Create a copy of the numeric features in X_test
X_test_numeric = X_test.select_dtypes(include=['float64', 'int64']).copy()

# Concatenate the encoded features with the numeric features
X_test_preprocessed = pd.concat([X_test_encoded, X_test_numeric], axis=1)

# Handle missing values in X_test_preprocessed.
# No column is added by hand here: the fitted encoder already emits every
# one-hot column it learned, so forcing 'num-of-cylinders_three' to 0 would
# either erase real values or add an extra feature.
X_test_preprocessed = imputer.transform(X_test_preprocessed)

# Standardize the features in X_test_preprocessed
X_test_preprocessed = scaler.transform(X_test_preprocessed)

In [None]:
# Run the fitted model on the preprocessed test matrix and print the raw predictions.
y_pred = model.predict(X_test_preprocessed)
print(y_pred)

In [None]:
# Preprocess the test data in a similar way as the training data.
# Fill values come from the training set, and the filled columns are assigned
# back: fillna(inplace=True) on a column selection is not guaranteed to modify
# X_test itself.
X_test['normalized-losses'] = X_test['normalized-losses'].fillna(X_train['normalized-losses'].mean())
X_test['num-of-doors'] = X_test['num-of-doors'].fillna(X_train['num-of-doors'].mode()[0])

# Handle unknown categories in the encoder
encoder.set_params(handle_unknown='ignore')

categorical_cols = ['make', 'fuel-type', 'aspiration', 'body-style', 'drive-wheels',
                    'engine-location', 'engine-type', 'num-of-cylinders', 'fuel-system']

# `get_feature_names` was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on older installations.
get_names = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names

# Reusing X_test's index keeps encoded and numeric rows aligned in the concat
# below; a default 0..m-1 index yields extra NaN-padded rows and makes the
# sample-count check fail.
X_test_encoded = pd.DataFrame(
    encoder.transform(X_test[categorical_cols]).toarray(),
    columns=get_names(categorical_cols),
    index=X_test.index,
)

# Concatenate the encoded features with the numeric features
X_test_preprocessed = pd.concat(
    [X_test_encoded, X_test.select_dtypes(include=['float64', 'int64']).copy()],
    axis=1,
)

# Handle missing values in X_test_preprocessed (imputer fitted on training data)
X_test_preprocessed = pd.DataFrame(
    imputer.transform(X_test_preprocessed),
    columns=X_test_preprocessed.columns,
    index=X_test_preprocessed.index,
)

# Standardize the features in X_test_preprocessed (scaler fitted on training data)
X_test_preprocessed = pd.DataFrame(
    scaler.transform(X_test_preprocessed),
    columns=X_test_preprocessed.columns,
    index=X_test_preprocessed.index,
)

# Ensure the number of samples in X_test_preprocessed and y_test is consistent
if len(X_test_preprocessed) != len(y_test):
    raise ValueError("Inconsistent number of samples between X_test_preprocessed and y_test.")

# Calculate predictions
y_pred = model.predict(X_test_preprocessed)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics; previously they were computed but never shown.
print("MSE:", mse)
print("MAE:", mae)
print("R2:", r2)