In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers

# Read the training and test splits from the working directory
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Summarise the raw training frame (row count, dtypes, non-null counts)
train_df.info()

# Fitted LabelEncoder for each categorical column, keyed by column name
label_encoders = {}

# List of categorical columns to encode
categorical_columns = ['model', 'motor_type', 'wheel', 'color', 'status', 'type']

# Fit one encoder per column on the training data, then encode the test
# column with the same label -> code mapping.
for col in categorical_columns:
    encoder = LabelEncoder()
    train_df[col] = encoder.fit_transform(train_df[col])
    label_encoders[col] = encoder

    # A label's code is its position in encoder.classes_, so a single dict
    # lookup per value replaces calling encoder.transform once per row.
    class_to_code = {label: code for code, label in enumerate(encoder.classes_)}

    # Labels that never occurred in the training column are missing from the
    # mapping (map() yields NaN for them); they get the sentinel code -1.
    test_df[col] = test_df[col].map(class_to_code).fillna(-1).astype('int64')

# Function to convert running values from 'km' to 'miles'
def convert_running(value):
    """Normalise a mileage reading so that it is expressed in miles.

    Parameters
    ----------
    value : object
        A reading such as ``"3000 km"`` or ``"1200 miles"``. Anything that is
        not a string (for example a missing value) is passed through as-is.

    Returns
    -------
    object
        ``"<miles with two decimals> miles"`` when ``value`` is a string
        containing ``'km'``; otherwise ``value`` unchanged.

    Raises
    ------
    ValueError
        If ``value`` contains ``'km'`` but its first whitespace-separated
        token is not a number once the unit is removed.
    """
    # Non-strings cannot carry a unit, and strings without 'km' need no conversion.
    if not isinstance(value, str) or 'km' not in value:
        return value

    # The number is the first token; drop a unit glued to it ("1000km").
    number_text = value.split()[0].replace('km', '')
    # float() rather than int() so decimal readings such as "12.5 km" parse.
    kilometers = float(number_text)
    miles = kilometers * 0.621371
    return f"{miles:.2f} miles"

# Normalise the 'running' column of both frames in the same way:
# convert km readings to miles, strip the unit text, then cast to integers.
for frame in (train_df, test_df):
    mileage_text = frame['running'].apply(convert_running)
    mileage_text = mileage_text.str.replace('miles', '').str.strip()
    # Text that is not numeric becomes NaN and is then replaced by 0;
    # the int64 cast discards the fractional part of the mileage.
    mileage = pd.to_numeric(mileage_text, errors='coerce')
    frame['running'] = mileage.fillna(0).astype('int64')

# Remove rows of the training data that are exact duplicates of an earlier row
train_df = train_df.drop_duplicates()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1642 entries, 0 to 1641
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         1642 non-null   object 
 1   year          1642 non-null   int64  
 2   motor_type    1642 non-null   object 
 3   running       1642 non-null   object 
 4   wheel         1642 non-null   object 
 5   color         1642 non-null   object 
 6   type          1642 non-null   object 
 7   status        1642 non-null   object 
 8   motor_volume  1642 non-null   float64
 9   price         1642 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 128.4+ KB


In [4]:
# Print the training frame's summary (row count, column dtypes, non-null counts)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1633 entries, 0 to 1641
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         1633 non-null   int32  
 1   year          1633 non-null   int64  
 2   motor_type    1633 non-null   int32  
 3   running       1633 non-null   int64  
 4   wheel         1633 non-null   int32  
 5   color         1633 non-null   int32  
 6   type          1633 non-null   int32  
 7   status        1633 non-null   int32  
 8   motor_volume  1633 non-null   float64
 9   price         1633 non-null   int64  
dtypes: float64(1), int32(6), int64(3)
memory usage: 102.1 KB


In [8]:
# Separate the regression target from the model inputs
y = train_df['price']  # target: the price column
X = train_df.drop(columns=['price', 'wheel'])  # features: every column except the target and 'wheel'

In [21]:
# Hold out 20% of the rows as a validation split; the fixed random_state
# makes the split repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Scale the features
# NOTE(review): the scaler is fitted on the full feature frame X, not on the
# X_train split, so despite its name X_train_scaled covers every training row.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X)

# Select the test columns by name from X so the test matrix holds exactly the
# fitted features in the fitted order, independent of test.csv's column layout.
X_test_scaled = scaler.transform(test_df[X.columns])

In [28]:
# Define the deep learning model: a fully connected network with a single
# linear output unit for regression.
# The input width is declared with an explicit keras.Input entry instead of an
# `input_shape` argument on the first Dense layer; passing `input_shape` to a
# layer is what makes recent Keras versions emit a UserWarning here.
model = keras.Sequential([
    keras.Input(shape=(X_train_scaled.shape[1],)),  # one input per scaled feature column
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)  # Output layer for regression
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [30]:
# Configure training: Adam optimizer, mean squared error as the loss,
# and mean absolute error reported as an additional metric.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae'],
)

In [32]:
# Train on the scaled feature matrix against the price target and keep the
# returned History object for later inspection.
history = model.fit(
    X_train_scaled,
    y,
    epochs=100,
    batch_size=32,
    verbose=1,
)

Epoch 1/100
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 302959200.0000 - mae: 15985.7383
Epoch 2/100
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 307714368.0000 - mae: 15978.2627
Epoch 3/100
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 291299360.0000 - mae: 15600.0195
Epoch 4/100
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 245319440.0000 - mae: 13852.9023
Epoch 5/100
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 111391288.0000 - mae: 8781.2734 
Epoch 6/100
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 55047136.0000 - mae: 5135.3042
Epoch 7/100
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 37980776.0000 - mae: 4442.5078
Epoch 8/100
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 34776280.0000 - mae: 4148.278

In [38]:
from sklearn.metrics import  mean_squared_error, mean_absolute_error, r2_score

# Score the fitted network on the same inputs it was trained on
# (X_train_scaled / y), so the figures below are training-set metrics.
y_pred = model.predict(X_train_scaled)
mse = mean_squared_error(y, y_pred)
# The estimator is the Keras network defined above, not gradient boosting,
# so the message names the metric only.
print(f'Training Mean Squared Error: {mse:.2f}')
mae = mean_absolute_error(y, y_pred)
print(f'Training Mean Absolute Error: {mae}')
r2 = r2_score(y, y_pred)
print(f'Training R^2: {r2}')

[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 888us/step
Mean Squared Error with Gradient Boosting: 24250783.59
3018.028325993666
0.5308123826980591


In [40]:
# Evaluate the model on the data it was trained on.
# NOTE(review): X_train_scaled / y are the inputs passed to model.fit, so this
# is not a hold-out score. The val_* variable names are kept unchanged in case
# later cells read them.
val_loss, val_mae = model.evaluate(X_train_scaled, y)
print(f'Training Mean Absolute Error: {val_mae:.2f}')

[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 917us/step - loss: 24575290.0000 - mae: 2975.9551
Validation Mean Absolute Error: 3018.03


In [43]:
# Make predictions on the test set
# (run the trained network over the scaled test features)
y_predict = model.predict(X_test_scaled)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [35]:
# Make predictions on the test set
# NOTE(review): this repeats the prediction call from the cell above; running
# it again only reassigns y_predict from the same model and inputs.
y_predict = model.predict(X_test_scaled)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [37]:
# Make predictions on the test set
# NOTE(review): another repeat of the same prediction call; it reassigns
# y_predict from the same model and inputs.
y_predict = model.predict(X_test_scaled)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [45]:
# Wrap the model output in a one-column frame named 'price'
# (ravel flattens the single-output prediction array to 1-D first)
df_predictions = pd.DataFrame({'price': y_predict.ravel()})

In [49]:
# Display predictions: leaving the frame as the cell's last expression lets the
# notebook render it as a table instead of printing its plain-text repr.
df_predictions

            price
0    20743.681641
1    16943.839844
2    19892.773438
3    16618.390625
4     9382.386719
..            ...
406  23501.460938
407  13902.680664
408  10416.273438
409  18856.333984
410  12831.708984

[411 rows x 1 columns]


In [51]:
# Put the test Ids next to the predicted prices; concat along the column axis
# pairs the rows of the two objects by their index labels.
combined_df = pd.concat(
    [test_df['Id'], df_predictions],
    axis='columns',
)

combined_df

Unnamed: 0,Id,price
0,0,20743.681641
1,1,16943.839844
2,2,19892.773438
3,3,16618.390625
4,4,9382.386719
...,...,...
406,406,23501.460938
407,407,13902.680664
408,408,10416.273438
409,409,18856.333984


In [55]:
# Write the Id/price table to a CSV file, leaving out the DataFrame index
combined_df.to_csv(path_or_buf='pree.csv', index=False)