In [59]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers

# Load train and test datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Checking information of the train data
train_df.info()

# Initialize LabelEncoder and dictionary to store encoders for each column
label_encoders = {}

# List of categorical columns to encode
categorical_columns = ['model', 'motor_type', 'wheel', 'color', 'status', 'type']

# Apply LabelEncoder to each categorical column for both train and test
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    train_df[col] = label_encoders[col].fit_transform(train_df[col])
    test_df[col] = test_df[col].apply(lambda x: label_encoders[col].transform([x])[0] if x in label_encoders[col].classes_ else -1)

# Function to convert running values from 'km' to 'miles'
def convert_running(value):
    if 'km' in value:
        kilometers = int(value.split()[0])
        miles = kilometers * 0.621371
        return f"{miles:.2f} miles"
    elif 'miles' in value:
        return value
    else:
        return value 

# Apply the conversion for both train and test datasets
train_df['running'] = train_df['running'].apply(convert_running)
test_df['running'] = test_df['running'].apply(convert_running)

# Clean and convert 'running' column to numeric values
train_df['running'] = train_df['running'].str.replace('miles', '').str.strip()
test_df['running'] = test_df['running'].str.replace('miles', '').str.strip()
train_df['running'] = pd.to_numeric(train_df['running'], errors='coerce').fillna(0).astype('int64')
test_df['running'] = pd.to_numeric(test_df['running'], errors='coerce').fillna(0).astype('int64')

# Drop duplicates from the training data
train_df = train_df.drop_duplicates()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1642 entries, 0 to 1641
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         1642 non-null   object 
 1   year          1642 non-null   int64  
 2   motor_type    1642 non-null   object 
 3   running       1642 non-null   object 
 4   wheel         1642 non-null   object 
 5   color         1642 non-null   object 
 6   type          1642 non-null   object 
 7   status        1642 non-null   object 
 8   motor_volume  1642 non-null   float64
 9   price         1642 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 128.4+ KB


In [61]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1633 entries, 0 to 1641
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         1633 non-null   int32  
 1   year          1633 non-null   int64  
 2   motor_type    1633 non-null   int32  
 3   running       1633 non-null   int64  
 4   wheel         1633 non-null   int32  
 5   color         1633 non-null   int32  
 6   type          1633 non-null   int32  
 7   status        1633 non-null   int32  
 8   motor_volume  1633 non-null   float64
 9   price         1633 non-null   int64  
dtypes: float64(1), int32(6), int64(3)
memory usage: 102.1 KB


In [63]:
# Prepare features and target variable
X = train_df.drop(['price', 'wheel'], axis = 1)  # Replace 'target_column' with your actual target column name
y = train_df['price']  # Your target variable

In [21]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(test_df.drop(columns=['Id', 'wheel']))  # Adjust as necessary

In [25]:
# Define the deep learning model
model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)  # Output layer for regression
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [27]:
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

In [29]:
# Fit the model
history = model.fit(X_train_scaled, y_train, validation_data=(X_val_scaled, y_val), epochs=100, batch_size=32, verbose=1)

Epoch 1/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 311613728.0000 - mae: 16031.4453 - val_loss: 300069248.0000 - val_mae: 15914.3467
Epoch 2/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 301242624.0000 - mae: 15745.3818 - val_loss: 298026848.0000 - val_mae: 15853.5029
Epoch 3/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 292682656.0000 - mae: 15673.3926 - val_loss: 286219168.0000 - val_mae: 15514.4980
Epoch 4/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 279607776.0000 - mae: 15327.1221 - val_loss: 245818224.0000 - val_mae: 14332.2568
Epoch 5/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 227657920.0000 - mae: 13736.5244 - val_loss: 160278848.0000 - val_mae: 11401.1094
Epoch 6/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 137852128.0000 - mae: 10278.0410

In [31]:
# Evaluate the model on the validation set
val_loss, val_mae = model.evaluate(X_val_scaled, y_val)
print(f'Validation Mean Absolute Error: {val_mae:.2f}')

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 23896292.0000 - mae: 3079.6794
Validation Mean Absolute Error: 3035.30


In [33]:
# Make predictions on the test set
y_predict = model.predict(X_test_scaled)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [35]:
# Make predictions on the test set
y_predict = model.predict(X_test_scaled)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [37]:
# Make predictions on the test set
y_predict = model.predict(X_test_scaled)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [39]:
# Create a DataFrame for the predictions
df_predictions = pd.DataFrame(y_predict, columns=['price'])

In [41]:
# Display predictions
print(df_predictions)

            price
0    20647.322266
1    17105.957031
2    19633.708984
3    16339.483398
4    10164.221680
..            ...
406  23818.511719
407  13494.992188
408  10414.528320
409  18601.970703
410  12762.489258

[411 rows x 1 columns]


In [48]:
combined_df = pd.concat([test_df['Id'], df_predictions], axis=1)

combined_df

Unnamed: 0,Id,price
0,0,20647.322266
1,1,17105.957031
2,2,19633.708984
3,3,16339.483398
4,4,10164.221680
...,...,...
406,406,23818.511719
407,407,13494.992188
408,408,10414.528320
409,409,18601.970703


In [56]:
# Save predictions to a CSV file if needed
combined_df.to_csv('prediii.csv', index=False)