In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers, regularizers

# Load the train and test datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Print column dtypes and non-null counts of the training data
train_df.info()

# Fitted encoders keyed by column name, kept so the label <-> code mapping
# can be looked up again after this cell
label_encoders = {}

# Categorical columns to integer-encode
categorical_columns = ['model', 'motor_type', 'wheel', 'color', 'status', 'type']

# Fit each encoder on the training data only, then reuse its classes for the test data
for col in categorical_columns:
    encoder = LabelEncoder()
    train_df[col] = encoder.fit_transform(train_df[col])
    label_encoders[col] = encoder

    # Build the label -> code lookup once per column instead of calling
    # transform() per row; labels not seen during fitting map to NaN and are
    # then replaced by the -1 sentinel.
    code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
    test_df[col] = test_df[col].map(code_by_label).fillna(-1).astype('int64')

# Function to convert running values from 'km' to 'miles'
def convert_running(value):
    """Express a mileage reading in miles.

    Parameters
    ----------
    value : object
        A reading such as ``"100 km"`` or ``"3000 miles"``. Anything that is
        not a string containing ``"km"`` (including missing values) is
        returned unchanged.

    Returns
    -------
    object
        ``"<miles> miles"`` with two decimals when the input was given in
        kilometres, otherwise the input itself.

    Raises
    ------
    ValueError
        If the input contains ``"km"`` but its first token is not a number.
    """
    # Non-strings (e.g. NaN) and readings without a km unit need no conversion
    if not isinstance(value, str) or 'km' not in value:
        return value

    # The magnitude is the first whitespace-separated token; also accept the
    # unit glued to the number, as in "100km"
    token = value.split()[0]
    if token.endswith('km'):
        token = token[:-2]

    # float() rather than int() so fractional readings like "12.5 km" parse
    kilometers = float(token)
    miles = kilometers * 0.621371
    return f"{miles:.2f} miles"

# Normalize the 'running' column of both frames in three steps:
# convert km readings to miles, strip the unit text, then cast to integers
# (unparseable entries become 0).
for frame in (train_df, test_df):
    mileage_text = frame['running'].apply(convert_running)
    mileage_text = mileage_text.str.replace('miles', '').str.strip()
    mileage_number = pd.to_numeric(mileage_text, errors='coerce')
    frame['running'] = mileage_number.fillna(0).astype('int64')

# Remove exact duplicate rows from the training data
train_df = train_df.drop_duplicates()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1642 entries, 0 to 1641
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         1642 non-null   object 
 1   year          1642 non-null   int64  
 2   motor_type    1642 non-null   object 
 3   running       1642 non-null   object 
 4   wheel         1642 non-null   object 
 5   color         1642 non-null   object 
 6   type          1642 non-null   object 
 7   status        1642 non-null   object 
 8   motor_volume  1642 non-null   float64
 9   price         1642 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 128.4+ KB


In [7]:

# Target is the sale price; the features are every other column except 'wheel'
y = train_df['price']
X = train_df.drop(columns=['price', 'wheel'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Create a DataFrame for the predictions
# df_predictions = pd.DataFrame(y_predict, columns=['price'])

# # Display predictions
# print(df_predictions)

# # Save predictions to a CSV file if needed
# df_predictions.to_csv('predictions.csv', index=False)


In [9]:

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the validation and test data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Select the test features by the training column names so they are the same
# columns, in the same order, that the scaler was fitted on (this also leaves
# out 'Id' and 'wheel' without depending on the column layout of test.csv)
X_test_scaled = scaler.transform(test_df[X_train.columns])

# Fully connected regression network: four ReLU layers (512-256-128-64) with
# L2 weight penalties, dropout after the first three, and one linear output.
# The input width is declared with an explicit keras.Input entry rather than
# an `input_shape=` argument on the first Dense layer, which Keras warns about.
model = keras.Sequential([
    keras.Input(shape=(X_train_scaled.shape[1],)),
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    layers.Dropout(0.4),
    layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    layers.Dropout(0.4),
    layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    layers.Dropout(0.4),
    layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    layers.Dense(1)  # Output layer for regression
])



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [13]:
# Compile with Adam at a constant learning rate of 0.0005 (no scheduler is
# attached here); MAE is both the training loss and the reported metric
optimizer = keras.optimizers.Adam(learning_rate=0.0005)
model.compile(
    loss='mean_absolute_error',
    optimizer=optimizer,
    metrics=['mae'],
)


In [15]:

# Early-stopping callback: stop once val_mae has not improved for 10 epochs
# and roll the model back to the best weights seen
early_stopping = keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=10,
    monitor='val_mae',
)



In [17]:
# Train for at most 300 epochs in batches of 32, validating on the held-out
# split after every epoch; the early-stopping callback may end training sooner
history = model.fit(
    X_train_scaled,
    y_train,
    batch_size=32,
    epochs=300,
    verbose=1,
    callbacks=[early_stopping],
    validation_data=(X_val_scaled, y_val),
)



Epoch 1/300
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - loss: 15917.8662 - mae: 15917.2627 - val_loss: 15899.6240 - val_mae: 15899.0127
Epoch 2/300
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 16178.4189 - mae: 16177.7764 - val_loss: 15004.9951 - val_mae: 15004.1572
Epoch 3/300
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 13940.9199 - mae: 13939.9424 - val_loss: 7153.0664 - val_mae: 7151.5703
Epoch 4/300
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 5209.1235 - mae: 5207.5068 - val_loss: 3610.5884 - val_mae: 3608.9514
Epoch 5/300
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 3886.7302 - mae: 3885.0906 - val_loss: 3411.1809 - val_mae: 3409.5505
Epoch 6/300
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3951.3967 - mae: 3949.7544 - val_loss: 3230.2603 - val_mae: 3228.6216
Epoch 7/300

In [19]:
# Score the trained model on the validation split and report its MAE
val_loss, val_mae = model.evaluate(x=X_val_scaled, y=y_val)
print(f'Validation Mean Absolute Error: {val_mae:.2f}')



[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2944.5681 - mae: 2942.9663
Validation Mean Absolute Error: 2851.05


In [22]:
# Run the trained network on the scaled test features; the result is later
# wrapped in a single-column 'price' DataFrame
y_predict = model.predict(X_test_scaled)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


In [28]:
# Wrap the raw model output in a DataFrame with a single 'price' column
df_predictions = pd.DataFrame(data=y_predict, columns=['price'])

# Text preview of the predicted prices
print(df_predictions)

            price
0    18939.982422
1    16674.611328
2    18539.238281
3    15263.683594
4     7102.289062
..            ...
406  22212.279297
407  12685.937500
408   9902.892578
409  17383.431641
410  12656.580078

[411 rows x 1 columns]


In [30]:
# Pair each test Id with its prediction by row position. Predictions come out
# of the model in test_df row order, so matching positionally (rather than
# aligning on index labels, as pd.concat(axis=1) does) stays correct even if
# test_df does not carry a default 0..n-1 index, and a length mismatch raises
# instead of silently producing NaN rows.
combined_df = pd.DataFrame({
    'Id': test_df['Id'].to_numpy(),
    'price': df_predictions['price'].to_numpy(),
})

combined_df

Unnamed: 0,Id,price
0,0,18939.982422
1,1,16674.611328
2,2,18539.238281
3,3,15263.683594
4,4,7102.289062
...,...,...
406,406,22212.279297
407,407,12685.937500
408,408,9902.892578
409,409,17383.431641


In [32]:
# Write the Id/price table to zab.csv, leaving out the DataFrame index column
combined_df.to_csv('zab.csv', index=False)