In [33]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load train and test datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Checking information of the train data
train_df.info()

# Initialize LabelEncoder and dictionary to store encoders for each column
label_encoders = {}

# List of categorical columns to encode
categorical_columns = ['model', 'motor_type', 'wheel', 'color', 'status', 'type']

# Apply LabelEncoder to each categorical column for both train and test
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()  # Create a LabelEncoder for each column
    
    # Fit the encoder on the train data
    train_df[col] = label_encoders[col].fit_transform(train_df[col])
    
    # Transform the test data, replacing unseen labels with -1
    test_df[col] = test_df[col].apply(lambda x: label_encoders[col].transform([x])[0] if x in label_encoders[col].classes_ else -1)

# Function to convert running values from 'km' to 'miles'
def convert_running(value):
    if 'km' in value:
        kilometers = int(value.split()[0])  
        miles = kilometers * 0.621371        
        return f"{miles:.2f} miles"          
    elif 'miles' in value:
        return value  
    else:
        return value 

# Apply the same running conversion for both train and test datasets
train_df['running'] = train_df['running'].apply(convert_running)
test_df['running'] = test_df['running'].apply(convert_running)

# Remove the 'miles' string and convert 'running' column to numeric values in both train and test
train_df['running'] = train_df['running'].str.replace('miles', '').str.strip()
test_df['running'] = test_df['running'].str.replace('miles', '').str.strip()

# Convert 'running' column to numeric, handle missing values, and convert to integer type for both train and test
train_df['running'] = pd.to_numeric(train_df['running'], errors='coerce').fillna(0).astype('int64')
test_df['running'] = pd.to_numeric(test_df['running'], errors='coerce').fillna(0).astype('int64')

# Checking information of both train and test data after processing
train_df.info()
test_df.info()

# Now train_df and test_df are ready for modeling


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1642 entries, 0 to 1641
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         1642 non-null   object 
 1   year          1642 non-null   int64  
 2   motor_type    1642 non-null   object 
 3   running       1642 non-null   object 
 4   wheel         1642 non-null   object 
 5   color         1642 non-null   object 
 6   type          1642 non-null   object 
 7   status        1642 non-null   object 
 8   motor_volume  1642 non-null   float64
 9   price         1642 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 128.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1642 entries, 0 to 1641
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         1642 non-null   int32  
 1   year          1642 non-null   int64  
 2   motor_type    1642 non-null  

In [29]:
train_df.duplicated().sum()
train_df = train_df.drop_duplicates()

In [35]:
train_df.corr()['price']

model           0.173485
year            0.638699
motor_type      0.238964
running        -0.513134
wheel                NaN
color          -0.036846
type            0.037789
status         -0.282471
motor_volume    0.000885
price           1.000000
Name: price, dtype: float64

In [37]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1642 entries, 0 to 1641
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         1642 non-null   int32  
 1   year          1642 non-null   int64  
 2   motor_type    1642 non-null   int32  
 3   running       1642 non-null   int64  
 4   wheel         1642 non-null   int32  
 5   color         1642 non-null   int32  
 6   type          1642 non-null   int32  
 7   status        1642 non-null   int32  
 8   motor_volume  1642 non-null   float64
 9   price         1642 non-null   int64  
dtypes: float64(1), int32(6), int64(3)
memory usage: 89.9 KB


In [39]:
train_df

Unnamed: 0,model,year,motor_type,running,wheel,color,type,status,motor_volume,price
0,4,2022,3,1864,0,15,5,1,2.0,24500
1,2,2014,3,82020,0,1,5,1,2.0,25500
2,1,2018,3,95000,0,10,5,1,2.0,11700
3,2,2002,3,137000,0,6,5,1,3.2,12000
4,2,2017,3,80778,0,1,5,2,2.0,26000
...,...,...,...,...,...,...,...,...,...,...
1637,0,2017,3,120000,0,16,5,2,2.0,12400
1638,4,2014,3,105633,0,1,5,2,2.0,16500
1639,3,2018,3,68900,0,2,6,2,2.0,19500
1640,3,2019,3,31000,0,1,6,1,2.0,19500


In [41]:
x_train = train_df.drop(['price', 'wheel'], axis = 1)
y_train = train_df['price']
x_train.shape, y_train.shape

((1642, 8), (1642,))

In [43]:
test = test_df 

In [45]:
test = test.drop(['Id', 'wheel'], axis = 1)

In [47]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Assuming x_train and x_test are your features (excluding the target column if any)
scalar = StandardScaler()

# Fit the scaler on the training data and transform it
x_train_scaled = scalar.fit_transform(x_train)

# Transform the test data using the same scaler (without fitting again)
x_test_scaled = scalar.transform(test)

# Now both x_train_scaled and x_test_scaled are scaled consistently


In [50]:
import warnings


warnings.filterwarnings('ignore')

In [52]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np

# Updated parameter grid
param_distributions = {
    'n_estimators': [100, 200, 300, 400, 500],  # Start with a larger base
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],  # More granular learning rates
    'max_depth': [3, 4, 5, 6, 7, 8],  # Reduced depth range
    'min_samples_split': [2, 5, 10, 15, 16],
    'min_samples_leaf': [1, 2, 3, 4, 5],  # Reduced to avoid overfitting
    'subsample': [0.5, 0.75, 1.0, 1.25, 1.5],
    'max_features': ['sqrt', 'log2', None],  # Focus on a few options
    'loss': ['ls', 'huber',  'quantile'],  # Remove 'lad' to reduce complexity
    'alpha': [0.75, 0.9, 1.5, 1.9]  # Only used if loss='huber'
}

# Initialize the GradientBoostingRegressor with early stopping
gbr = GradientBoostingRegressor(random_state=42, n_iter_no_change=10, tol=0.0001)

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=gbr,
    param_distributions=param_distributions,
    n_iter=100,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1,
    return_train_score=True  # Optional: track training score
)

# Fit the model on standardized features
random_search.fit(x_train_scaled, y_train)

# Get the best parameters
best_params = random_search.best_params_
print(f'Best parameters: {best_params}')

# Get the best model
best_model = random_search.best_estimator_
print(best_model)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters: {'subsample': 0.75, 'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 8, 'loss': 'huber', 'learning_rate': 0.05, 'alpha': 0.75}
GradientBoostingRegressor(alpha=0.75, learning_rate=0.05, loss='huber',
                          max_depth=8, max_features='sqrt', min_samples_leaf=5,
                          n_estimators=300, n_iter_no_change=10,
                          random_state=42, subsample=0.75)


In [58]:
y_pred = best_model.predict(x_train_scaled)
mae = mean_absolute_error(y_train, y_pred)
print(f'Mean Absolute Error: {mae:.2f}')
# Mean Absolute Error: 1573.91
# Mean Absolute Error: 1409.60

Mean Absolute Error: 1472.22


In [60]:
gbr = GradientBoostingRegressor(alpha=0.75, learning_rate=0.05, loss='huber',
                          max_depth=8, max_features='sqrt', min_samples_leaf=5,
                          n_estimators=300, n_iter_no_change=10,
                          random_state=42, subsample=0.75)
gbr.fit(x_train_scaled, y_train)

In [62]:
from sklearn.metrics import  mean_squared_error, mean_absolute_error, r2_score

y_pred = gbr.predict(x_train_scaled)
mse = mean_squared_error(y_train, y_pred)
print(f'Mean Squared Error with Gradient Boosting: {mse:.2f}')
mae = mean_absolute_error(y_train, y_pred)
print(mae)
r2 = r2_score(y_train, y_pred)
print(r2)


# Mean Squared Error with Gradient Boosting: 8957325.64
# 1573.9081696891762
# 0.8259524842024936
# Mean Squared Error with Gradient Boosting: 8957325.64
# 1573.9081696891762
# 0.8259524842024936
# Mean Squared Error with Gradient Boosting: 8473795.34
# 1498.3046913104029
# 0.8360547916628863
# Mean Squared Error with Gradient Boosting: 8496116.59
# 1409.595354194299
# 0.8349141198363884

Mean Squared Error with Gradient Boosting: 8834404.49
1472.222566395433
0.8283409336944179


In [66]:
y_predict = gbr.predict(x_test_scaled)
y_predict
df_train = pd.DataFrame(y_predict, columns=['price'])
df_train

Unnamed: 0,price
0,17719.221116
1,16897.792665
2,24714.099552
3,14986.614726
4,6483.975124
...,...
406,26098.116023
407,14053.868751
408,11964.401589
409,15095.951476


In [68]:
combined_df = pd.concat([test_df['Id'], df_train], axis=1)

combined_df

Unnamed: 0,Id,price
0,0,17719.221116
1,1,16897.792665
2,2,24714.099552
3,3,14986.614726
4,4,6483.975124
...,...,...
406,406,26098.116023
407,407,14053.868751
408,408,11964.401589
409,409,15095.951476


In [70]:
combined_df.to_csv('Ranju_car.csv', index=False)

In [94]:
import pickle

In [96]:
with open('model.pkl', 'wb') as file:
    pickle.dump(gbr, file)

In [None]:
import pickle

with open('LinearRegressionModel.pkl', 'rb') as file:
    loaded_model = pickle.load(file)