In [17]:
import joblib
import os
import pandas
import numpy
import warnings

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

def predict_future_prices(df, future_years):
    try:
        model = joblib.load('../trained-data/decision-tree-trained-model.joblib')

        model_features = model.feature_names_in_

        numeric_features = df[model_features].select_dtypes(include=[numpy.number]).columns

        yearly_changes = df.groupby(['brand_encoded', 'model_encoded', 'year'])[numeric_features].mean().groupby(level=[0, 1]).diff().groupby(level=[0, 1]).mean()

        last_rows = df.groupby(['brand_encoded', 'model_encoded']).apply(lambda x: x[model_features].iloc[-1]).reset_index(drop=True)

        future_data_rows = []

        for (brand_code, model_code) in last_rows[['brand_encoded', 'model_encoded']].drop_duplicates().values:
            model_last_row = last_rows[(last_rows['brand_encoded'] == brand_code) & (last_rows['model_encoded'] == model_code)].copy()

            for i, future_year in enumerate(future_years):
                new_row = model_last_row.copy()
                new_row['future_year'] = future_year  

                for feature in numeric_features:
                    if feature != 'year':
                        new_row[feature] += yearly_changes.loc[(brand_code, model_code), feature] * (i + 1)

                future_data_rows.append(new_row)

        future_data = pandas.concat(future_data_rows, ignore_index=True)

        imputer = SimpleImputer(strategy='mean')

        future_data_imputed = pandas.DataFrame(imputer.fit_transform(future_data), columns=future_data.columns)

        all_features = list(model_features) + ['future_year']

        future_data_imputed = future_data_imputed[all_features]

        future_prices = model.predict(future_data_imputed[model_features])

        future_data_imputed['predicted_price'] = future_prices.round().astype(int)

        return future_data_imputed[['brand_encoded', 'model_encoded', 'year', 'future_year', 'predicted_price']]
    
    except Exception as e:
        print(f"Error predicting future prices: {e}")
        return None

def main():

    try:
        file_path = os.path.join('../sample-data/dataset.csv')

        dataset = pandas.read_csv(file_path)
        
    except FileNotFoundError:
        dataset = None
        print(f"Error: The file at {file_path} was not found.")
        
    except pandas.errors.EmptyDataError:
        dataset = None
        print("Error: The file is empty.")
        
    except pandas.errors.ParserError:
        dataset = None
        print("Error: The file could not be parsed.")

    if dataset is not None:     
        try:
            label_encoder = LabelEncoder()

            dataset['brand_encoded'] = label_encoder.fit_transform(dataset['brand'])
            dataset['model_encoded'] = label_encoder.fit_transform(dataset['model'])
            dataset['transmission_encoded'] = label_encoder.fit_transform(dataset['transmission'])
            dataset['fuelType_encoded'] = label_encoder.fit_transform(dataset['fuelType'])

        except Exception as e:
            print(f"An unexpected error occurred during preprocessing: {e}")

        future_years = range(2025, 2035)

        predictions = predict_future_prices(dataset, future_years)

        if predictions is not None:

            predictions['brand_encoded'] = predictions['brand_encoded'].astype(int)
            predictions['model_encoded'] = predictions['model_encoded'].astype(int)
            
            brand_mapping = dict(zip(dataset['brand_encoded'].dropna().unique(), dataset['brand'].dropna().unique()))
            model_mapping = dict(zip(dataset['model_encoded'].dropna().unique(), dataset['model'].dropna().unique()))

            predictions['brand'] = predictions['brand_encoded'].map(brand_mapping).fillna('Unknown Brand')
            predictions['model'] = predictions['model_encoded'].map(model_mapping).fillna('Unknown Model')

            predicted_df = predictions[['brand', 'model', 'year', 'future_year', 'predicted_price']]
            predicted_df['predicted_price'] = predicted_df['predicted_price'].round().astype(int)
            predicted_df['future_year'] = predicted_df['future_year'].astype(int)
            predicted_df['year'] = predicted_df['year'].astype(int)

            # display(pandas.DataFrame(predicted_df))

            predicted_df.to_csv('../predictions/decision-tree-model-training.csv', index=False)

            print("success")

if __name__ == "__main__":
    main()

  last_rows = df.groupby(['brand_encoded', 'model_encoded']).apply(lambda x: x[model_features].iloc[-1]).reset_index(drop=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['Predicted Price'] = predicted_df['Predicted Price'].round().astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['future_year'] = predicted_df['future_year'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.or

Unnamed: 0,brand,model,year,future_year,Predicted Price
0,Audi,A1,2014,2025,9995
1,Audi,A1,2014,2026,9790
2,Audi,A1,2014,2027,9290
3,Audi,A1,2014,2028,9290
4,Audi,A1,2014,2029,9290
...,...,...,...,...,...
1935,Volkswagen,Up,2014,2030,7495
1936,Volkswagen,Up,2014,2031,7495
1937,Volkswagen,Up,2014,2032,7495
1938,Volkswagen,Up,2014,2033,7495
