In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd


# Correctly loading all datasets again in this session
green_non_organic_cabbage = pd.read_csv('Datasets/Green_Non_Organic_Cabbage_Merged.csv')
red_onion = pd.read_csv('Datasets/Integrated_Red_Onion_Data.csv')
white_onion = pd.read_csv('Datasets/Integrated_White_Onion_Data.csv')
yellow_onion = pd.read_csv('Datasets/Integrated_Yellow_Onion_Data.csv')
red_cabbage = pd.read_csv('Datasets/Red_Cabbage_Merged.csv')

# Getting statistical summary of the datasets
datasets_summary = {
    "Green Non-Organic Cabbage": green_non_organic_cabbage.describe(),
    "Red Onion": red_onion.describe(),
    "White Onion": white_onion.describe(),
    "Yellow Onion": yellow_onion.describe(),
    "Red Cabbage": red_cabbage.describe()
}


# List of datasets
datasets = [green_non_organic_cabbage, red_onion, white_onion, yellow_onion, red_cabbage]

# Columns to drop
columns_to_drop = ['Organic', 'Environment', 'Region', 'Class']

# Dropping the columns from each dataset
for dataset in datasets:
    dataset.drop(columns=[col for col in columns_to_drop if col in dataset.columns], inplace=True)

# Combining the datasets
combined_data = pd.concat(datasets, ignore_index=True)

green_non_organic_cabbage['Commodity'] = green_non_organic_cabbage['Commodity'].fillna('Green Non-Organic Cabbage')
red_onion['Commodity'] = red_onion['Commodity'].fillna('Red Onion')
white_onion['Commodity'] = white_onion['Commodity'].fillna('White Onion')
yellow_onion['Commodity'] = yellow_onion['Commodity'].fillna('Yellow Onion')
red_cabbage['Commodity'] = red_cabbage['Commodity'].fillna('Red Cabbage')

# Combining all the dataframes
combined_data = pd.concat([
    green_non_organic_cabbage,
    red_onion,
    white_onion,
    yellow_onion,
    red_cabbage
], ignore_index=True)

# Convert 'Date' to datetime, automatically inferring the datetime format
combined_data['Date'] = pd.to_datetime(combined_data['Date'], errors='coerce', infer_datetime_format=True)

# After conversion, check if there are any NaT (Not a Time) entries, which indicate failed conversions
print("Number of NaT entries:", combined_data['Date'].isna().sum())

# Extract day, month, and year from 'Date'
combined_data['Day'] = combined_data['Date'].dt.day
combined_data['Month'] = combined_data['Date'].dt.month
combined_data['Year'] = combined_data['Date'].dt.year

# Create rolling averages for temperature and precipitation
combined_data.sort_values(by='Date', inplace=True)  # Ensure data is sorted by date for rolling calculations
combined_data['RollingAvgTempC'] = combined_data.groupby('Commodity')['AvgTempC'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())
combined_data['RollingAvgPrecip'] = combined_data.groupby('Commodity')['AvgPrecipitation'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

# One-hot encode categorical variables 'Commodity' and 'Variety'
combined_data = pd.get_dummies(combined_data, columns=['Commodity', 'Variety'], drop_first=True)



Number of NaT entries: 1705


  combined_data['Date'] = pd.to_datetime(combined_data['Date'], errors='coerce', infer_datetime_format=True)


In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Load and prepare your data
# Assuming 'combined_data' is ready and has been preprocessed (without NaN handling or scaling)
X = combined_data.drop(['Weighted Avg Price', 'Date', 'Unit', 'Number of Stores'], axis=1)
y = combined_data['Weighted Avg Price']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline that includes data scaling, imputation, and regression
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
    ('scaler', StandardScaler()),  # Important for SVR
    ('svr', SVR(kernel='rbf', C=100, gamma='scale', epsilon=0.1))
])

# Fit the model
pipeline.fit(X_train, y_train)

# Making predictions
predictions = pipeline.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


param_grid = {
    'svr__C': [0.1, 1, 10, 100],
    'svr__gamma': ['scale', 'auto'],
    'svr__epsilon': [0.01, 0.1, 1],
    'svr__kernel': ['rbf', 'linear']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validated R² score:", grid_search.best_score_)

# Evaluate on test data
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"Mean Squared Error on Test Set: {mse}")
print(f"R^2 Score on Test Set: {r2}")


Mean Squared Error: 0.5263216637832836
R^2 Score: 0.3735794902319748
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters: {'svr__C': 1, 'svr__epsilon': 0.1, 'svr__gamma': 'scale', 'svr__kernel': 'rbf'}
Best cross-validated R² score: 0.4176953220562881
Mean Squared Error on Test Set: 0.4791354550031159
R^2 Score on Test Set: 0.4297398404361116
