In [3]:
import pandas as pd

# Load the sales dataset from the CSV file sitting next to the notebook.
data = pd.read_csv("AIML Exam 1 sales_data_50.csv")

# Perform EDA
print(data.head())      # First 5 rows
print(data.describe())  # Summary statistics of the numeric columns

# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
data.info()             # Column dtypes and non-null counts

# Number of missing values in each column
print(data.isnull().sum())


         Date    Product Region  Quantity Sold  Unit Price  Total Revenue
0  01-01-2023  Product B  South             17        0.00            0.0
1  02-01-2023  Product D   East              0       10.41            0.0
2  03-01-2023  Product D  South              0       14.44            0.0
3  04-01-2023  Product A  North             12        0.00            0.0
4  05-01-2023  Product B   East              0        0.00            0.0
       Quantity Sold  Unit Price  Total Revenue
count      50.000000   50.000000      50.000000
mean       10.420000    8.332600      76.751200
std        11.475066    7.716063     150.427496
min         0.000000    0.000000       0.000000
25%         0.000000    0.000000       0.000000
50%         4.500000   11.415000       0.000000
75%        22.000000   14.652500       0.000000
max        30.000000   19.450000     525.300000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column         No

In [11]:
import sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Handle missing values.
# Numeric columns are filled with 0 (as before); text columns get an
# explicit 'Unknown' label instead. Filling a text column such as 'Region'
# with the integer 0 would mix str and int values, which LabelEncoder
# refuses to encode. When nothing is missing the frame is left unchanged.
fill_values = {
    column: 0 if pd.api.types.is_numeric_dtype(data[column]) else 'Unknown'
    for column in data.columns
}
data = data.fillna(value=fill_values)

# Encode categorical 'Region' as integer codes (one code per distinct label)
le = LabelEncoder()
data['Region'] = le.fit_transform(data['Region'])

# Standardize numerical features
# NOTE(review): the scaler is fit on every row, before the train/test split
# performed in a later cell, so test-row statistics influence the scaling.
# Consider fitting the scaler on the training rows only.
scaler = StandardScaler()
data[['Quantity Sold', 'Unit Price']] = scaler.fit_transform(data[['Quantity Sold', 'Unit Price']])


In [50]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Define features (X) and target (y)
X = data[['Quantity Sold', 'Unit Price', 'Region']]
y = data['Total Revenue']

# Split into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models.
# The tree-based models are randomized; they are seeded with the same value
# as the split so the CV scores below are identical on every run.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42)
}

# Evaluate models using 5-fold cross-validation on the training set.
# The scorer is *negative* MAE, so values closer to 0 are better.
for name, model in models.items():
    cv_score = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
    print(f'{name} CV Score: {cv_score.mean()}')


Linear Regression CV Score: -79.17176269707366
Decision Tree CV Score: -31.852999999999998
Random Forest CV Score: -51.02142250000001


In [None]:
from sklearn.model_selection import GridSearchCV
# These two metrics are used below but were never imported, so the cell
# failed with NameError once it reached the evaluation step.
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Hyper-parameter combinations to try for the random forest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}

# Seed the forest so the selected parameters are reproducible, and rank the
# candidates by the same metric (negative MAE) used for the model comparison
# above instead of the regressor's default R^2 score.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
)
grid_search.fit(X_train, y_train)

print(f'Best Parameters: {grid_search.best_params_}')

# Evaluate optimized model on the held-out test set.
# best_estimator_ is the best configuration refit on the full training set.
optimized_model = grid_search.best_estimator_
y_pred_opt = optimized_model.predict(X_test)

rmse_opt = np.sqrt(mean_squared_error(y_test, y_pred_opt))
mae_opt = mean_absolute_error(y_test, y_pred_opt)

print(f'Optimized RMSE: {rmse_opt}')
print(f'Optimized MAE: {mae_opt}')
