# In [5]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import warnings
import matplotlib.pyplot as plt 
import seaborn as sns 
import math


# Data exploration
# NOTE(review): `data` is not assigned anywhere in the visible source; it must
# be loaded into a DataFrame before this point or every line below fails with
# a NameError.

# Show the rows ordered by year. The result is printed explicitly because a
# bare expression is only displayed when it is the last line of a notebook cell.
print(data.sort_values('Year', ascending=True))


# Summary statistics of the numeric columns.
print(data.describe())

# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
data.info()
print('\n')
print('Need to convert objects into categorical variables')

# Report missing values based on the actual check instead of a fixed message.
missing_by_column = data.isna().any()
print(missing_by_column)
if missing_by_column.any():
    print('missing data in columns:', list(missing_by_column[missing_by_column].index))
else:
    print('no missing data')


# Distinct years present in the data, in ascending order.
print(pd.Series(data['Year'].unique()).sort_values(ascending=True))

# Total Sales_Volume for every (Model, Year) pair, ordered by model and then
# by year. Region is deliberately collapsed at this stage.
model_year_keys = ['Model', 'Year']
modelsales_volume = (
    data.groupby(model_year_keys, as_index=False)['Sales_Volume']
    .sum()
    .sort_values(by=model_year_keys)
)
print(modelsales_volume.head())
print(modelsales_volume.shape)


# Small-multiples grid: one panel per model showing its yearly sales volume.
models = modelsales_volume['Model'].unique()
n_models = len(models)

# Fixed column count; the row count grows with the number of models
# (e.g. 11 models -> 3 rows x 4 cols, which leaves one spare cell).
# max(1, ...) keeps plt.subplots from raising when there are no models.
n_cols = 4
n_rows = max(1, math.ceil(n_models / n_cols))

# squeeze=False always yields a 2-D array of Axes, so flatten() is safe for
# every grid shape.
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(16, 10), sharex=True, sharey=True, squeeze=False
)
axes = axes.flatten()  # 1-D view for simple positional indexing

for ax, model_name in zip(axes, models):
    group = modelsales_volume[modelsales_volume['Model'] == model_name].sort_values('Year')
    # Volumes are plotted in millions to keep the shared y-axis readable.
    ax.plot(group['Year'], group['Sales_Volume'] / 1_000_000, marker='o', color='tab:blue')
    ax.set_title(model_name)
    ax.grid(True, linestyle='--', alpha=0.5)

# Label only outer plots
for ax in axes:
    ax.label_outer()

# Hide the spare cells at the end of the last row so they do not show up as
# empty framed panels. Because the x-axis is shared, the panel directly above
# a hidden cell becomes the lowest visible one in its column and needs its
# tick labels switched back on.
for spare_idx in range(n_models, len(axes)):
    axes[spare_idx].set_visible(False)
    above_idx = spare_idx - n_cols
    if above_idx >= 0:
        axes[above_idx].tick_params(labelbottom=True)

# Common labels
fig.text(0.5, 0.04, 'Year', ha='center', fontsize=12)
fig.text(0.04, 0.5, 'Sales Volume (Millions)', va='center', rotation='vertical', fontsize=12)

fig.suptitle('Sales Volume Trend by Model', fontsize=16)
# Leave the top 4% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

print('\n')
# Single chart overlaying the yearly sales volume of every model.
fig, ax = plt.subplots(figsize=(10, 6))

# groupby yields (model label, rows of that model); one line per model.
for series_label, yearly in modelsales_volume.groupby('Model'):
    ax.plot(yearly['Year'], yearly['Sales_Volume'], marker='o', label=series_label)

ax.set(
    xlabel='Year',
    ylabel='Sales Volume',
    title='Sales Volume Trend by Model',
)
ax.legend(title='Model')
plt.show()


# Total Sales_Volume for every (Region, Year) pair. The visible source never
# defines `regionsales_volume`, so it is built here the same way as the
# per-model aggregate.
regionsales_volume = (
    data.groupby(['Region', 'Year'], as_index=False)['Sales_Volume']
    .sum()
    .sort_values(by=['Region', 'Year'])
)

# Small-multiples grid: one panel per region showing its yearly sales volume.
# Region-specific names are used so the model list (`models`) is not overwritten.
regions = regionsales_volume['Region'].unique()
n_regions = len(regions)

# Fixed column count; the row count grows with the number of regions.
# max(1, ...) keeps plt.subplots from raising when there are no regions.
n_cols = 3
n_rows = max(1, math.ceil(n_regions / n_cols))

# squeeze=False always yields a 2-D array of Axes, so flatten() is safe for
# every grid shape.
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(16, 10), sharex=True, sharey=True, squeeze=False
)
axes = axes.flatten()  # 1-D view for simple positional indexing

for ax, region_name in zip(axes, regions):
    group = regionsales_volume[regionsales_volume['Region'] == region_name].sort_values('Year')
    # Volumes are plotted in millions to keep the shared y-axis readable.
    ax.plot(group['Year'], group['Sales_Volume'] / 1_000_000, marker='o', color='tab:blue')
    ax.set_title(region_name)
    ax.grid(True, linestyle='--', alpha=0.5)

# Label only outer plots
for ax in axes:
    ax.label_outer()

# Hide the spare cells at the end of the last row so they do not show up as
# empty framed panels. Because the x-axis is shared, the panel directly above
# a hidden cell becomes the lowest visible one in its column and needs its
# tick labels switched back on.
for spare_idx in range(n_regions, len(axes)):
    axes[spare_idx].set_visible(False)
    above_idx = spare_idx - n_cols
    if above_idx >= 0:
        axes[above_idx].tick_params(labelbottom=True)

# Common labels
fig.text(0.5, 0.04, 'Year', ha='center', fontsize=12)
fig.text(0.04, 0.5, 'Sales Volume (Millions)', va='center', rotation='vertical', fontsize=12)

fig.suptitle('Sales Volume Trend by Region', fontsize=16)
# Leave the top 4% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

print('\n')
# Single chart overlaying the yearly sales volume of every region.
fig, ax = plt.subplots(figsize=(10, 6))

# groupby yields (region label, rows of that region); one line per region.
for series_label, yearly in regionsales_volume.groupby('Region'):
    ax.plot(yearly['Year'], yearly['Sales_Volume'], marker='o', label=series_label)

ax.set(
    xlabel='Year',
    ylabel='Sales Volume',
    title='Sales Volume Trend by Region',
)
ax.legend(title='Region')
plt.show()


#  Although sales are cyclical, I see a growth pattern in North America, with successive peaks pushing to new volumes. This cyclicality is expected given the sensitivity of car sales to economic conditions, interest rates, etc.


# Creating classification model that will predict whether a model will be classified as high or low in Sales, based on Features


# Step 1: Convert Datatypes that are objects into ML Friendly format

# List the object-typed columns and show a sample of their values.
object_columns = data.select_dtypes(include='object')
print(list(object_columns.columns))
print('\n')
print(object_columns.head())

# All encoded columns are added to a copy so that `data` stays untouched.
data2 = data.copy()
print('\n')

# Integer-encode Region with its own encoder.
label_encoder = LabelEncoder()
data2['Region_coded'] = label_encoder.fit_transform(data2['Region'])

# Human-readable lookup: original region label -> integer code.
region_codes = label_encoder.transform(label_encoder.classes_)
region_mapping = dict(zip(label_encoder.classes_, region_codes))

# Print the lookup, one "label: code" pair per line.
print("Region Encoding Dictionary:")
for region_name, region_code in region_mapping.items():
    print(f"{region_name}: {region_code}")

# Peek at the frame with the new column (displayed only when this is the last
# expression of a notebook cell).
data2.head()

from sklearn.preprocessing import LabelEncoder


def _encode_column(frame, column, encoder):
    """Fit *encoder* on ``frame[column]`` and store the codes in ``<column>_coded``.

    Returns a dict that maps every original label to its integer code.
    """
    frame[f'{column}_coded'] = encoder.fit_transform(frame[column])
    return dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))


# One dedicated encoder per categorical column, so each fitted encoder stays
# available afterwards.
label_encoder_model = LabelEncoder()
label_encoder_color = LabelEncoder()
label_encoder_fuel = LabelEncoder()
label_encoder_trans = LabelEncoder()
label_encoder_sales = LabelEncoder()  # encoder for the sales classification column

# Add the *_coded columns to data2 and keep a label -> code lookup for each.
model_mapping = _encode_column(data2, 'Model', label_encoder_model)
color_mapping = _encode_column(data2, 'Color', label_encoder_color)
fuel_type_mapping = _encode_column(data2, 'Fuel_Type', label_encoder_fuel)
transmission_mapping = _encode_column(data2, 'Transmission', label_encoder_trans)

# Sales_Classification gets integer codes as well. LabelEncoder assigns the
# codes in sorted order of the class labels; the mapping printed below shows
# which label received which code.
sales_mapping = _encode_column(data2, 'Sales_Classification', label_encoder_sales)

# Print every lookup as "label: code" lines under its heading.
for heading, mapping in (
    ("Model Encoding Dictionary:", model_mapping),
    ("\nColor Encoding Dictionary:", color_mapping),
    ("\nFuel Type Encoding Dictionary:", fuel_type_mapping),
    ("\nTransmission Encoding Dictionary:", transmission_mapping),
    ("\nSales Classification Encoding Dictionary:", sales_mapping),
):
    print(heading)
    for label, code in mapping.items():
        print(f"{label}: {code}")

# Peek at the frame with the new columns (displayed only when this is the last
# expression of a notebook cell).
data2.head()

# Express the model year as an age relative to 2025.
data2['Age'] = 2025 - data2['Year']
data2.head()


# Modelling frame: a new DataFrame without the raw columns that now have an
# encoded or derived counterpart. data2 itself is left unchanged.
raw_columns = ['Model','Year','Region','Color','Fuel_Type','Transmission', 'Sales_Classification']
data_clean = data2.drop(columns=raw_columns)
data_clean.head()



# Silences every warning from here on (including any convergence warnings
# raised while the grid search runs).
warnings.filterwarnings('ignore')

# Splitting data into X and Y datasets
# NOTE(review): X still contains Sales_Volume. If Sales_Classification was
# derived from Sales_Volume, that column leaks the target and should be
# dropped - confirm against the data source.
X = data_clean.drop(columns='Sales_Classification_coded')
y = data_clean['Sales_Classification_coded']

# Split BEFORE scaling so that the test rows cannot influence the scaler.
# The same random_state gives the same row partition as splitting the scaled
# matrix would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Feature Scaling: the mean/std are learned from the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the training statistics; kept under its original
# name in case later code still reads it.
X_scaled = scaler.transform(X)

# Hyperparameter grid to search over
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Inverse regularization strength
    'penalty': ['l1', 'l2'],  # Regularization types
    'solver': ['liblinear', 'saga'],  # Solvers
    'max_iter': [100, 200, 300]  # Iteration cap for the solver
}

log_model = LogisticRegression(random_state=42)

# 5-fold cross-validated grid search on the training set, scored by accuracy,
# using all available cores.
grid_search = GridSearchCV(log_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Estimator refitted on the whole training set with the best parameters.
best_log_model = grid_search.best_estimator_

# Predictions for the held-out test rows.
y_pred = best_log_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

print('\n')

# Class names in code order: LabelEncoder numbers the entries of classes_
# 0..k-1, so position i in class_names belongs to code i.
class_names = [str(name) for name in label_encoder_sales.classes_]
class_codes = list(range(len(class_names)))

# Confusion Matrix: rows are the actual classes, columns the predicted ones.
# Passing labels= pins the row/column order to class_codes, so the names
# below are guaranteed to line up with the counts.
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_codes)
conf_matrix_df = pd.DataFrame(
    conf_matrix,
    index=[f'Actual {name}' for name in class_names],
    columns=[f'Predicted {name}' for name in class_names],
)

print(conf_matrix_df)

print('\n')

# Per-class precision / recall / F1, reported under the original class names.
class_report = classification_report(
    y_test, y_pred, labels=class_codes, target_names=class_names
)
print('Classification Report:')
print(class_report)

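# Output of the run above: NameError: name 'data' is not defined
# (the DataFrame `data` is never loaded in this cell, so the first statement that uses it fails).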