In [78]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import numpy as np

In [80]:
# Set the working directory
working_directory = os.getcwd()
print(f"Current working directory: {working_directory}")


Current working directory: /Users/buboyencarnacion


In [82]:
# Load cleaned datasets
customers_df = pd.read_csv(os.path.join(working_directory, 'cleaned_customers_data.csv'))
products_df = pd.read_csv(os.path.join(working_directory, 'cleaned_products_data.csv'))
transactions_df = pd.read_csv(os.path.join(working_directory, 'cleaned_transactions_data.csv'))


In [84]:
# Convert Product_ID to string in both datasets
products_df['Product_ID'] = products_df['Product_ID'].astype(str)
transactions_df['Product_ID'] = transactions_df['Product_ID'].astype(str)

In [86]:
# Convert Company_ID to string in both datasets
customers_df['Company_ID'] = customers_df['Company_ID'].astype(str)
transactions_df['Company_ID'] = transactions_df['Company_ID'].astype(str)


In [88]:
# Ensure no missing values in key columns
assert not transactions_df[['Product_ID', 'Company_ID']].isnull().any().any()
assert not customers_df[['Company_ID']].isnull().any().any()
assert not products_df[['Product_ID']].isnull().any().any()


In [90]:
# Step 1: Merge Transactions with Products on 'Product_ID'
merged_df = transactions_df.merge(products_df, on='Product_ID')

# Step 2: Merge the result with Customers on 'Company_ID'
merged_df = merged_df.merge(customers_df, on='Company_ID')

In [92]:
# Drop unnecessary columns (optional)
merged_df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [94]:
# Check for missing values
missing_values = merged_df.isnull().sum()
print("\nMissing Values in Merged DataFrame:")
print(missing_values[missing_values > 0])


Missing Values in Merged DataFrame:
Transaction_Date    6199
dtype: int64


In [96]:
### Merged Dataset Preprocessing ###

# Fill missing values
numerical_columns = merged_df.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_columns:
    merged_df[col].fillna(merged_df[col].median(), inplace=True)

categorical_columns = merged_df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    merged_df[col].fillna(merged_df[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df[col].fillna(merged_df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df[col].fillna(merged_df[col].mode()[0], inplace=True)


In [98]:
# Label Encoding Categorical Features
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    merged_df[col] = le.fit_transform(merged_df[col])
    label_encoders[col] = le

In [100]:
# Scaling Numerical Features
scaler = StandardScaler()
numerical_features = ['Product_Price', 'Quantity', 'Total_Cost', 'Company_Profit']
numerical_features = [col for col in numerical_features if col in merged_df.columns]

if numerical_features:
    merged_df[numerical_features] = scaler.fit_transform(merged_df[numerical_features])

In [102]:
### Define Features and Target ###

# Define X (features) and y (target)
X = merged_df.drop(columns=['Product_Name', 'Transaction_Date'])
y = merged_df['Product_Name']


In [104]:
# Ensure all features are numeric
print("\nData types of features:")
print(X.dtypes)



Data types of features:
Transaction_ID            float64
Company_ID                  int64
Product_ID                  int64
Quantity                  float64
Product_Price_x           float64
Total_Cost                float64
Recency                   float64
Purchase_Frequency        float64
Total_Spending            float64
Product_Price_y           float64
Price_Range                 int64
Company_Name                int64
Company_Profit            float64
Address                     int64
Profitability_Category      int64
Region                      int64
dtype: object


In [106]:
### Split Data ###
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [108]:
### Model 1: Multinomial Logistic Regression ###
logistic_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
logistic_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [110]:
# Predictions
logistic_predictions = logistic_model.predict(X_test)


In [112]:
# Evaluation Metrics
logistic_accuracy = accuracy_score(y_test, logistic_predictions)
logistic_report = classification_report(y_test, logistic_predictions, output_dict=True)

print("\nMultinomial Logistic Regression Results:")
print(f"Accuracy: {logistic_accuracy}")
print(classification_report(y_test, logistic_predictions))


Multinomial Logistic Regression Results:
Accuracy: 0.3263094521372667
              precision    recall  f1-score   support

           0       0.26      0.27      0.27        67
           1       0.32      0.19      0.24        97
           2       0.12      0.14      0.13        69
           3       0.12      0.04      0.06        78
           4       0.00      0.00      0.00        86
           5       0.29      0.41      0.34        85
           6       0.62      0.79      0.69        80
           7       0.44      0.34      0.38        83
           8       0.25      0.17      0.20        78
           9       0.14      0.06      0.08        86
          10       0.25      0.04      0.07        72
          11       0.24      0.40      0.30        91
          12       0.22      0.40      0.29        72
          13       0.14      0.05      0.08        92
          14       0.20      0.46      0.28        81
          15       0.00      0.00      0.00        90
          

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [114]:
### Model 2: Random Forest Classifier ###
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)

In [116]:
# Predictions
rf_predictions = random_forest_model.predict(X_test)

In [118]:
# Evaluation Metrics
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_report = classification_report(y_test, rf_predictions, output_dict=True)

print("\nRandom Forest Classifier Results:")
print(f"Accuracy: {rf_accuracy}")
print(classification_report(y_test, rf_predictions))



Random Forest Classifier Results:
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        67
           1       1.00      1.00      1.00        97
           2       1.00      1.00      1.00        69
           3       1.00      1.00      1.00        78
           4       1.00      1.00      1.00        86
           5       1.00      1.00      1.00        85
           6       1.00      1.00      1.00        80
           7       1.00      1.00      1.00        83
           8       1.00      1.00      1.00        78
           9       1.00      1.00      1.00        86
          10       1.00      1.00      1.00        72
          11       1.00      1.00      1.00        91
          12       1.00      1.00      1.00        72
          13       1.00      1.00      1.00        92
          14       1.00      1.00      1.00        81
          15       1.00      1.00      1.00        90
          16       1.00      1.0

In [120]:
### Model Comparison Matrix ###
comparison_table = pd.DataFrame({
    "Model": ["Multinomial Logistic Regression", "Random Forest Classifier"],
    "Accuracy": [logistic_accuracy, rf_accuracy],
    "Precision (Weighted Avg)": [
        logistic_report["weighted avg"]["precision"],
        rf_report["weighted avg"]["precision"]
    ],
    "Recall (Weighted Avg)": [
        logistic_report["weighted avg"]["recall"],
        rf_report["weighted avg"]["recall"]
    ],
    "F1-Score (Weighted Avg)": [
        logistic_report["weighted avg"]["f1-score"],
        rf_report["weighted avg"]["f1-score"]
    ]
})

In [122]:
# Save comparison table to CSV
comparison_table.to_csv("model_comparison.csv", index=False)
print("Model Comparison Table:")
print(comparison_table)

Model Comparison Table:
                             Model  Accuracy  Precision (Weighted Avg)  \
0  Multinomial Logistic Regression  0.326309                  0.263638   
1         Random Forest Classifier  1.000000                  1.000000   

   Recall (Weighted Avg)  F1-Score (Weighted Avg)  
0               0.326309                 0.272098  
1               1.000000                 1.000000  


In [124]:
### Predict on the Same Sample Data ###
sample_data = X_test.iloc[:12]  # Predicting 12 datapoints

logistic_predictions = logistic_model.predict(sample_data)
rf_predictions = random_forest_model.predict(sample_data)

In [126]:
# Get original label classes
product_classes = label_encoders["Product_Name"].classes_

In [128]:
# Function to safely decode labels
def safe_inverse_transform(predictions, encoder_classes):
    return np.array([
        encoder_classes[pred] if pred < len(encoder_classes) else "Unknown"
        for pred in predictions
    ])

In [132]:
# Convert predictions back to original labels
logistic_predictions_original = safe_inverse_transform(logistic_predictions, product_classes)
rf_predictions_original = safe_inverse_transform(rf_predictions, product_classes)
actual_original = safe_inverse_transform(y_test.loc[sample_data.index].values, product_classes)

In [134]:
# Combine predictions into a single DataFrame
comparison_df = pd.DataFrame({
    'Sample Index': sample_data.index,
    'Actual': actual_original,
    'Logistic Regression Prediction': logistic_predictions_original,
    'Random Forest Prediction': rf_predictions_original
})

In [136]:
# Display the comparison table
print("\nComparison of Predictions Side by Side:")
print(comparison_df)


Comparison of Predictions Side by Side:
    Sample Index                        Actual Logistic Regression Prediction  \
0           5758        CustomerScope Insights         DashSync Analytics Hub   
1           4987            NexGen Segmentator             SegmentX Targeting   
2           4322           SalesSync Optimizer               BudgetMaster Pro   
3           2821            SegmentX Targeting             SegmentX Targeting   
4           4696           ForecastXcelerator    FinSphere Intelligence Suite   
5           1858            SegmentX Targeting             SegmentX Targeting   
6           1820           ForecastXcelerator    FinSphere Intelligence Suite   
7           7877  InsightMax Insights Platform         DashSync Analytics Hub   
8           6676           OptiFlow Automation            OptiFlow Automation   
9           7070            NexGen Segmentator             SegmentX Targeting   
10           982                     Product 9                      

In [138]:
### Predict on New Sample Input ###
company_name = "Sky Industries"
if company_name in label_encoders["Company_Name"].classes_:
    encoded_company_name = label_encoders["Company_Name"].transform([company_name])
else:
    print(f"Warning: '{company_name}' not found in training data. Using default encoding.")
    encoded_company_name = [-1]  # or any other default value

sample_input = pd.DataFrame({
    "Quantity": [2],
    "Total_Cost": [6000],
    "Company_Name": encoded_company_name,
    "Product_Price_x": [12000],  # Ensure the feature name matches the training data
    "Company_Profit": [100000],
    # Add any other features that were used during training
})



In [140]:
# Fill missing features with default values
for col in X.columns:
    if col not in sample_input.columns:
        sample_input[col] = 0  # or any other default value


In [142]:
# Ensure the order of columns matches the training data
sample_input = sample_input[X.columns]


In [144]:
# Predict using Logistic Regression
logistic_sample_prediction = logistic_model.predict(sample_input)
logistic_sample_prediction_original = label_encoders["Product_Name"].inverse_transform(logistic_sample_prediction)

In [150]:
# Predict using Random Forest
rf_sample_prediction = random_forest_model.predict(sample_input)
rf_sample_prediction_original = label_encoders["Product_Name"].inverse_transform(rf_sample_prediction)

print("\nPredicted Product (Logistic Regression):", logistic_sample_prediction_original)
print("Predicted Product (Random Forest):", rf_sample_prediction_original)


Predicted Product (Logistic Regression): ['SegmentX Targeting']
Predicted Product (Random Forest): ['SegmentX Targeting']


In [152]:
### Explanation for Choosing Multinomial Logistic Regression ###
"""
Multinomial Logistic Regression is a good choice for this problem because:

1. Multiclass Classification: It is designed for multiclass classification problems, where the target variable can belong to one of three or more classes. In your case, predicting the product name fits this scenario.

2. Probabilistic Interpretation: It provides probabilities for each class, which can be useful for understanding the confidence of the predictions.

3. Linear Decision Boundaries: It works well when the relationship between the features and the target variable is approximately linear. This can be a good starting point before trying more complex models.

4. Baseline Model: It serves as a strong baseline model. If it performs well, it indicates that the problem can be solved with simpler models. If not, it provides a benchmark to compare more complex models against.
"""

'\nMultinomial Logistic Regression is a good choice for this problem because:\n\n1. Multiclass Classification: It is designed for multiclass classification problems, where the target variable can belong to one of three or more classes. In your case, predicting the product name fits this scenario.\n\n2. Probabilistic Interpretation: It provides probabilities for each class, which can be useful for understanding the confidence of the predictions.\n\n3. Linear Decision Boundaries: It works well when the relationship between the features and the target variable is approximately linear. This can be a good starting point before trying more complex models.\n\n4. Baseline Model: It serves as a strong baseline model. If it performs well, it indicates that the problem can be solved with simpler models. If not, it provides a benchmark to compare more complex models against.\n'