In [13]:
import sqlite3
import json
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load the full No_Reviews table from the local SQLite file into a DataFrame.
query = "SELECT * FROM No_Reviews"
conn = sqlite3.connect('perfume.db')
try:
    data = pd.read_sql_query(query, conn)
finally:
    # Release the database handle even if the query fails; nothing further
    # in this cell needs the connection once the rows are in memory.
    conn.close()

# Drop the 'main_accords_dirty' column
data = data.drop(columns=['main_accords_dirty'])

In [14]:
import ast

def extract_main_accords(json_str):
    """Flatten a serialized accords mapping into one comma-separated string.

    The input is parsed with ``ast.literal_eval`` (i.e. as a Python literal,
    not strict JSON). When it evaluates to a dict, the entries stored under
    the keys 'base', 'top' and 'middle' are concatenated in that order and
    joined with ', '.

    Args:
        json_str: Text holding a dict literal. Any other value is tolerated.

    Returns:
        The joined entries, or '' when the input cannot be parsed or does
        not evaluate to a dict.
    """
    try:
        accords = ast.literal_eval(json_str)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        # literal_eval raises more than SyntaxError/ValueError on malformed
        # input (e.g. TypeError for an unhashable dict key); one bad row
        # should yield '' rather than abort a whole column-wide apply.
        return ''

    if not isinstance(accords, dict):
        return ''

    notes = []
    for tier in ('base', 'top', 'middle'):
        tier_notes = accords.get(tier)
        if tier_notes is None:
            continue
        if isinstance(tier_notes, (list, tuple)):
            # str() keeps the join from failing on non-string entries.
            notes.extend(str(note) for note in tier_notes)
        else:
            # A bare scalar (typically a single string) counts as one entry
            # instead of breaking the concatenation.
            notes.append(str(tier_notes))
    return ', '.join(notes)

# Replace each raw accords literal with its flattened, comma-separated form.
# NOTE(review): the column is overwritten in place, so running this cell a
# second time feeds already-flattened text back through the parser, which
# returns '' for anything that does not evaluate to a dict — reload `data`
# before re-running.
data['Maine_accords'] = data['Maine_accords'].map(extract_main_accords)

In [15]:
# Snapshot the dtype of every column in the frame
data_types = data.dtypes

# Bucket the column names: object dtype goes to the categorical list,
# every other dtype goes to the numerical list
categorical_cols = [name for name, dtype in data_types.items() if dtype == 'object']
numerical_cols = [name for name, dtype in data_types.items() if dtype != 'object']

# Report both buckets
print(f"Categorical Columns: {categorical_cols}")
print(f"Numerical Columns: {numerical_cols}")

Categorical Columns: ['brand', 'perfume', 'notes', 'longevity', 'sillage', 'Maine_accords']
Numerical Columns: ['launch_year']


In [16]:
# Summarise column dtypes and non-null counts. DataFrame.info() writes its
# report to stdout itself and returns None, so wrapping it in print() only
# appended a stray "None" line to the output.
data.info()

# Inspect the data in the 'perfume' column
print(data['perfume'].unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37925 entries, 0 to 37924
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   brand          37925 non-null  object 
 1   perfume        37922 non-null  object 
 2   launch_year    26715 non-null  float64
 3   notes          36969 non-null  object 
 4   longevity      37925 non-null  object 
 5   sillage        37925 non-null  object 
 6   Maine_accords  37925 non-null  object 
dtypes: float64(1), object(6)
memory usage: 2.0+ MB
None
['Rose de Grasse d Or' 'CK All' 'Clean For Men Black Leather' ...
 'Island Blossom' 'Lemongrass Blend' 'Vanilla Blend']


In [18]:
# Numeric columns: fill gaps with the column median, then standardise
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode (categories not seen during fit are ignored at transform time)
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer.
# The combined preprocessor is only constructed here; this cell does not fit it.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Fit each pipeline directly on its own column subset
X_cat = categorical_transformer.fit_transform(data[categorical_cols])
X_num = numerical_transformer.fit_transform(data[numerical_cols])

# Per-column count of missing values in the raw frame
print(data.isna().sum())

brand                0
perfume              3
launch_year      11210
notes              956
longevity            0
sillage              0
Maine_accords        0
dtype: int64


In [43]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Toy two-row corpus for trying out bag-of-words encoding. It is bound to its
# own name so that the perfume DataFrame held in `data` by the earlier cells
# is not overwritten by a plain dict (which would break re-running them).
sample_data = {
    'Maine_accords': [
        "woody, musky, leather, patchouli, rose, oud",
        "amber, citrus, white floral, animalic, balsamic"
    ],
    'Another_column': [
        "another, example, of, text",
        "more, text, data"
    ]
    # Add more columns as needed
}

# Convert the dictionary to a DataFrame
df = pd.DataFrame(sample_data)

count_vectorizer = CountVectorizer(min_df=1)

# Initialize an empty DataFrame to store the transformed data
X_bow_all = pd.DataFrame()

# Loop through each column in the DataFrame
for column in df.columns:
    # Extract the text data from the column
    text_data = df[column]

    # The vectorizer is re-fitted for every column, so each column gets its
    # own vocabulary; after the loop `count_vectorizer` holds only the
    # vocabulary of the last column processed.
    X_bow = count_vectorizer.fit_transform(text_data)

    # Store each row's dense count vector as a single list-valued cell
    X_bow_all[column] = X_bow.toarray().tolist()

# Print the transformed data
print(X_bow_all)


                          Maine_accords      Another_column
0  [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1]  [1, 0, 1, 0, 1, 1]
1  [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0]  [0, 1, 0, 1, 0, 1]


In [45]:
from sklearn.model_selection import train_test_split

# Labels are the raw 'Maine_accords' text of the sample frame; features are
# every bag-of-words column except 'Maine_accords'
y = df['Maine_accords']
X = X_bow_all.drop(columns=['Maine_accords'])

# Hold out 20% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

Training set size: 1
Testing set size: 1


In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np

# Rebuild the feature frame (all bag-of-words columns but 'Maine_accords')
# and the label series (raw 'Maine_accords' text) from the sample data
feature_columns_dropped = ['Maine_accords']
X = X_bow_all.drop(columns=feature_columns_dropped)
y = df['Maine_accords']

# 80/20 train/test partition with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

def one_hot_encode(data):
    """One-hot encode binary (0/1) features into two indicator columns each.

    Every binary value ``v`` at flattened feature position ``k`` sets
    column ``2 * k + v`` of the output row to 1, so 0 maps to ``[1, 0]``
    and 1 maps to ``[0, 1]``.

    Cells holding a sequence (list, tuple or ndarray) are expanded so that
    every element becomes its own feature. Previously only the first
    element of such a cell was encoded and the rest was silently dropped.

    Args:
        data: 2-D array of shape (num_samples, num_features). Cells are
            either scalars or sequences of scalars, each equal to 0 or 1.

    Returns:
        Integer ndarray of shape (num_samples, 2 * width), where ``width``
        is the number of scalar features per row after expansion (equal to
        num_features when every cell is a scalar).

    Raises:
        ValueError: If rows expand to different widths, or a value is not
            exactly 0 or 1 (such values used to mark the wrong column or
            fail with an IndexError).
    """
    num_samples, num_features = data.shape

    # Expand every row into a flat list of scalar feature values.
    flat_rows = []
    for i in range(num_samples):
        flat = []
        for j in range(num_features):
            cell = data[i, j]
            if isinstance(cell, (list, tuple, np.ndarray)):
                flat.extend(cell)
            else:
                flat.append(cell)
        flat_rows.append(flat)

    widths = {len(flat) for flat in flat_rows}
    if len(widths) > 1:
        raise ValueError(
            "rows expand to different feature counts: %s" % sorted(widths)
        )
    # With no rows there is nothing to measure; fall back to the column count.
    width = widths.pop() if widths else num_features

    encoded_data = np.zeros((num_samples, width * 2), dtype=int)
    for i, flat in enumerate(flat_rows):
        for k, value in enumerate(flat):
            try:
                bit = int(value)
            except (TypeError, ValueError):
                raise ValueError(
                    "cannot encode non-numeric value %r" % (value,)
                ) from None
            if bit not in (0, 1) or bit != value:
                raise ValueError(
                    "expected a binary 0/1 value, got %r" % (value,)
                )
            encoded_data[i, k * 2 + bit] = 1
    return encoded_data

# Turn the feature cells of each split into 0/1 indicator matrices
X_train_encoded = one_hot_encode(X_train.to_numpy())
X_test_encoded = one_hot_encode(X_test.to_numpy())

# Logistic regression with library-default settings
logistic_regression_model = LogisticRegression()

# Fit on the encoded training split.
# NOTE(review): with the two-row sample frame the training split contains a
# single row, so this call raises ValueError (only one class present) — see
# the recorded output of this cell.
logistic_regression_model.fit(X_train_encoded, y_train)

# Mean accuracy on the encoded test split
accuracy = logistic_regression_model.score(X_test_encoded, y_test)
print(f"Accuracy: {accuracy}")


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 'woody, musky, leather, patchouli, rose, oud'

In [None]:
# Evaluate the fitted classifier on the held-out split.
# No cell visible in this notebook binds a name `model`; the only estimator
# that gets fitted is `logistic_regression_model`, and it is trained on the
# one-hot matrices, so predictions are made from X_test_encoded rather than
# the raw list-valued X_test.
y_pred = logistic_regression_model.predict(X_test_encoded)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)