In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import joblib
import streamlit as st

  from tqdm.autonotebook import tqdm, trange


In [3]:
df= pd.read_csv('prokick_competition.csv')
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,normalized_title,offer_price_amazon,mrp_amazon,offer_price_prokick,mrp_prokick,offer_price_instasports,mrp,clean_title,color,pack_size,limited_edition,speed
0,317,317,"yonex aerosensa 10 feather shuttlecock, 3 cans",,,7015.0,8250.0,,,"yonex aerosensa 10 feather shuttlecock, 3 cans",,,No,
1,318,318,"yonex 11488 wrist band, navy/red (pack of 2)",,,390.0,398.0,,,"yonex 11488 wrist band, /","navy, red",pack of 2,No,
2,319,319,yonex ac 152ex badminton net,,,1745.0,2490.0,,,yonex ac 152ex badminton net,,,No,
3,320,320,yonex ezone 98 tennis racquet,,,15745.0,26240.0,,,yonex ezone 98 tennis racquet,,,No,
4,321,321,yonex voltric lite 20i badminton racquet,,,1839.0,2790.0,,,yonex voltric lite 20i badminton racquet,,,No,


In [6]:
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df = df.reset_index(drop=True)

In [10]:
df.columns

Index(['normalized_title', 'offer_price_amazon', 'mrp_amazon',
       'offer_price_prokick', 'mrp_prokick', 'offer_price_instasports', 'mrp',
       'clean_title', 'color', 'pack_size', 'limited_edition', 'speed'],
      dtype='object')

In [85]:
df['available_amazon'] = df['offer_price_amazon'].notna().astype(int)
df['available_instasport'] = df['offer_price_instasports'].notna().astype(int)

In [87]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [88]:
embeddings = model.encode(df['clean_title'].tolist())

In [89]:
X = embeddings
y_amazon = df['available_amazon']
y_instasport = df['available_instasport']

In [90]:
X_train, X_test, y_amazon_train, y_amazon_test, y_instasport_train, y_instasport_test = train_test_split(
    X, y_amazon, y_instasport, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [91]:
# Train Amazon model
clf_amazon = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
clf_amazon.fit(X_train_scaled, y_amazon_train)

# Train Instasport model
clf_instasport = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
clf_instasport.fit(X_train_scaled, y_instasport_train)


In [92]:
def predict_availability(title):
    embedding = model.encode([title])
    embedding_scaled = scaler.transform(embedding)
    
    amazon_prob = clf_amazon.predict_proba(embedding_scaled)[0][1]
    instasport_prob = clf_instasport.predict_proba(embedding_scaled)[0][1]
    
    return {
        'Amazon': amazon_prob,
        'Instasport': instasport_prob
    }

In [93]:
print("Amazon availability distribution:")
print(df['available_amazon'].value_counts(normalize=True))
print("\nInstasport availability distribution:")
print(df['available_instasport'].value_counts(normalize=True))

Amazon availability distribution:
available_amazon
0    1.0
Name: proportion, dtype: float64

Instasport availability distribution:
available_instasport
0    0.98452
1    0.01548
Name: proportion, dtype: float64


`
   This shows that 100% of the samples are labeled as 0 for Amazon availability. In other words, there are no products in your dataset that are available on Amazon (or at least, none that have a price listed for Ama
.

2. Instasport availdicates that:
   - 98.452% of the samples are labeled as 0 (not available on Instasport)
   - Only 1.548% of the samples are labeled as 1 (available on Instasport)

These distributions reveal several important points:

1. Extreme imbalance for Amazon: There's a complete absence of positive samples (available products) for Amazon. This explains why you're getting the warnings about undefined metrics. Your model can't learn to predict availability on Amazon because there are no examples of available products in your dataset.

2. Severe imbalance for Instasport: While there are some positive samples for Instasport, the dataset is still highly imbalanced. With only about 1.5% of products available, this is a very skewed distribution.

3. Potential data quality issues: The complete absence of Amazon availability is unusual and might indicate a data collection or processing issue. You may want to double-check your data source and preprocessing steps.

4. Modeling challenges: These distributions make it very challenging to build effective predictive models:
   - For Amazon, it's currently impossible to predict availability since there are no positive examples.
   - For Instasport, the severe class imbalance will likely lead to a model that's biased towards predicting "not available" most of the time.

Given these insights, you should consider the following steps:

1. Verify data integrity: Ensure that the Amazon availability data is correct. If it's not, you'll need to fix the data collection or processing.

2. Handle class imbalance: For Instasport, you'll need to use techniques like oversampling (e.g., SMOTE), undersampling, or adjusting class weights to address the imbalance.

3. Reconsider the problem formulation: If the Amazon data is correct (i.e., truly no products are available on Amazon), you might need to reconsider including Amazon in your model. Perhaps focus on predicting Instasport availability only, or collect data from other platforms where there's more variability in availability.

4. Collect more data: If possible, try to gather more data, especially for the underrepresented classes (available products).

5. Feature engineering: Consider creating additional features that might help predict availability, as the produ

<hr>
 can build an effective predictive model.

In [95]:
df_training=pd.read_csv("attribute.csv")
df_training.head()

Unnamed: 0.1,Unnamed: 0,normalized_title,offer_price_amazon,mrp_amazon,offer_price_prokick,mrp_prokick,offer_price_instasports,mrp,clean_title,color,pack_size,limited_edition,speed
0,0,li-ning turbo strung badminton racket,1699.0,3890.0,,,,,li-ning turbo strung badminton racket,,,No,
1,1,li-ning turbo strung badminton racket,1563.08,3890.0,,,,,li-ning turbo strung badminton racket,,,No,
2,2,li-ning turbo strung badminton racket,1441.28,3890.0,,,,,li-ning turbo strung badminton racket,,,No,
3,3,li-ning turbo strung badminton racket,1554.55,3890.0,,,,,li-ning turbo strung badminton racket,,,No,
4,4,li-ning turbo strung badminton racket,1444.15,3890.0,,,,,li-ning turbo strung badminton racket,,,No,


In [97]:
df_training = df_training.loc[:, ~df_training.columns.str.contains('^Unnamed')]
df_training = df_training.reset_index(drop=True)

In [98]:
df_training.shape

(2433, 12)

In [99]:
df_training['available_amazon'] = df_training['offer_price_amazon'].notna().astype(int)
df_training['available_instasport'] = df_training['offer_price_instasports'].notna().astype(int)

In [100]:
embeddings = model.encode(df_training['clean_title'].tolist())

In [101]:
X = embeddings
y_amazon = df_training['available_amazon']
y_instasport = df_training['available_instasport']

X_train, X_test, y_amazon_train, y_amazon_test, y_instasport_train, y_instasport_test = train_test_split(
    X, y_amazon, y_instasport, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [102]:
# Train Amazon model
clf_amazon = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
clf_amazon.fit(X_train_scaled, y_amazon_train)

# Train Instasport model
clf_instasport = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
clf_instasport.fit(X_train_scaled, y_instasport_train)


In [103]:
def predict_availability(title):
    embedding = model.encode([title])
    embedding_scaled = scaler.transform(embedding)
    
    amazon_prob = clf_amazon.predict_proba(embedding_scaled)[0][1]
    instasport_prob = clf_instasport.predict_proba(embedding_scaled)[0][1]
    
    return {
        'Amazon': amazon_prob,
        'Instasport': instasport_prob
    }

In [104]:
df_training.columns

Index(['normalized_title', 'offer_price_amazon', 'mrp_amazon',
       'offer_price_prokick', 'mrp_prokick', 'offer_price_instasports', 'mrp',
       'clean_title', 'color', 'pack_size', 'limited_edition', 'speed',
       'available_amazon', 'available_instasport'],
      dtype='object')

In [111]:
df_training.to_csv("df_training.csv")

In [116]:
print("Amazon availability overall distribution:")
print(df_training['available_amazon'].value_counts(normalize=True))
print("\nInstasport availability overall distribution:")
print(df_training['available_instasport'].value_counts(normalize=True))

Amazon availability overall distribution:
available_amazon
0    0.870941
1    0.129059
Name: proportion, dtype: float64

Instasport availability overall distribution:
available_instasport
1    0.739005
0    0.260995
Name: proportion, dtype: float64


In [117]:
y_amazon_pred = clf_amazon.predict(X_test_scaled)
amazon_accuracy = accuracy_score(y_amazon_test, y_amazon_pred)
amazon_f1 = f1_score(y_amazon_test, y_amazon_pred)
amazon_precision = precision_score(y_amazon_test, y_amazon_pred)
amazon_recall = recall_score(y_amazon_test, y_amazon_pred)

In [118]:
y_instasport_pred = clf_instasport.predict(X_test_scaled)
instasport_accuracy = accuracy_score(y_instasport_test, y_instasport_pred)
instasport_f1 = f1_score(y_instasport_test, y_instasport_pred)
instasport_precision = precision_score(y_instasport_test, y_instasport_pred)
instasport_recall = recall_score(y_instasport_test, y_instasport_pred)

In [119]:
print("Amazon Model Metrics:")
print(f"Accuracy: {amazon_accuracy:.4f}")
print(f"F1 Score: {amazon_f1:.4f}")
print(f"Precision: {amazon_precision:.4f}")
print(f"Recall: {amazon_recall:.4f}")

Amazon Model Metrics:
Accuracy: 0.9630
F1 Score: 0.8548
Precision: 0.9138
Recall: 0.8030


In [120]:
print("\nInstasport Model Metrics:")
print(f"Accuracy: {instasport_accuracy:.4f}")
print(f"F1 Score: {instasport_f1:.4f}")
print(f"Precision: {instasport_precision:.4f}")
print(f"Recall: {instasport_recall:.4f}")


Instasport Model Metrics:
Accuracy: 0.9671
F1 Score: 0.9774
Precision: 0.9638
Recall: 0.9914


In [121]:
# clf_amazon = joblib.load('clf_amazon.joblib')
# clf_instasport = joblib.load('clf_instasport.joblib')
# scaler = joblib.load('scaler.joblib')

# @st.cache(allow_output_mutation=True)
# def load_model():
#     return SentenceTransformer('paraphrase-MiniLM-L6-v2')

# model = load_model()

In [122]:
def predict_availability(title):
    embedding = model.encode([title])
    embedding_scaled = scaler.transform(embedding)
    # Predict probabilities for Amazon and Instasport
    amazon_prob = clf_amazon.predict_proba(embedding_scaled)[0][1]
    instasport_prob = clf_instasport.predict_proba(embedding_scaled)[0][1]
    avg_confidence = (amazon_prob + instasport_prob) / 2
    
    return amazon_prob, instasport_prob, avg_confidence

def get_availability_status(probability):
    if probability > 0.7:
        return "Highly likely to be available"
    elif probability > 0.3:
        return "May be available"
    else:
        return "Unlikely to be available"

# User interface
while True:
    user_input = input("Enter a product name (or 'quit' to exit): ")
    
    if user_input.lower() == 'quit':
        break
    
    amazon_prob, instasport_prob, avg_confidence = predict_availability(user_input)
    
    print(f"\nPredictions for '{user_input}':")
    print(f"Amazon: {amazon_prob:.2%} - {get_availability_status(amazon_prob)}")
    print(f"Instasport: {instasport_prob:.2%} - {get_availability_status(instasport_prob)}")
    print(f"Average confidence: {avg_confidence:.2%}")
    print()

print("Thank you for using the product availability predictor!")

Enter a product name (or 'quit' to exit):  quit


Thank you for using the product availability predictor!


In [123]:
import joblib
joblib.dump(clf_amazon, 'clf_amazon.joblib')
joblib.dump(clf_instasport, 'clf_instasport.joblib')
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']