In [None]:
#importing required libraries
# NOTE(review): this cell mixes imports with side effects (NLTK downloads and a
# Google Drive mount), and `pandas` / `math` are each imported twice.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
import glob
import pyarrow.parquet as pq
import pandas as pd
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# NLTK resources fetched at import time; presumably these back word_tokenize,
# nltk.pos_tag and WordNetLemmatizer used further down - TODO confirm the
# resource names against the installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
import math
import re
import string
import pyarrow as pa
import os
import math
import seaborn as sb
# Colab-only: mounts the user's Google Drive at /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')
# One-off extraction of the data archive, kept commented out
#!unzip /content/gdrive/MyDrive/data.zip -d /content/gdrive/MyDrive/Etsy_Data

In [None]:
#Setting the data root and loading the parquet data stored under <root>/parquet/train
PATH = "Your Data Path"
p_f = PATH + '/parquet/train'
test = pd.read_parquet(p_f, engine='pyarrow')

In [None]:
#Checking all Columns in Data
# The bare expression makes the column index the cell's displayed output
test.columns

In [None]:
#Checking all the information related to data
# Prints a summary of the frame (columns, non-null counts, dtypes, memory usage)
test.info()

In [None]:
#Dropping the columns which are not required
unused_columns = [
    'type', 'room', 'craft_type', 'recipient', 'material', 'occasion',
    'holiday', 'art_subject', 'style', 'shape', 'pattern',
]
test = test.drop(columns=unused_columns)

In [None]:
#Basic cleaning of the text columns: fill missing values, lower-case, and strip
#URLs, escaped newlines and every character that is not a letter or whitespace.
def clean_text_column(column):
    """Return a cleaned copy of a text column.

    Missing values become empty strings and the text is lower-cased. URLs,
    literal backslash-n sequences and every character other than ASCII
    letters and whitespace are replaced by a space, after which runs of
    whitespace are collapsed to a single space and the ends are trimmed.

    Parameters
    ----------
    column : pandas.Series
        Column of strings; may contain missing values.

    Returns
    -------
    pandas.Series
        The cleaned strings, on the same index as ``column``.
    """
    cleaned = column.fillna("").str.lower()
    # regex=True is passed explicitly: from pandas 2.0 on, ``str.replace``
    # treats the pattern as a literal string by default, which would silently
    # turn every substitution below into a no-op.
    # URLs are removed first, while their punctuation is still present to
    # delimit them; the dot after "www" is escaped so it only matches a dot.
    cleaned = cleaned.str.replace(r'http\S+|www\.\S+', ' ', regex=True)
    # A literal backslash followed by "n" (an escaped newline left in the text).
    cleaned = cleaned.str.replace(r'\\n', ' ', regex=True)
    # Digits are not letters, so this pass also removes them; no separate
    # \d+ substitution is needed. Replacing with a space (rather than nothing)
    # keeps words that were separated only by punctuation from being fused.
    cleaned = cleaned.str.replace(r'[^A-Za-z\s]', ' ', regex=True)
    # Tidy up the padding introduced by the substitutions above.
    cleaned = cleaned.str.replace(r'\s+', ' ', regex=True).str.strip()
    return cleaned


for text_column in ('title', 'description', 'tags'):
    test[text_column] = clean_text_column(test[text_column])

In [None]:
#Removing stopwords
nltk.download('stopwords')
# Stored as a set: remove_stop_words tests membership once per token, and a set
# answers that in constant time where the original list needed a linear scan.
stop_words = set(stopwords.words('english'))
def remove_stop_words(text):
    """Drop stop words and very short tokens from a whitespace-separated string.

    A missing value yields an empty string. Otherwise the text is split on
    whitespace, tokens of one or two characters and tokens found in the
    module-level ``stop_words`` collection are discarded, and the remaining
    tokens are joined back together with single spaces.
    """
    if pd.isna(text):
        return ''
    kept_tokens = []
    for token in text.split():
        # Length is checked first, so short tokens never reach the lookup.
        if len(token) <= 2 or token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Same stop-word filter on each of the three text columns
for column_name in ('title', 'description', 'tags'):
    test[column_name] = test[column_name].apply(remove_stop_words)

In [None]:
#Lemmatising data
# Single module-level lemmatizer instance; lemmatization_of_text calls its
# .lemmatize() for every token.
lemmatizer = WordNetLemmatizer()

def wordnet_pos(tag):
    """Translate a POS tag into the part-of-speech code used by WordNet.

    Only the first character of ``tag`` is inspected: ``J`` maps to adjective,
    ``R`` to adverb, ``V`` to verb and ``N`` to noun. Every other tag,
    including an empty one, falls back to noun.

    Parameters
    ----------
    tag : str
        POS tag for one token (the second item of the pairs returned by
        ``nltk.pos_tag``).

    Returns
    -------
    str
        One of ``'a'``, ``'r'``, ``'v'`` or ``'n'``.
    """
    # These one-letter codes are the values of nltk.corpus.wordnet's
    # ADJ / ADV / VERB / NOUN constants. They are spelled out here because
    # `wordnet` is not imported in this notebook, so referring to
    # `wordnet.ADJ` etc. raised NameError on the first call.
    prefix_to_pos = {'J': 'a', 'R': 'r', 'V': 'v', 'N': 'n'}
    return prefix_to_pos.get(tag[:1], 'n')

def lemmatization_of_text(text):
    """Lemmatize every token of ``text`` and return them joined by spaces.

    The text is tokenized with ``word_tokenize`` and tagged with
    ``nltk.pos_tag``; each token is then lemmatized by the module-level
    ``lemmatizer`` using the part of speech that ``wordnet_pos`` derives
    from its tag.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    return ' '.join(
        lemmatizer.lemmatize(token, wordnet_pos(pos_tag))
        for token, pos_tag in tagged_tokens
    )

In [None]:
# Lemmatize each text column in turn (same order as before)
for column_name in ('description', 'title', 'tags'):
    test[column_name] = test[column_name].apply(lemmatization_of_text)

In [None]:
#Concatenating title, description and tags (space-separated) into one new column
text_columns = ['title', 'description', 'tags']
combined_text = test[text_columns[0]]
for column_name in text_columns[1:]:
    combined_text = combined_text + ' ' + test[column_name]
test['combine'] = combined_text

In [None]:
#Predicting one variable at a time
#Logistic regression for predicting Top Category variable

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score,accuracy_score

# One train/validation split over the whole data set, made on the raw text so
# that the validation rows play no part in building the vocabulary.
text_train, text_val, y_train, y_test = train_test_split(
    test['combine'], test['top_category_id'], test_size=0.2, random_state=42)

# The vectorizer and the model are each fitted exactly once. Calling
# fit_transform / fit again for every slice of the data does not train
# incrementally: each call replaces the vocabulary and the learned weights, so
# the surviving model would only ever have seen the last slice. The
# bag-of-words matrix is sparse, which keeps a single full fit manageable.
vectorizer = CountVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_val)

model_top = LogisticRegression(max_iter=1000)
model_top.fit(X_train, y_train)

# True / predicted labels kept as plain lists, one pair per split
y_true_train = list(y_train)
y_pred_train = list(model_top.predict(X_train))
y_true_val = list(y_test)
y_pred_val = list(model_top.predict(X_test))

# Metrics are computed once, after all predictions are available
train_recall = recall_score(y_true_train, y_pred_train, average='weighted')
train_f1_score = f1_score(y_true_train, y_pred_train, average='weighted')
train_accuracy = accuracy_score(y_true_train, y_pred_train)

val_recall = recall_score(y_true_val, y_pred_val, average='weighted')
val_f1_score = f1_score(y_true_val, y_pred_val, average='weighted')
val_accuracy = accuracy_score(y_true_val, y_pred_val)

print("Top Category - Recall: {:.4f}, F1 Score: {:.4f}, Accuracy: {:.4f}".format(train_recall, train_f1_score, train_accuracy))
print("Val Top Category - Recall: {:.4f}, F1 Score: {:.4f}, Accuracy: {:.4f}".format(val_recall, val_f1_score, val_accuracy))

In [None]:
#Predicting multiple variables at the same time
#One logistic regression per target (top and bottom category), sharing the same features
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, accuracy_score
from tqdm import tqdm


def weighted_scores(y_true, y_pred):
    """Return ``(weighted recall, weighted F1, accuracy)`` for one set of predictions."""
    return (
        recall_score(y_true, y_pred, average='weighted'),
        f1_score(y_true, y_pred, average='weighted'),
        accuracy_score(y_true, y_pred),
    )


# One train/validation split shared by both targets, made on the raw text so
# that the validation rows play no part in building the vocabulary.
(text_train, text_val,
 y_train_top, y_test_top,
 y_train_bottom, y_test_bottom) = train_test_split(
    test['combine'], test['top_category_id'], test['bottom_category_id'],
    test_size=0.2, random_state=42)

# The vectorizer and each model are fitted exactly once. Re-running
# fit_transform / fit for every slice of the data replaces the vocabulary and
# the learned weights instead of extending them, so the surviving models would
# only ever have seen the last slice.
vectorizer = CountVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_val)

# Initialize logistic regression models
model_top = LogisticRegression(max_iter=1000)
model_bottom = LogisticRegression(max_iter=1000)

# Fit both models; the progress bar advances once per fitted model
training_jobs = [(model_top, y_train_top), (model_bottom, y_train_bottom)]
for model, labels in tqdm(training_jobs, desc="Training Progress"):
    model.fit(X_train, labels)

# True / predicted labels for the top category
y_true_train_top = list(y_train_top)
y_pred_train_top = list(model_top.predict(X_train))
y_true_val_top = list(y_test_top)
y_pred_val_top = list(model_top.predict(X_test))

# True / predicted labels for the bottom category
y_true_train_bottom = list(y_train_bottom)
y_pred_train_bottom = list(model_bottom.predict(X_train))
y_true_val_bottom = list(y_test_bottom)
y_pred_val_bottom = list(model_bottom.predict(X_test))

# Calculate evaluation metrics for top category
train_recall_top, train_f1_score_top, train_accuracy_top = weighted_scores(y_true_train_top, y_pred_train_top)
val_recall_top, val_f1_score_top, val_accuracy_top = weighted_scores(y_true_val_top, y_pred_val_top)

# Calculate evaluation metrics for bottom category
train_recall_bottom, train_f1_score_bottom, train_accuracy_bottom = weighted_scores(y_true_train_bottom, y_pred_train_bottom)
val_recall_bottom, val_f1_score_bottom, val_accuracy_bottom = weighted_scores(y_true_val_bottom, y_pred_val_bottom)

# Print evaluation metrics for top category
print("Top Category - Train Recall: {:.4f}, F1 Score: {:.4f}, Accuracy: {:.4f}".format(train_recall_top, train_f1_score_top, train_accuracy_top))
print("Top Category - Val Recall: {:.4f}, F1 Score: {:.4f}, Accuracy: {:.4f}".format(val_recall_top, val_f1_score_top, val_accuracy_top))

# Print evaluation metrics for bottom category
print("Bottom Category - Train Recall: {:.4f}, F1 Score: {:.4f}, Accuracy: {:.4f}".format(train_recall_bottom, train_f1_score_bottom, train_accuracy_bottom))
print("Bottom Category - Val Recall: {:.4f}, F1 Score: {:.4f}, Accuracy: {:.4f}".format(val_recall_bottom, val_f1_score_bottom, val_accuracy_bottom))
