In [54]:
import re
import string
import nltk

import pandas as pd
import numpy as np

from datetime import datetime

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score

from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus that the tokenisation step reads from.
# TODO: Put this somewhere else - perhaps when listener starts
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [55]:
# import ai_functions
# from ai_functions import calculate_metrics_and_threshold, write_dataframe_to_json_on_s3, write_model_to_s3
# from ai_functions import download_model_from_s3, pull_dataframe_from_s3_as_json
# from definitions import NGRAM_LOWER, NGRAM_UPPER, MIN_DF, MAX_FEATURES, MAX_ITER, CV_FOLDS, INNOCENT_SCORE_FACTOR

In [56]:
# Load the labelled listings and keep only the two columns used below:
# the free-text "description" and its "relevant" label.
text_df = pd.read_csv("text_relevant.csv")[["description", "relevant"]]

In [57]:
def clean_text(row):
    """Normalise the ``description`` entry of a row for downstream tokenisation.

    The text is lower-cased, every ASCII punctuation character is replaced by a
    space, and line breaks are replaced by a single space so that words on
    adjacent lines are not fused into one token.

    Args:
        row: Mapping-like object (for example a DataFrame row) that has a
            ``"description"`` entry.

    Returns:
        str: The cleaned text. A missing description (``None`` / NaN) yields
        an empty string; any other non-string value is stringified first.
    """
    text = row["description"]
    if not isinstance(text, str):
        # A missing description is not a string; treat it as empty instead of
        # failing on ``.lower()``. Other scalars are converted to text.
        text = "" if pd.isna(text) else str(text)

    text = text.lower()  # make text lower case
    text = re.sub("[%s]" % re.escape(string.punctuation), " ", text)  # punctuation -> space
    # Line breaks become a space rather than nothing; deleting them would glue
    # the last word of one line to the first word of the next.
    text = re.sub(r"[\r\n]+", " ", text)

    # for i in product_name:
    #     text = re.sub(i, " " + i + " ", text) # replace "i" with " i " in description.

    return text

In [58]:
# Run clean_text over every row and store the result in a new "clean_text" column,
# then display the frame.
text_df["clean_text"] = text_df.apply(clean_text, axis="columns")
text_df

Unnamed: 0,description,relevant,clean_text
0,Boys Among Us Hoodies Super Cool Print Clothes...,1,boys among us hoodies super cool print clothes...
1,Cat And Dog Cardboard Printing Jigsaw Puzzle M...,0,cat and dog cardboard printing jigsaw puzzle m...
2,0098 30 Pcs 21pack Sensory 4 Piece Puzzle Fidg...,0,0098 30 pcs 21pack sensory 4 piece puzzle fidg...
3,Boys Sweatshirt Among us 3D Hoodies Cool Fashi...,1,boys sweatshirt among us 3d hoodies cool fashi...
4,20cm Among Us Soft Plush Toys Vocal Butt Toy S...,0,20cm among us soft plush toys vocal butt toy s...
...,...,...,...
464,Fashionable summer 3D unicorn T-shirts are amo...,0,fashionable summer 3d unicorn t shirts are amo...
465,Funny New Mommy Baby 2022 Family Look White Co...,0,funny new mommy baby 2022 family look white co...
466,Children Super Cool Game Among Us T-shirt Summ...,0,children super cool game among us t shirt summ...
467,New Game Among Us T Shirt Boys Girls Harajuku ...,0,new game among us t shirt boys girls harajuku ...


In [59]:
# Built on first use and then shared by every call, so the stop-word corpus is
# read once instead of once per token and the tokenizer is not rebuilt per row.
_ENGLISH_STOPWORDS = None
_WORD_TOKENIZER = None


def tokenise_text(row):
    """Tokenise a row's ``clean_text`` and drop English stop words.

    The text is split into ``\\w+`` runs, tokens found in NLTK's English
    stop-word list are removed, and the remaining tokens are joined back
    together with single spaces.

    Args:
        row: Mapping-like object (for example a DataFrame row) that has a
            ``"clean_text"`` entry.

    Returns:
        str: The space-separated tokens that are not stop words.
    """
    global _ENGLISH_STOPWORDS, _WORD_TOKENIZER
    if _ENGLISH_STOPWORDS is None:
        # frozenset gives constant-time membership tests in the filter below.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    if _WORD_TOKENIZER is None:
        _WORD_TOKENIZER = nltk.tokenize.RegexpTokenizer(r"\w+")  # tokenize sentence without punctuation

    tokens = _WORD_TOKENIZER.tokenize(row["clean_text"])
    # join strings into sentence after stopwords removal
    return " ".join(token for token in tokens if token not in _ENGLISH_STOPWORDS)

In [60]:
# Run tokenise_text over every row and store the result in a new "tokenise_text" column,
# then display the frame.
text_df["tokenise_text"] = text_df.apply(tokenise_text, axis="columns")
text_df

Unnamed: 0,description,relevant,clean_text,tokenise_text
0,Boys Among Us Hoodies Super Cool Print Clothes...,1,boys among us hoodies super cool print clothes...,boys among us hoodies super cool print clothes...
1,Cat And Dog Cardboard Printing Jigsaw Puzzle M...,0,cat and dog cardboard printing jigsaw puzzle m...,cat dog cardboard printing jigsaw puzzle manuf...
2,0098 30 Pcs 21pack Sensory 4 Piece Puzzle Fidg...,0,0098 30 pcs 21pack sensory 4 piece puzzle fidg...,0098 30 pcs 21pack sensory 4 piece puzzle fidg...
3,Boys Sweatshirt Among us 3D Hoodies Cool Fashi...,1,boys sweatshirt among us 3d hoodies cool fashi...,boys sweatshirt among us 3d hoodies cool fashi...
4,20cm Among Us Soft Plush Toys Vocal Butt Toy S...,0,20cm among us soft plush toys vocal butt toy s...,20cm among us soft plush toys vocal butt toy s...
...,...,...,...,...
464,Fashionable summer 3D unicorn T-shirts are amo...,0,fashionable summer 3d unicorn t shirts are amo...,fashionable summer 3d unicorn shirts among us ...
465,Funny New Mommy Baby 2022 Family Look White Co...,0,funny new mommy baby 2022 family look white co...,funny new mommy baby 2022 family look white co...
466,Children Super Cool Game Among Us T-shirt Summ...,0,children super cool game among us t shirt summ...,children super cool game among us shirt summer...
467,New Game Among Us T Shirt Boys Girls Harajuku ...,0,new game among us t shirt boys girls harajuku ...,new game among us shirt boys girls harajuku su...


In [61]:
# Preview the first five rows with the raw, cleaned and tokenised text side by side.
text_df.head(5)

Unnamed: 0,description,relevant,clean_text,tokenise_text
0,Boys Among Us Hoodies Super Cool Print Clothes...,1,boys among us hoodies super cool print clothes...,boys among us hoodies super cool print clothes...
1,Cat And Dog Cardboard Printing Jigsaw Puzzle M...,0,cat and dog cardboard printing jigsaw puzzle m...,cat dog cardboard printing jigsaw puzzle manuf...
2,0098 30 Pcs 21pack Sensory 4 Piece Puzzle Fidg...,0,0098 30 pcs 21pack sensory 4 piece puzzle fidg...,0098 30 pcs 21pack sensory 4 piece puzzle fidg...
3,Boys Sweatshirt Among us 3D Hoodies Cool Fashi...,1,boys sweatshirt among us 3d hoodies cool fashi...,boys sweatshirt among us 3d hoodies cool fashi...
4,20cm Among Us Soft Plush Toys Vocal Butt Toy S...,0,20cm among us soft plush toys vocal butt toy s...,20cm among us soft plush toys vocal butt toy s...


In [69]:
# Vectorizer settings, kept as named constants so they can be tuned in one place.
NGRAM_LOWER, NGRAM_UPPER = 1, 3  # bounds passed together as ngram_range
MIN_DF = 5                       # passed as min_df
MAX_FEATURES = 1_000_000         # passed as max_features

count_vectorizer = CountVectorizer(
    ngram_range=(NGRAM_LOWER, NGRAM_UPPER),
    min_df=MIN_DF,
    max_features=MAX_FEATURES,
)

In [75]:
# Fit the vectorizer on the tokenised descriptions and keep the transformed result.
# The column is cast to a NumPy unicode array first so every entry is passed as text.
vectors = count_vectorizer.fit_transform(
    text_df["tokenise_text"].values.astype("U")
)

In [74]:
# Preview the first five rows of the working frame.
# NOTE(review): the saved output of this cell lists a `count_vectorizer` column that no
# visible cell creates, and this cell's execution count (74) is lower than the preceding
# cell's (75) - restart the kernel and run all cells to refresh the output.
text_df.head(5)

Unnamed: 0,description,relevant,clean_text,tokenise_text,count_vectorizer
0,Boys Among Us Hoodies Super Cool Print Clothes...,1,boys among us hoodies super cool print clothes...,boys among us hoodies super cool print clothes...,
1,Cat And Dog Cardboard Printing Jigsaw Puzzle M...,0,cat and dog cardboard printing jigsaw puzzle m...,cat dog cardboard printing jigsaw puzzle manuf...,
2,0098 30 Pcs 21pack Sensory 4 Piece Puzzle Fidg...,0,0098 30 pcs 21pack sensory 4 piece puzzle fidg...,0098 30 pcs 21pack sensory 4 piece puzzle fidg...,
3,Boys Sweatshirt Among us 3D Hoodies Cool Fashi...,1,boys sweatshirt among us 3d hoodies cool fashi...,boys sweatshirt among us 3d hoodies cool fashi...,
4,20cm Among Us Soft Plush Toys Vocal Butt Toy S...,0,20cm among us soft plush toys vocal butt toy s...,20cm among us soft plush toys vocal butt toy s...,


In [79]:
# Training settings.
MAX_ITER = 100_000     # passed to the classifier as max_iter
CV_FOLDS = 5           # presumably the cross-validation fold count; not used in the cells shown
URGENCY_WEIGHTING = 1  # not used in the cells shown - TODO confirm where this is consumed

model = LogisticRegression(max_iter=MAX_ITER)