In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

import warnings
warnings.filterwarnings("ignore")

import nltk
import re
import string
from collections import Counter
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [24]:
print(pd.__version__)

2.1.4


In [25]:
df = pd.read_csv('/content/train.csv', engine='python', on_bad_lines='skip', quotechar='"')


In [26]:
df.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


In [27]:
df.shape

(277723, 6)

In [28]:
df['BULLET_POINTS'].iloc[0]

'[LUXURIOUS & APPEALING: Beautiful custom-made curtains to decorate any home or office | Includes inbuilt tieback to hold the curtain | Completely finished and ready to hang on walls & windows,MATERIAL: Luxurious & versatile fabric with a natural finish | High colour fastness | State-of-the-art digital printing ensures colour consistency and prevents any fading | Eyelets; Cotton Canvas; Width 4.5feet (54inch) | Multicolour | PACKAGE: 2 Room Curtains Eyelets | SIZE: Height 5 feet (60 inch); SET OF 2 PCS,BLACKOUT CURTAIN: 100% opaque & heavy premium cotton canvas fabric | Tight knitted, long life & durable fabric | Printing only on front side with a plain colour back side,MADE TO PERFECTION: Large eyelets at the top to put hanging hooks | Perfectly tailored seams for durability | Refined stitching with a matching thread color,QUALITY ASSURED: Gentle wash with similar colors in cold water | Avoid direct sunlight to prevent fading | Dispatched after MULTIPLE QUALITY CHECKS]'

In [29]:
df["PRODUCT_ID"].nunique()

277723

In [30]:
df.shape

(277723, 6)

In [31]:
## PRODUCT_ID is unique for every row (nunique equals the row count), so use it as the index
df.set_index('PRODUCT_ID', inplace=True)

In [32]:
df.index
## PRODUCT_ID is now the index of the dataframe

Index([1925202, 2673191, 2765088, 1594019,  283658, 2152929,  413758, 2026580,
       2050239, 2998633,
       ...
       1215501, 1049822,  306651,   88598, 2595290,  809965,  616968, 1593738,
        120557,  251395],
      dtype='int64', name='PRODUCT_ID', length=277723)

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 277723 entries, 1925202 to 251395
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   TITLE            277721 non-null  object 
 1   BULLET_POINTS    174263 non-null  object 
 2   DESCRIPTION      134706 non-null  object 
 3   PRODUCT_TYPE_ID  277723 non-null  int64  
 4   PRODUCT_LENGTH   277723 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 12.7+ MB


In [34]:
def df_information(data):
  """Collect a quick structural summary of a DataFrame.

  Args:
    data: pandas DataFrame to summarise.

  Returns:
    Tuple ``(shape, info, describe, column_name, null_values)``:
      shape        -- ``data.shape``
      info         -- text report produced by ``data.info()`` (a ``str``)
      describe     -- ``data.describe()``
      column_name  -- ``data.columns``
      null_values  -- per-column count of missing values

  Note:
    The ``info`` report is captured and returned rather than printed.
  """
  import io  # local import: only needed to capture the info() report

  # DataFrame.info() writes its report to a buffer (stdout by default) and
  # returns None, so storing its return value would always yield None.
  # Capture the text in a buffer instead.
  info_buffer = io.StringIO()
  data.info(buf=info_buffer)
  info = info_buffer.getvalue()

  shape = data.shape
  describe = data.describe()
  column_name = data.columns
  null_values = data.isnull().sum()

  return shape, info, describe, column_name, null_values

In [35]:
df.isna().sum()

Unnamed: 0,0
TITLE,2
BULLET_POINTS,103460
DESCRIPTION,143017
PRODUCT_TYPE_ID,0
PRODUCT_LENGTH,0


In [36]:
## Build the model input text by joining the three free-text columns
# Missing values are replaced with a single space first: NaN is a float and
# cannot be concatenated with the neighbouring strings.
text_columns = df[["TITLE", "BULLET_POINTS", "DESCRIPTION"]].fillna(" ")
new_df = df.copy()
new_df["inputs"] = (
    text_columns["TITLE"] + " " + text_columns["BULLET_POINTS"] + " " + text_columns["DESCRIPTION"]
)

In [37]:
new_df.head()

Unnamed: 0_level_0,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH,inputs
PRODUCT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98,ArtzFolio Tulip Flowers Blackout Curtain for D...
2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...
2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495,PRIKNIK Horn Red Electric Air Horn Compressor ...
1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574,ALISHAH Women's Cotton Ankle Length Leggings C...
283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424,The United Empire Loyalists: A Chronicle of th...


In [38]:
new_df = new_df.drop(["TITLE", "BULLET_POINTS", "DESCRIPTION", "PRODUCT_TYPE_ID"], axis=1)

In [39]:
new_df.head()

Unnamed: 0_level_0,PRODUCT_LENGTH,inputs
PRODUCT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1925202,2125.98,ArtzFolio Tulip Flowers Blackout Curtain for D...
2673191,393.7,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...
2765088,748.031495,PRIKNIK Horn Red Electric Air Horn Compressor ...
1594019,787.401574,ALISHAH Women's Cotton Ankle Length Leggings C...
283658,598.424,The United Empire Loyalists: A Chronicle of th...


In [40]:
new_df.dropna(axis=0, inplace=True)

In [41]:
new_df.isna().sum()

Unnamed: 0,0
PRODUCT_LENGTH,0
inputs,0


In [42]:
new_df.shape

(277723, 2)

In [43]:
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [44]:
# Compiled once at definition time; the function is applied row by row.
# The ranges below cover symbol/pictograph blocks only. The original single
# range U+24C2..U+1F251 also contained CJK ideographs, kana, Hangul and many
# other scripts, so any text written in those scripts was silently deleted.
_EMOJI_AND_SYMBOL_PATTERN = re.compile(
    "["
    "\U000024C2-\U00002BFF"  # Enclosed alphanumerics (from U+24C2) through Misc Symbols and Arrows
                             # (includes Miscellaneous Symbols, hearts and Dingbats)
    "\U00003030\U0000303D\U00003297\U00003299"  # Emoji that live inside CJK blocks
    "\U0000FE0F"             # Variation selector-16 (emoji presentation)
    "\U0001F000-\U0001F251"  # Mahjong, dominoes, cards, enclosed characters, flags
    "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F680-\U0001F6FF"  # Transport & Map symbols
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "]+",
    flags=re.UNICODE,
)


def remove_emojis_and_symbols(text):
    """Delete emojis and pictographic symbols from ``text``.

    Args:
        text: input string.

    Returns:
        ``text`` with every run of matched emoji/symbol characters removed
        (replaced by the empty string). Letters of any script are kept.
    """
    return _EMOJI_AND_SYMBOL_PATTERN.sub(r'', text)

In [45]:
# will start preprocessing the inputs
def preprocessing(data):
    """Normalise the free-text ``inputs`` column.

    Each sentence goes through, in this order: HTML stripping, lower-casing,
    URL removal, replacement of every character outside ``[a-z0-9]`` with a
    space, English stop-word removal and WordNet lemmatisation.

    The order matters: the character filter removes ``<``, ``>``, ``:`` and
    ``/``, so HTML and URL stripping must run before it. Run after it, as they
    were before, they can never match, and tag names and URL fragments end up
    as tokens.

    Args:
        data: DataFrame with a string column named ``inputs``.

    Returns:
        A new DataFrame equal to ``data`` except for the cleaned ``inputs``
        column. ``data`` itself is not modified, so the cell can be re-run.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    url_pattern = re.compile(
        r'(http|https|ftp|ssh)://[\w_-]+(?:\.[\w_-]+)+[\w.,@?^=%&/~+#-]*[\w@?^=%&/~+#-]?'
    )

    def clean_sentence(sentence):
        # Strip HTML while the markup is still intact. The separator keeps
        # text from adjacent tags (e.g. consecutive <li> items) apart.
        # NOTE(review): text containing a bare "<" may be treated as markup
        # by the parser -- confirm this is acceptable for the dataset.
        sentence = BeautifulSoup(sentence, "lxml").get_text(separator=" ")

        # Convert to lowercase
        sentence = sentence.lower()

        # Remove URLs before the punctuation they depend on is stripped.
        sentence = url_pattern.sub(" ", sentence)

        # Replace every non-alphanumeric character with a space. This also
        # covers emojis and other symbols, so no separate emoji pass is
        # needed (deleting them outright would glue neighbouring words).
        sentence = re.sub("[^a-z0-9]", " ", sentence)

        # Drop stopwords, then lemmatize what is left.
        tokens = [
            lemmatizer.lemmatize(word)
            for word in sentence.split()
            if word not in stop_words
        ]
        return " ".join(tokens)

    # assign() returns a new frame instead of overwriting the caller's column.
    return data.assign(inputs=data["inputs"].apply(clean_sentence))

In [46]:
cleaned_df     = preprocessing(new_df)

In [47]:
cleaned_df.head()

Unnamed: 0_level_0,PRODUCT_LENGTH,inputs
PRODUCT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1925202,2125.98,artzfolio tulip flower blackout curtain door w...
2673191,393.7,mark spencer girl pyjama set t86 2561c navy mi...
2765088,748.031495,priknik horn red electric air horn compressor ...
1594019,787.401574,alishah woman cotton ankle length legging comb...
283658,598.424,united empire loyalist chronicle great migration


In [48]:
cleaned_df.shape

(277723, 2)

In [49]:
def vocabulary(df):
    """Count how often each whitespace-separated token occurs in ``df.inputs``.

    Args:
        df: DataFrame whose ``inputs`` column holds strings.

    Returns:
        Tuple ``(sorted_words, unique_words)``: a list of ``(word, count)``
        pairs ordered from most to least frequent, and the set of distinct
        words.
    """
    tally = Counter()
    for text in df["inputs"]:
        tally.update(text.split())

    by_frequency = tally.most_common()
    distinct = set(tally.keys())
    return by_frequency, distinct

In [None]:
word_counts, unique_words = vocabulary(cleaned_df)

In [None]:
word_counts

In [None]:
len(unique_words)

In [None]:
# Words that occur fewer than five times in the whole corpus.
words = [w for w, c in word_counts if c < 5]

In [None]:
len(words)

In [None]:
def remove_rare_wors(word_counts, min_count=5):
  """Select the vocabulary words that are frequent enough to keep.

  Args:
    word_counts: iterable of ``(word, count)`` pairs.
    min_count: a word is kept only when its count is strictly greater than
      this value (a word seen exactly ``min_count`` times is dropped).
      Defaults to 5.

  Returns:
    List of kept words, in the order they appear in ``word_counts``.
    Tokens made up only of digits are always dropped.
  """
  return [
      word
      for word, count in word_counts
      if count > min_count and not word.isdigit()
  ]

In [None]:
clenaed_vocabulary_   = remove_rare_wors(word_counts)

In [None]:
type(45)

In [None]:
len(clenaed_vocabulary_)

In [None]:
clenaed_vocabulary_

In [None]:
def word_to_int(clenaed_vocabulary_):
  """Map every vocabulary word to an integer id following list order.

  Args:
    clenaed_vocabulary_: iterable of words.

  Returns:
    Dict ``{word: id}``. If a word occurs more than once, the id of its last
    occurrence wins.
  """
  # Ids start at 1, so 0 is never assigned -- presumably kept free for
  # padding; TODO confirm once padding is implemented.
  return {token: position for position, token in enumerate(clenaed_vocabulary_, start=1)}

In [None]:
word_to_numeric   = word_to_int(clenaed_vocabulary_)

In [None]:
word_to_numeric

In [None]:
def sentence_to_int(data, word_index=None):
  """Encode each sentence in ``data['inputs']`` as a list of integer ids.

  Args:
    data: DataFrame whose ``inputs`` column holds whitespace-separated text.
    word_index: dict mapping word -> id. When omitted, the notebook-level
      ``word_to_numeric`` mapping is used.

  Returns:
    A new DataFrame equal to ``data`` except that ``inputs`` holds lists of
    ids. Words missing from the mapping are skipped. ``data`` itself is left
    untouched, so the cell can be re-run safely.
  """
  if word_index is None:
    word_index = word_to_numeric

  def encode(sentence):
    # Membership is checked against the dict, not the vocabulary list: the
    # dict keys are the same words (the mapping is built from that list) and
    # the lookup is constant-time instead of a scan of the whole list.
    return [word_index[word] for word in sentence.split() if word in word_index]

  return data.assign(inputs=data["inputs"].apply(encode))

In [None]:
cleaned_df_to_numeric = sentence_to_int(cleaned_df)

In [None]:
cleaned_df_to_numeric["length"] = cleaned_df_to_numeric["inputs"].apply(lambda x: len(x))

In [None]:
cleaned_df_to_numeric.head()

In [None]:
cleaned_df_to_numeric.describe()

In [None]:
cleaned_df_to_numeric.shape

In [None]:
def padding():
  """Placeholder for sequence padding; not implemented yet, returns None."""
  return None

In [85]:
# Rows that the CSV parser could not handle, collected by process_bad_lines.
bad_lines = list()


def process_bad_lines(line):
    """Store a malformed row for later inspection.

    Used as the ``on_bad_lines`` callback of ``pd.read_csv``.

    Args:
        line: the offending row as passed in by the parser.

    Returns:
        None. Per the pandas documentation, a callback that returns None
        makes the parser skip the row.
    """
    bad_lines.append(line)
    return None


df = pd.read_csv('/content/train.csv', engine='python', on_bad_lines=process_bad_lines)

# Now you can inspect `bad_lines` to see what kind of problems they have
print(bad_lines[:10])

[]


In [81]:
df.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


In [86]:
df.shape

(277723, 6)

In [6]:
import pandas as pd

def debug_bad_lines(line):
    """Print a row the CSV parser rejected.

    Used as the ``on_bad_lines`` callback of ``pd.read_csv``.

    Args:
        line: the offending row as passed in by the parser.

    Returns:
        None (implicitly), which per the pandas documentation makes the
        parser skip the row.
    """
    message = f"Problematic line: {line}"
    print(message)

df = pd.read_csv('/content/train.csv', engine='python', on_bad_lines=debug_bad_lines)


In [7]:
df.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


In [8]:
df.shape

(277723, 6)

In [14]:
import csv

# Copy a CSV file, keeping only the rows that have the expected column count.
def preprocess_csv(input_file, output_file, expected_columns=6, encoding='utf-8'):
    """Write the well-formed rows of ``input_file`` to ``output_file``.

    No repair is attempted: a row whose field count differs from
    ``expected_columns`` is dropped and counted. Completely blank lines are
    dropped without being counted.

    Args:
        input_file: path of the CSV to read.
        output_file: path of the CSV to write (overwritten if it exists).
        expected_columns: number of fields a valid row must have.
        encoding: text encoding used for both files.

    Returns:
        Number of non-blank rows that were dropped because of a wrong field
        count, so callers can see how much data was lost.
    """
    skipped_rows = 0

    # Both handles use newline='' as the csv module documentation requires.
    # Without it, line endings embedded in quoted fields are translated on
    # read. The explicit encoding avoids depending on the platform locale.
    with open(input_file, 'r', newline='', encoding=encoding) as infile, \
         open(output_file, 'w', newline='', encoding=encoding) as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        for row in reader:
            if len(row) == expected_columns:
                writer.writerow(row)
            elif row:
                skipped_rows += 1

    return skipped_rows

# Call the function
preprocess_csv('/content/train.csv', '/content/train_cleaned.csv')


In [13]:
l = [1,2, 3, 4]
len(l)

4

In [15]:
df = pd.read_csv("/content/train_cleaned.csv")

In [16]:
df.shape

(277723, 6)

In [17]:
def fix_unclosed_quotes(line):
    """Close a dangling double quote on a single CSV line.

    A line with an even number of ``"`` characters is treated as balanced and
    returned unchanged. The earlier regex rewrote every quoted field and so
    corrupted valid values such as ``"a, b"``. Only when the count is odd is
    the last quote assumed to be an unclosed opening quote. It is closed just
    before the next comma, or at the end of the line (before the line
    terminator) when no comma follows.

    NOTE(review): this is a per-line heuristic. A quoted field that
    legitimately spans several lines also yields an odd count on its first
    line and would be closed early -- confirm the file has no such fields.

    Args:
        line: one raw line of the CSV file, with or without its line ending.

    Returns:
        The line, with at most one ``"`` inserted.
    """
    if line.count('"') % 2 == 0:
        return line

    last_quote = line.rfind('"')
    next_comma = line.find(',', last_quote + 1)
    if next_comma != -1:
        return line[:next_comma] + '"' + line[next_comma:]

    # No field separator after the dangling quote: close it at end of line,
    # keeping whatever line terminator was present.
    content = line.rstrip('\r\n')
    return content + '"' + line[len(content):]

# Stream the raw CSV through fix_unclosed_quotes one line at a time and write
# the result to a separate file, so the original stays untouched.
with open('/content/train.csv', 'r') as source, open('/content/train_fixed.csv', 'w') as sink:
    sink.writelines(fix_unclosed_quotes(raw_line) for raw_line in source)