# Amazon ML Challenge

## Importing Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import csv
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn import preprocessing

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Importing Test Data

In [2]:
path = "drive/MyDrive/projects/amazon-ml/dataset/test.csv"
df_test = pd.read_csv(path)
df_test.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID
0,604373,Manuel d'Héliogravure Et de Photogravure En Re...,,,6142
1,1729783,DCGARING Microfiber Throw Blanket Warm Fuzzy P...,[QUALITY GUARANTEED: Luxury cozy plush polyest...,<b>DCGARING Throw Blanket</b><br><br> <b>Size ...,1622
2,1871949,I-Match Auto Parts Front License Plate Bracket...,"[Front License Plate Bracket Made Of Plastic,D...",Replacement for The Following Vehicles:2020 LE...,7540
3,1107571,PinMart Gold Plated Excellence in Service 1 Ye...,[Available as a single item or bulk packed. Se...,Our Excellence in Service Lapel Pins feature a...,12442
4,624253,"Visual Mathematics, Illustrated by the TI-92 a...",,,6318


### Importing Train Data

In [3]:
path = "drive/MyDrive/projects/amazon-ml/dataset/train.csv"
df_train = pd.read_csv(path)
df_train.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


## Data Pre-Processing

In [4]:
punctuation_signs = list("?:!.,;")
nltk.download('punkt')
nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()
nltk.download('stopwords')
stop_words = list(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Cleaning Training Data

In [5]:
df_train['Title'] = df_train['TITLE'].str.replace("\r", " ")
df_train['Title'] = df_train['Title'].str.replace("\n", " ")
df_train['Title'] = df_train['Title'].str.replace("    ", " ")
df_train['Title'] = df_train['Title'].str.replace('"', '')
df_train['Title'] = df_train['Title'].str.lower()
for punct_sign in punctuation_signs:
  df_train['Title'] = df_train['Title'].str.replace(punct_sign, '')
df_train['Title'] = df_train['Title'].str.replace("'s", "")

  df_train['Title'] = df_train['Title'].str.replace(punct_sign, '')


In [6]:
final_cols = ["Title", "PRODUCT_LENGTH"]
df_train = df_train[final_cols]
df_train = df_train.iloc[:1000, :]

In [9]:
df_train.isna().sum()

Title             0
PRODUCT_LENGTH    0
dtype: int64

In [10]:
df_train.head()

Unnamed: 0,Title,PRODUCT_LENGTH
0,artzfolio tulip flowers blackout curtain for d...,2125.98
1,marks & spencer girls' pyjama sets t86_2561c_n...,393.7
2,priknik horn red electric air horn compressor ...,748.031495
3,alishah women cotton ankle length leggings com...,787.401574
4,the united empire loyalists a chronicle of the...,598.424


### Cleaning Testing Data

In [11]:
df_test['Title'] = df_test['TITLE'].str.replace("\r", " ")
df_test['Title'] = df_test['Title'].str.replace("\n", " ")
df_test['Title'] = df_test['Title'].str.replace("    ", " ")
df_test['Title'] = df_test['Title'].str.replace('"', '')
df_test['Title'] = df_test['Title'].str.lower()
for punct_sign in punctuation_signs:
  df_test['Title'] = df_test['Title'].str.replace(punct_sign, '')
df_test['Title'] = df_test['Title'].str.replace("'s", "")

  df_test['Title'] = df_test['Title'].str.replace(punct_sign, '')


In [12]:
final_cols = ["Title", "PRODUCT_ID"]
df_test = df_test[final_cols]

In [13]:
df_test["Title"].fillna("No Data", inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["Title"].fillna("No Data", inplace = True)


In [14]:
df_test.isna().sum()

Title         0
PRODUCT_ID    0
dtype: int64

In [15]:
df_test.head()

Unnamed: 0,Title,PRODUCT_ID
0,manuel d'héliogravure et de photogravure en re...,604373
1,dcgaring microfiber throw blanket warm fuzzy p...,1729783
2,i-match auto parts front license plate bracket...,1871949
3,pinmart gold plated excellence in service 1 ye...,1107571
4,visual mathematics illustrated by the ti-92 an...,624253


### Separating Data

In [16]:
X_train, X_test, y_train = df_train["Title"], df_test["Title"], df_train["PRODUCT_LENGTH"]

### Vectorization

In [17]:
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=5)
print('1')
X_train_vectors_tfidf = tfidf.fit_transform(X_train)
print(X_train_vectors_tfidf.shape)
print('1')
X_test_vectors_tfidf = tfidf.transform(X_test)
print(X_test_vectors_tfidf.shape)

1
(1000, 406)
1
(734736, 406)


### Encoding

In [18]:
label_encoder = preprocessing.LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

## Training Data

In [19]:
lr_tfidf=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
lr_tfidf.fit(X_train_vectors_tfidf, y_train)

## Predicting

In [20]:
y_predict = lr_tfidf.predict(X_test_vectors_tfidf)

In [22]:
y_predict

array([124, 363, 124, ..., 124, 201,   9])

In [23]:
y_predict.shape

(734736,)

In [24]:
d = {"PRODUCT_LENGTH" : y_predict}

In [25]:
df1 = pd.DataFrame(data=d)
df1

Unnamed: 0,PRODUCT_LENGTH
0,124
1,363
2,124
3,160
4,124
...,...
734731,87
734732,124
734733,124
734734,201


In [26]:
df_new = pd.concat([df_test, df1], axis = 1)
df_new.head()

Unnamed: 0,Title,PRODUCT_ID,PRODUCT_LENGTH
0,manuel d'héliogravure et de photogravure en re...,604373,124
1,dcgaring microfiber throw blanket warm fuzzy p...,1729783,363
2,i-match auto parts front license plate bracket...,1871949,124
3,pinmart gold plated excellence in service 1 ye...,1107571,160
4,visual mathematics illustrated by the ti-92 an...,624253,124


In [28]:
l = ["PRODUCT_ID", "PRODUCT_LENGTH"]
df_new = df_new[l]
df_new

Unnamed: 0,PRODUCT_ID,PRODUCT_LENGTH
0,604373,124
1,1729783,363
2,1871949,124
3,1107571,160
4,624253,124
...,...,...
734731,921419,87
734732,2456362,124
734733,841529,124
734734,1190194,201


## Writing Prediction to Submission file

In [29]:
df_new.to_csv("/content/drive/MyDrive/projects/amazon-ml/dataset/submission.csv", index = False, header = True)

## Reading Submission File

In [30]:
path = "/content/drive/MyDrive/projects/amazon-ml/dataset/submission.csv"
df_sub = pd.read_csv(path)
df_sub.head()

Unnamed: 0,PRODUCT_ID,PRODUCT_LENGTH
0,604373,124
1,1729783,363
2,1871949,124
3,1107571,160
4,624253,124


In [31]:
df_sub.isna().sum()

PRODUCT_ID        0
PRODUCT_LENGTH    0
dtype: int64

In [32]:
df_sub.describe()

Unnamed: 0,PRODUCT_ID,PRODUCT_LENGTH
count,734736.0,734736.0
mean,1493725.0,131.003894
std,866977.3,66.442621
min,0.0,0.0
25%,739673.5,94.0
50%,1492776.0,124.0
75%,2242406.0,156.0
max,2999998.0,380.0
