## Amazon ML Challenge

#### Index
- Libraries and Modules
- Importing Data
- Creating helper function
- Stratified Data Sampling
- Cleaning
- Model and Training
- Final Result

## Libraries and Modules

In [None]:
import pandas as pd
import numpy as np

#sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedShuffleSplit

import pickle
import string
import re

#nltk
from nltk.corpus import stopwords  
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
# Shared NLP helpers; clean_data reads all three as module-level globals.
# Tokenizer built with default settings.
token = TweetTokenizer()
# Porter stemmer used to reduce each kept token to its stem.
stemmer = PorterStemmer()
# English stop-word list. NOTE(review): this needs the NLTK "stopwords" corpus to be
# available locally (e.g. via nltk.download("stopwords")) — confirm for fresh environments.
stopwords_english = stopwords.words('english')

## Importing Data

In [None]:
# Load the competition files. quoting=3 is csv.QUOTE_NONE, so quote characters are
# read as ordinary text, and a backslash is the escape character.
# Both files must be parsed with identical options so their text columns match.
train = pd.read_csv("train.csv", escapechar="\\", quoting=3)
test = pd.read_csv("test.csv", escapechar="\\", quoting=3)

## Creating helper function

In [None]:
# Creating Product Column
def create_product(df):
    """Merge the three free-text columns of ``df`` into one ``Products`` column, in place.

    Missing values in ``BULLET_POINTS``, ``DESCRIPTION`` and ``TITLE`` are treated
    as empty strings. The three columns are joined in that order with a single
    space between them, stored as ``df["Products"]``, and the three source
    columns are then dropped from ``df``.

    Args:
        df: DataFrame containing the ``BULLET_POINTS``, ``DESCRIPTION`` and
            ``TITLE`` columns. It is modified in place.

    Returns:
        None.

    Raises:
        KeyError: if any of the three source columns is missing. This includes
            calling the function a second time on the same frame, because the
            first call drops them.
    """
    text_columns = ["BULLET_POINTS", "DESCRIPTION", "TITLE"]
    # Fill on fresh Series instead of `df[col].fillna(..., inplace=True)`: the
    # chained in-place form acts on an intermediate object and is not guaranteed
    # to update `df` (it is a no-op under pandas Copy-on-Write).
    bullet_points, description, title = (df[col].fillna("") for col in text_columns)
    df["Products"] = bullet_points + " " + description + " " + title
    df.drop(columns=text_columns, inplace=True)

# Cleaning text
def clean_data(data):
    """Normalise raw text documents into strings of stemmed tokens.

    Each document is processed as follows:
      1. lowercased;
      2. a leading retweet marker ("rt" followed by whitespace), any
         http/https URLs and all '#' characters are removed;
      3. tokenized with the module-level ``token`` tokenizer;
      4. tokens found in ``stopwords_english`` or in ``string.punctuation``
         are dropped;
      5. the remaining tokens are stemmed with the module-level ``stemmer``;
      6. the stems are joined back into one string.

    Args:
        data: iterable of ``str`` documents (for example a pandas Series).
            It is only read, never modified.

    Returns:
        list[str]: one cleaned string per input document, in input order.
        Every stem is preceded by a single space, so a non-empty result starts
        with a space; a document with no surviving tokens yields "".

    Side effects:
        Prints the document index every 1000 documents as a progress counter.
    """
    # Set lookup makes the stop-word test O(1) per token instead of a list scan.
    stop_words = set(stopwords_english)
    cleaned = []
    for i, text in enumerate(data):
        # Build a new string per document; writing the lowercased text back into
        # the array behind `data` would alter (or fail on) the caller's column.
        text = text.lower()
        # The text is already lowercase, so the retweet marker must be matched
        # in lowercase; an uppercase pattern here could never match.
        text = re.sub(r'^rt\s+', '', text)
        # Remove each URL up to the next whitespace. (A pattern ending in `$-`
        # can never match, which left URLs in the text.)
        text = re.sub(r'https?://\S+', '', text)
        text = re.sub(r'#', '', text)
        # `word not in string.punctuation` is a substring test against the
        # punctuation string, kept as-is to preserve which tokens are dropped.
        stems = [
            stemmer.stem(word)
            for word in token.tokenize(text)
            if word not in stop_words and word not in string.punctuation
        ]
        if i % 1000 == 0:
            print(i)
        # Keep the historical output format: a space before every stem.
        cleaned.append("".join(" " + stem for stem in stems))
    return cleaned

# Creating prediction csv files
def create(i,model):
    """Predict on the module-level ``test`` frame and write a submission CSV.

    The output file is named ``PATS<i>.csv`` and has two columns:
    ``PRODUCT_ID`` (copied from ``test``) and ``BROWSE_NODE_ID`` (the
    predictions).

    Args:
        i: suffix for the output file name. Any value that can be formatted
            into a string is accepted (e.g. "1" or 1).
        model: fitted estimator exposing ``predict``; it is called on
            ``test["Products_clean"]``.

    Returns:
        None. The result is written to disk.

    Note:
        Reads the global ``test`` DataFrame, which must already contain the
        ``Products_clean`` and ``PRODUCT_ID`` columns.
    """
    y_pred = model.predict(test["Products_clean"])
    submission = pd.DataFrame({"PRODUCT_ID":test["PRODUCT_ID"],"BROWSE_NODE_ID":y_pred})
    # f-string instead of "PATS"+i+".csv" so a non-string suffix does not raise TypeError.
    submission.to_csv(f"PATS{i}.csv",index=False)

## Stratified Data Sampling

In [None]:
# Build the combined "Products" text column on the training frame (modifies `train` in place).
# create_product drops its three source columns, so re-running this cell on the
# same frame raises KeyError; reload the data first.
create_product(train)

In [None]:
# Draw a 30% sample of the training data, stratified on the target label
# BROWSE_NODE_ID. random_state is fixed so that Restart & Run All reproduces
# the same sample.
split = StratifiedShuffleSplit(n_splits=1, train_size=0.30, random_state=42)
# n_splits=1, so the generator yields exactly one (selected, remaining) pair.
train_index, rest_index = next(split.split(train, train["BROWSE_NODE_ID"]))
# split() returns positional indices, hence .iloc rather than label-based .loc.
# .copy() gives an independent frame, so adding columns to it later is unambiguous.
strat_train_set = train.iloc[train_index].copy()

## Cleaning

In [None]:
# Build the combined "Products" text column on the test frame (modifies `test` in place).
# As with the training frame, re-running this cell on the same frame raises KeyError
# because create_product drops its three source columns.
create_product(test)

In [None]:
# Run the same clean_data routine over the sampled training text and the test text,
# so both "Products_clean" columns are produced by identical preprocessing.
# clean_data prints a progress counter every 1000 rows.
strat_train_set["Products_clean"] = clean_data(strat_train_set["Products"])
test["Products_clean"] = clean_data(test["Products"])

## Model and Training

In [None]:
# Text-classification pipeline: token counts -> TF-IDF weighting -> linear SVM.
model = Pipeline([
    # Unigram and bigram counts; min_df=5 ignores terms found in fewer than 5 documents.
    ('vect', CountVectorizer(min_df=5, ngram_range=(1, 2))),
    # Re-weight the raw counts to TF-IDF.
    ('tfidf', TfidfTransformer()),
    # random_state pins the solver's data shuffling so repeated fits are reproducible.
    ('model', LinearSVC(verbose=1, random_state=42)),
])

In [None]:
# Fit the whole pipeline on the cleaned text of the stratified sample;
# BROWSE_NODE_ID is the target label.
model.fit(strat_train_set["Products_clean"],strat_train_set["BROWSE_NODE_ID"])

## Final Result

In [None]:
# Predict on the test frame with the fitted pipeline and write the result to "PATS1.csv".
create("1",model)