# ETL Pipeline:

Create a preprocessing pipeline that extracts the data, transforms it according to the needs of our solution, and loads it into a database to be used later.

In [126]:
import numpy as np
import pandas as pd
import itertools
from collections import Counter
from nltk.corpus import stopwords

# 1.Data Loading 

In [127]:
def load_data(Amazon_filepath):
    """Read the Amazon apparel JSON file and keep only the columns used downstream.

    Parameters
    ----------
    Amazon_filepath : str
        Location of the JSON file; handed straight to ``pandas.read_json``.

    Returns
    -------
    pandas.DataFrame
        The columns ``asin``, ``product_type_name``, ``formatted_price``,
        ``title`` and ``medium_image_url`` of the loaded file.

    The shape of the data before and after the column selection is printed.
    """
    # the only features the rest of the pipeline works with
    wanted_columns = ['asin','product_type_name', 'formatted_price','title','medium_image_url']

    # import the amazon apparel dataset and report its size
    raw_frame = pd.read_json(Amazon_filepath)
    n_points, n_features = raw_frame.shape
    print('The data has {} data points and {} features \n'.format(n_points,n_features))

    # drop everything except the pertinent features
    trimmed = raw_frame[wanted_columns]
    kept_points, kept_features = trimmed.shape
    print('The data after removing irrelevant features has {} and it contains these {} features. The names of the features are {} \n \n'.format(kept_points,kept_features,list(trimmed.columns)))

    return trimmed

# 2.Data Analysis

In [128]:
def analysis(data):
    """Print summary statistics for the apparel data.

    Reports, in order: ``describe()`` of ``product_type_name``, the ten most
    common product types, ``describe()`` of ``title`` and the percentage of
    rows whose ``formatted_price`` is not null.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``product_type_name``, ``title`` and
        ``formatted_price``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    product_types = data['product_type_name']

    # basic stats for product type
    print('The basic statistics for product type on amazon are as follows: \n{}\n\n'.format(product_types.describe()))

    # product type segregation: the ten most frequent types
    print('Product type count:\n{}\n\n'.format(Counter(list(product_types)).most_common(10)))

    # basic stats for titles (the label previously said "product type" by mistake)
    print('The basic statistics for titles on amazon are as follows: \n{}\n\n'.format(data['title'].describe()))

    # share of rows with a listed price; an empty frame reports 0.0 instead of
    # raising ZeroDivisionError
    total_points = data.shape[0]
    priced_points = int(data['formatted_price'].notnull().sum())
    percentage = priced_points / total_points * 100 if total_points else 0.0
    print('{} % of the total points have a listed price \n \n'.format(percentage))

                                                                 

# 3. Data Cleaning

In [129]:
#function to remove stopwords
def text_preprocessing(text,index,column,stopword,data=None):
    """Normalise one piece of text and optionally store it back into a frame.

    Each whitespace-separated token is stripped of non-alphanumeric
    characters and lower-cased; tokens that end up empty or that appear in
    ``stopword`` are dropped. The surviving words are joined with single
    spaces.

    Parameters
    ----------
    text : object
        The text to clean. Anything that is not a ``str`` is left alone.
    index : hashable
        Row label of the cell to overwrite when ``data`` is given.
    column : str
        Column label of the cell to overwrite when ``data`` is given.
    stopword : container of str
        Words to remove (compared after lower-casing and stripping).
    data : pandas.DataFrame, optional
        Frame to update in place at ``data.loc[index, column]``. When
        omitted nothing is mutated and the caller should use the return
        value.

    Returns
    -------
    str or None
        The cleaned text, or ``None`` when ``text`` is not a string.
    """
    # only strings can be tokenised; numbers and NaN placeholders are skipped
    if not isinstance(text, str):
        return None

    kept_words = []
    for token in text.split():
        # removing special characters and lowering the word
        word = "".join(ch for ch in token if ch.isalnum()).lower()

        # removing stopwords; tokens made only of punctuation collapse to ''
        if word and word not in stopword:
            kept_words.append(word)

    cleaned = " ".join(kept_words)

    # a single .loc assignment writes to the frame itself, whereas chained
    # indexing (data[column][index] = ...) may write to a temporary copy
    if data is not None:
        data.loc[index, column] = cleaned

    return cleaned

In [130]:
def _normalise_title(title, stopword):
    """Lower-case ``title``, strip non-alphanumeric characters and drop stop words."""
    kept_words = []
    for token in title.split():
        word = "".join(ch for ch in token if ch.isalnum()).lower()
        # tokens made only of punctuation collapse to '' and are skipped
        if word and word not in stopword:
            kept_words.append(word)
    return " ".join(kept_words)


def _differing_positions(words_a, words_b):
    """Count the word positions at which two tokenised titles disagree."""
    # zip_longest pads the shorter title with None, so surplus words count as
    # differences.
    # example: a = ['a', 'b', 'c', 'd'], b = ['a', 'b', 'd']
    # pairs:   ('a','a'), ('b','b'), ('c','d'), ('d', None)  -> 2 differences
    return sum(1 for left, right in itertools.zip_longest(words_a, words_b) if left != right)


def Data_cleaning(data, stopword=None):
    """Filter the apparel data and normalise its titles.

    Steps, each followed by a printed row count (except the last):

    1. drop rows without a ``formatted_price``;
    2. drop rows without a ``title``;
    3. drop rows whose title has four words or fewer;
    4. drop near-duplicate titles, i.e. titles that differ from an already
       kept title in fewer than 3 word positions. Rows are visited from last
       to first, so the last row of each group of near-duplicates is kept;
    5. lower-case the titles, strip non-alphanumeric characters and remove
       stop words.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``formatted_price`` and ``title``. The
        frame passed in is not modified.
    stopword : container of str, optional
        Words to remove from the titles. Defaults to NLTK's English
        stop-word list.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows, in their original order, and
        cleaned titles.
    """
    # removing apparels without a price as we need a price to sell apparels
    data = data[data['formatted_price'].notnull()]
    print('The number of products (data points) remaining after removing products without a price: \n{}\n'.format(data.shape[0]))

    # removing apparels without a title as we need titles for vectorization
    # (distance based similarity recommendation on title vectors)
    data = data[data['title'].notnull()]
    print('The number of products (data points) remaining after removing products without a title description required for vectorization:\n{}\n'.format(data.shape[0]))

    # removing apparels with short titles as they might not adequately describe
    # the apparel; astype(bool) keeps the mask boolean even when no rows remain
    long_enough = data['title'].map(lambda title: len(title.split()) > 4).astype(bool)
    data = data[long_enough]
    print('The number of products (data points) remaining after removing products with insufficient title descriptions required for vectorization:\n{}\n'.format(data.shape[0]))

    # removing near-duplicate titles.
    # Row positions are used instead of index labels or asins so the result
    # does not depend on either being unique.
    tokenised = [title.split() for title in data['title']]
    pending = list(range(len(tokenised)))
    kept_positions = []
    while pending:
        current = pending.pop()
        kept_positions.append(current)
        # build a new list rather than removing from `pending` while looping
        # over it, which would skip the element following each removal
        pending = [
            position for position in pending
            if _differing_positions(tokenised[current], tokenised[position]) >= 3
        ]

    # keeping apparel data points without a duplicate; copy so the title
    # assignment below never touches the caller's frame
    data = data.iloc[sorted(kept_positions)].copy()
    print('The number of products (data points) remaining after removing products with duplicate title descriptions:\n{}\n\n'.format(data.shape[0]))

    # removing stopwords from the title description
    if stopword is None:
        stopword = set(stopwords.words('english'))
    data['title'] = data['title'].map(lambda title: _normalise_title(title, stopword))

    return data

# 4.Data Saving

In [131]:
def save_data(data,Filepath):
    """Serialise ``data`` to a pickle file.

    Parameters
    ----------
    data : pandas.DataFrame
        The frame to store; its ``to_pickle`` method does the writing.
    Filepath : str
        Destination of the pickle file.

    Returns
    -------
    None
    """
    destination = Filepath
    # saving data in a pickle file
    data.to_pickle(destination)


# 5. Main Execution Function

In [1]:
def main(Amazon_filepath,Filepath):
    """Run the pipeline end to end: load, analyse, clean, then pickle the result.

    Parameters
    ----------
    Amazon_filepath : str
        Path of the raw JSON file passed to ``load_data``.
    Filepath : str
        Path of the pickle file passed to ``save_data``.

    A progress message is printed before each stage and after the last one.
    """
    # stage 1: extract
    print('LOADING DATA...{}\n\n '.format(Amazon_filepath))
    raw_frame = load_data(Amazon_filepath)

    # stage 2: report statistics on the raw data
    print('DATA ANALYSIS...\n\n')
    analysis(raw_frame)

    # stage 3: transform
    print('CLEANING DATA...\n\n')
    cleaned_frame = Data_cleaning(raw_frame)

    # stage 4: load (persist)
    print('SAVING DATA IN PICKLE FILE PREPROCESSED {}...\n\n'.format(Filepath))
    save_data(cleaned_frame,Filepath)

    print('Cleaned data saved to pickle file preprocessed in Pickle folder')