In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import seaborn as sns
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/10000-amazon-products-dataset/Amazon_Products.csv


```
1) Pick a dataset

2) Pose at least three questions related to business or real-world applications of how the data could be used.

3) Create a Jupyter Notebook, using any associated packages you'd like, to:

    a. Prepare data:
        - Gather necessary data to answer your questions
        - Handle categorical and missing data
        - Provide insight into the methods you chose and why you chose them
    b. Analyze, Model, and Visualize
        - Provide a clear connection between your business questions and how the data answers them
4) Communicate your business insights:
    - Create a Github repository to share your code and data wrangling/modeling techniques, with a technical audience in mind
    - Create a blog post to share your questions and insights with a non-technical audience
```

In [2]:
def read(path='/kaggle/input/10000-amazon-products-dataset/Amazon_Products.csv'):
    df = pd.read_csv(path, dtype=str)
    display(df.head())
    return df

# import sqlite3
# connection = sqlite3.connect("/kaggle/input/10000-amazon-products-dataset/Amazon_Products.csv")
# connection

In [3]:
df = read()
df.shape

Unnamed: 0,uniq_id,product_name,manufacturer,price,number_available_in_stock,number_of_reviews,number_of_answered_questions,average_review_rating,amazon_category_and_sub_category,description,...,Unnamed: 885,Unnamed: 886,Unnamed: 887,Unnamed: 888,Unnamed: 889,Unnamed: 890,Unnamed: 891,Unnamed: 892,Unnamed: 893,Unnamed: 894
0,eac7efa5dbd3d667f26eb3d3ab504464,Hornby 2014 Catalogue,Hornby,£3.42,5 new,15,1,4.9 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,Product Description Hornby 2014 Catalogue Box ...,...,,,,,,,,,,
1,b17540ef7e86e461d37f3ae58b7b72ac,FunkyBuys® Large Christmas Holiday Express Fes...,FunkyBuys,£16.99,,2,1,4.5 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,Size Name:Large FunkyBuys® Large Christmas Hol...,...,,,,,,,,,,
2,348f344247b0c1a935b1223072ef9d8a,CLASSIC TOY TRAIN SET TRACK CARRIAGES LIGHT EN...,ccf,£9.99,2 new,17,2,3.9 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,BIG CLASSIC TOY TRAIN SET TRACK CARRIAGE LIGHT...,...,,,,,,,,,,
3,e12b92dbb8eaee78b22965d2a9bbbd9f,HORNBY Coach R4410A BR Hawksworth Corridor 3rd,Hornby,£39.99,,1,2,5.0 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,Hornby 00 Gauge BR Hawksworth 3rd Class W 2107...,...,,,,,,,,,,
4,e33a9adeed5f36840ccc227db4682a36,Hornby 00 Gauge 0-4-0 Gildenlow Salt Co. Steam...,Hornby,£32.19,,3,2,4.7 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,Product Description Hornby RailRoad 0-4-0 Gild...,...,,,,,,,,,,


(10004, 895)

## Data Cleaning

In [4]:
def ColSummary (col, dtype=None, new_col=None):
    try:
        print("\n"+str(col).upper()+" ANALYSIS")
        if new_col:
            df.rename(columns={col:new_col})
            print("\t"+col+" name updated to "+new_col)
            col = new_col
        if dtype:
            df[col] = df[col].astype(dtype)
            print("\tType changed to "+dtype)
        print('\tTotal number of values : ', len(df[col]))
        print('\tTotal number of unique values : ', df[col].nunique())
        print('\tUnique values : ',df[col].unique()[:5])
        print('\tRange of length : ', min(df[col].apply(lambda x: len(str(x))))," to ",max(df[col].apply(lambda x: len(str(x)))))
        print('\tMissing Values : ', df[col].isna().sum())
    except:
        print("\n*** Cannot work with this datatype.. Try Again.... ***")

In [5]:
# remove rows which have non-sensical ids
def DataCleaning(df):
    # delete columns which are unnamed
    unnamed_cols = [col for col in df.columns if 'Unnamed' in col]
    # print("Need to delete these cols: ", unnamed_cols)
    print("Deleting unnamed columns ....")
    df = df.drop(unnamed_cols, axis = 1)
    print("Leftover columns: ", df.columns)
    ColSummary('uniq_id', 'str')
    print('\n\tInspecting IDs with unusual length : ', list(df['uniq_id'][df['uniq_id'].apply(lambda x: len(str(x)))<=3]))
    print('\tDeleting IDs with unusual length ..... ')
    df = df.drop(index = list(df['uniq_id'][df['uniq_id'].apply(lambda x: len(str(x))<=3)].index))
    print('\nUnique IDs updated .....')
    ColSummary('uniq_id')
    ColSummary('product_name', 'string')
    ColSummary('manufacturer', 'string')
    # convert currency into float
    ColSummary('price')
    print("\n Deleting rows which have price with greater than 5 length i.e. have ranges instead of values .....")
    df = df.drop(index = list(df['price'][df['price'].apply(lambda x: len(str(x))>5)].index))
    df['price'] = df['price'][df['price'].isna()==False].str[1:]
    ColSummary('price', 'float64')
    # change to string by removing extra text
    ColSummary('number_available_in_stock')
    df['number_available_in_stock'] = df['number_available_in_stock'].apply(lambda x: str(x).split()[0])
    ColSummary('number_available_in_stock','float64')
    ColSummary('average_review_rating')
    ColSummary('number_of_reviews')
    print("\n Removing commas from strings for float conversion .....")
    df['number_of_reviews'] = df['number_of_reviews'].apply(lambda x: str(x).replace(',',''))
    ColSummary('number_of_reviews', 'float64')
    ColSummary('number_of_answered_questions','float64')
    df['average_review_rating'] = df['average_review_rating'].apply(lambda x: str(x).split()[0])
    ColSummary('average_review_rating')
    ColSummary('amazon_category_and_sub_category')
    df['amazon_category_and_sub_category'] = df['amazon_category_and_sub_category'].apply(lambda x: str(x).split(" > "))
    itr = max(df['amazon_category_and_sub_category'].apply(lambda x: len(x)))
    for i in range (1, itr+1):
        df['amazon_category_and_sub_category'].apply(lambda x: x.append("None") if (len(x) < i) else x)

    ColSummary('amazon_category_and_sub_category')
    ColSummary('description')
    ColSummary('product_information')
    ColSummary('product_description')
    ColSummary('items_customers_buy_after_viewing_this_item')
    ColSummary('customer_questions_and_answers')
    return df

In [6]:
df = DataCleaning(df)
df.shape

Deleting unnamed columns ....
Leftover columns:  Index(['uniq_id', 'product_name', 'manufacturer', 'price',
       'number_available_in_stock', 'number_of_reviews',
       'number_of_answered_questions', 'average_review_rating',
       'amazon_category_and_sub_category', 'description',
       'product_information', 'product_description',
       'items_customers_buy_after_viewing_this_item',
       'customer_questions_and_answers'],
      dtype='object')

UNIQ_ID ANALYSIS
	Type changed to str
	Total number of values :  10004
	Total number of unique values :  10002
	Unique values :  ['eac7efa5dbd3d667f26eb3d3ab504464' 'b17540ef7e86e461d37f3ae58b7b72ac'
 '348f344247b0c1a935b1223072ef9d8a' 'e12b92dbb8eaee78b22965d2a9bbbd9f'
 'e33a9adeed5f36840ccc227db4682a36']
	Range of length :  1  to  32
	Missing Values :  0

	Inspecting IDs with unusual length :  [nan, '}', nan, '}']
	Deleting IDs with unusual length ..... 

Unique IDs updated .....

UNIQ_ID ANALYSIS
	Total number of values :  10004
	To

(5603, 14)

## Question 1. 

In [7]:
str.replace?

## Question 2. 

## Question 3. Predict the amazon categories using description?

In [8]:
def TextClassifier(text_data,label):
    "function to create text classifier for a set of data and labels"
    text_clf = Pipeline([ ('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),('clf', MultinomialNB())])
    text_clf = text_clf.fit(text_data, label)
    return text_clf

In [9]:
# defining the target variable

df['category'] = df['amazon_category_and_sub_category'].apply(lambda x: x[0])

categorytolabel = {}
i = 1
for item in df['category'].unique():
    categorytolabel[item] = i
    i = i+1

df['category_labels'] = df['category'].map(categorytolabel)

df = df.groupby('category').filter(lambda x : len(x)>30)
df = df.drop(index = list(df[df['category']=='nan'].index))

In [10]:
# running the model and comparing the predictions
text_df = df[['product_information','product_description']]

for col in text_df.columns:
    text_df[col] = text_df[col].fillna(value='None')
    model = TextClassifier(text_df[col],df['category_labels'])
    text_df[col+'_classification'] = model.predict(text_df[col])

text_df['product_information_classification'] = text_df['product_information_classification'].map({value : key for (key, value) in categorytolabel.items()})
text_df['product_description_classification'] = text_df['product_description_classification'].map({value : key for (key, value) in categorytolabel.items()})
df = df.merge(text_df[['product_information_classification','product_description_classification']], how = 'left', left_index = True, right_index = True)


from sklearn.metrics import classification_report
print("\n ********* REPORT 1 *********")
print(classification_report(df['category'], df['product_information_classification'] ))
print("\n ********* REPORT 2 *********")
print(classification_report(df['category'], df['product_description_classification'] ))


 ********* REPORT 1 *********
                           precision    recall  f1-score   support

            Arts & Crafts       0.90      0.98      0.94       652
      Baby & Toddler Toys       0.00      0.00      0.00        59
      Characters & Brands       0.52      0.17      0.26       503
  Die-Cast & Toy Vehicles       1.00      0.27      0.42       498
      Dolls & Accessories       0.99      0.59      0.74       254
              Fancy Dress       1.00      0.42      0.59       348
       Figures & Playsets       0.68      0.84      0.76       541
                    Games       0.49      0.97      0.65       611
                  Hobbies       0.52      0.85      0.64       654
        Jigsaws & Puzzles       0.00      0.00      0.00       157
           Party Supplies       0.72      0.99      0.84       662
Puppets & Puppet Theatres       0.00      0.00      0.00        71
    Sports Toys & Outdoor       1.00      0.01      0.02       174

                 accuracy    