### Importing The Library

#### I am using typical data science stack: numpy, pandas, sklearn, matplotlib, seaborn

In [None]:
# Reload modules
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
#supress warnings
import warnings
warnings.filterwarnings('ignore')

#numpy and pandas for data manipulation
import pandas as pd
import numpy as np
from numpy import median
from scipy.stats import norm
import re
import math

#matplotlib and seaborn for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
sns.set(style='darkgrid')

import plotly
from plotly.offline import iplot
import plotly.graph_objects as go
from wordcloud import WordCloud

#file system management
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn import preprocessing
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor

import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from lightgbm import LGBMRegressor
import joblib

### Reading the Dataset

In [None]:
# Skip malformed rows instead of aborting the load. `on_bad_lines='skip'` is the
# supported spelling of the older `error_bad_lines=False` flag (pandas >= 1.3).
train_data = pd.read_csv('../data/mercari_train.csv.gz', compression='gzip',
                   on_bad_lines='skip')
test_data = pd.read_csv('../data/mercari_test.csv.gz', compression='gzip',
                   on_bad_lines='skip')
# train_data.to_csv(r'../Solutions/train.csv', index = False)
# test_data.to_csv(r'../Solutions/test.csv', index = False)

For EDA purposes train_data will be used

In [None]:
df = train_data

In [None]:
df.shape

In [None]:
df.head()

## Exploratory Data Analysis

#### lets examine the missing values

In [None]:
df.info()

In [None]:
# percentage of data which is null in each column
(df.isna().sum() / df.shape[0]) * 100

The columns brand_name and item_description have missing values. I will have to fill in these missing values, which is known as imputation. 31.38 % of brand_name values are missing.

In [None]:
df.describe(include='all')

Insights
1) Most of names are unique.
2) Top Category is Women Apparels.
3) Top Brand is PINK.
4) New is the most common product description.

### Duplicates rows

In [None]:
duplicaterows = df[df.duplicated(['name', 'item_condition_id', 'category_name', 'brand_name', 'shipping', 'price', 'item_description'])]

duplicaterows[:4]

In [None]:
df[(df.name == 'Bombshell') & (df.price == 6)]

In [None]:
df[(df.name == 'Nike slides') & (df.price == 12)]

In [None]:
df[(df.name == 'Victoria secret pink') & (df.price == 20)]

In [None]:
df[(df.name == 'bundle') & (df.price == 23)]

### Examine the Distribution of Target Variable - Price

In [None]:
df.price.describe()

In [None]:
plt.figure(figsize = (8,6))
plt.scatter(range(df.shape[0]), np.sort(df.price.values))
plt.xlabel('Index', fontsize = 12)
plt.ylabel('Price', fontsize = 12)
plt.show()

In [None]:
print('Number of products with price less than $3 -', df['price'][df['price'] < 3].count())
print('Number of products with price less than $4 -', df['price'][df['price'] < 4].count())
print('Number of products with price greater than $800 -', df['price'][df['price'] > 800].count())

There are some outliers on the upper end of the price range. I will therefore keep only products priced between 4 and 800 dollars, which removes 2026 rows from the data.

In [None]:
# Keep listings priced in [4, 800]. The explicit copy gives later cells an independent
# frame to add columns to, instead of a filtered result that pandas flags with
# SettingWithCopyWarning on assignment.
df = df[(df.price >= 4) & (df.price <= 800)].copy()

In [None]:
df['price'].plot.hist(bins=50, figsize=(12,6), edgecolor='white', range=[0,500])
plt.title('Price Distribution', fontsize=12)
plt.xlabel('Price', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.show()

The price distribution is positively skewed: most prices lie between \$4 and \$100. We will take the logarithm to check whether log(price) is approximately normally distributed.

In [None]:
plt.figure(figsize=(12, 8))
sns.distplot(np.log1p(df['price']), bins=50, kde=True, fit=norm)
plt.title('Log(Price) Distribution', fontsize=12)
plt.xlabel('log(price+1)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.show()

After taking the logarithm of the target variable price, it appears to be almost normally distributed.

In [None]:
df.price.describe(percentiles = [0.8, 0.9, 0.95, 0.99])

* 95% of the items are priced below \$65
* 99% of the items are priced below \$125
* 1% of the products might be outliers or some expensive products
* Mean price is \$23

In [None]:
#Create a new feature - log(price)
df['log_price'] = np.log1p(df['price'])

### Univariate and Multivariate Analysis
### Shipping

In [None]:
df['shipping'].value_counts()

Count of products with shipping equals 0 (or shipping fee paid by buyer) is greater

In [None]:
shipping_buyer = df[df['shipping'] == 0]['price']
shipping_seller = df[df['shipping'] == 1]['price']

plt.figure(figsize=(12, 8))
plt.hist(shipping_seller, alpha=0.5, bins=50, range=[0,100], label='seller')
plt.hist(shipping_buyer, alpha=0.5, bins=50, range=[0,100], label='buyer')
plt.xlabel('Price')
plt.legend(fontsize = 10)
plt.show()

In [None]:
print('Median price when shipping is paid by seller is ${}'.format(shipping_seller.median()))
print('Median price when shipping is paid by buyer is ${}'.format(shipping_buyer.median()))

In [None]:
plt.figure(figsize=(10, 8))
sns.boxplot(x='shipping', y='log_price', data=df)
plt.title('Effect of shipping on log(price)')
plt.show()

* I am not able to explain the pattern observed above.
* I expected the price to be somewhat higher when shipping is paid by the buyer rather than the seller, on the reasoning that the buyer's shipping fee raises the total cost of the product. The data shows the opposite: the price is higher when shipping is paid by the seller. So I cannot draw any insight from this.

### Brand

In [None]:
df.brand_name.isnull().sum()

In [None]:
df.brand_name.value_counts()[0:11]

In [None]:
# 1 if brand_name is present, 0 if it is missing (notna, so the flag matches its name)
df['has_brand_name'] = df['brand_name'].notna().astype(np.int8)

In [None]:
true_count = df['has_brand_name'].sum()
true_count

In [None]:
plt.figure(figsize=(12, 10))
sns.boxplot(x='has_brand_name', y='log_price', data=df)
# The grouping variable in this plot is the brand-name flag, not shipping.
plt.title('Effect of has_brand_name on log_price')
plt.show()

* Median price of products with missing brand names is lower than products having brand name

### Expensive Brands

In [None]:
# Keep only listings priced above $100. Boolean indexing drops the other rows outright,
# whereas DataFrame.where keeps them as all-NaN rows that the groupby then has to discard.
data = df[df['price'] > 100].sort_values(by=['price'], ascending=False)

# Per-brand listing count and mean price, most frequently listed brands first.
brands = data.groupby('brand_name')['price'].agg(['count', 'mean']).sort_values(by=['count'], ascending=False).reset_index()

# The 20 brands with the most listings above $100.
expensive_brands = brands[:20]

In [None]:
plt.figure(figsize=(20, 15))

plt.barh(range(0,len(expensive_brands)), expensive_brands['mean'], align='center', alpha=0.5, color='r')

plt.yticks(range(0,len(expensive_brands)), expensive_brands['brand_name'])
plt.xticks()
plt.title('Mean price of 20 expensive brands')
plt.xlabel('Price')
plt.ylabel('Brands')
plt.show()

* These brands have products which are expensive (or over $100)

### Luxurious Brands

In [None]:
# Keep only listings priced above $500. Boolean indexing drops the other rows outright,
# whereas DataFrame.where keeps them as all-NaN rows that the groupby then has to discard.
data = df[df['price'] > 500].sort_values(by=['price'], ascending=False)

# Per-brand listing count and mean price, most frequently listed brands first.
brands = data.groupby('brand_name')['price'].agg(['count', 'mean']).sort_values(by=['count'], ascending=False).reset_index()

# The 20 brands with the most listings above $500.
luxurious_brands = brands[:20]

In [None]:
plt.figure(figsize=(20, 15))

plt.barh(range(0,len(luxurious_brands)), luxurious_brands['mean'], align='center', alpha=0.5, color='b')

plt.yticks(range(0,len(luxurious_brands)), luxurious_brands['brand_name'])
plt.xticks()
plt.title('20 luxurious brands')
plt.xlabel('Price')
plt.ylabel('Brands')
plt.show()

* These brands have products which are luxurious (or over $500)

### Cheap Brands

In [None]:
# Keep only listings priced below $50. Boolean indexing drops the other rows outright,
# whereas DataFrame.where keeps them as all-NaN rows that the groupby then has to discard.
data = df[df['price'] < 50].sort_values(by=['price'], ascending=False)

# Per-brand listing count and mean price, most frequently listed brands first.
brands = data.groupby('brand_name')['price'].agg(['count', 'mean']).sort_values(by=['count'], ascending=False).reset_index()

# The 10 brands with the most listings below $50.
cheap_brands = brands[:10]

In [None]:
cheap_brands

* We have got the expensive, luxurious and cheap brands. But the brands in these categories can have products under $100 but they also have products which are expensive or luxurious.
* There are 1083 unique brands in the training set
* Pink and LuLaRoe are the brands whose products are sold the most.
* There are 20448 products that do not have brand name present.
* Amongst the top selling 10 brands on mercari, it seems that products of these brands are not expensive.

### Product Category

In [None]:
df.category_name.isna().sum()

In [None]:
df.category_name.describe()

In [None]:
df.category_name.value_counts()[0:10]

* Each category_name contains a main category followed by 2 sub-categories, separated by '/'.

In [None]:
def process_category(input_data):
    """
    Split the category_name into 3 parts as category_0, category_1 and category_2

    ``category_name`` is split on '/'; the first three pieces become the columns
    ``category_0``, ``category_1`` and ``category_2``. A level that does not exist
    (fewer than three pieces, or a missing / non-string category) is NaN, and any
    pieces beyond the third are ignored. Missing ``category_name`` values are then
    replaced by 'Other'.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame with a ``category_name`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``input_data`` object, with the three level columns added.
    """
    # Split every category exactly once; non-string entries (e.g. NaN) have no levels.
    split_levels = [
        value.split('/') if isinstance(value, str) else []
        for value in input_data['category_name']
    ]

    for i in range(3):
        col_name = 'category_' + str(i)
        input_data[col_name] = [
            levels[i] if i < len(levels) else np.nan
            for levels in split_levels
        ]

    # Fill the placeholder only after the levels are extracted, so a missing category
    # yields NaN in every level column rather than 'Other' in category_0.
    input_data['category_name'] = input_data['category_name'].fillna('Other')

    return input_data

In [None]:
# #splitting category_name into category_0, category_1 and category_2
df = process_category(df)

In [None]:
print('There are {} unique values in category_0'.format(len(df['category_0'].unique())))
print('There are {} unique values in category_1'.format(len(df['category_1'].unique())))
print('There are {} unique values in category_2'.format(len(df['category_2'].unique())))

### Main Category

In [None]:
plt.figure(figsize=(18,12))
sns.boxplot(x = 'log_price', y = 'category_0', data = df, orient = 'h')
plt.title('Boxplot of categories and prices', fontsize=14)
plt.xlabel('Log(Price)', fontsize=14)
plt.ylabel('Categories', fontsize=14)
plt.show()

* Nothing available to compare as the first category only has Women tag

### First Subcategory

In [None]:
cat_mean = df.groupby('category_1')['price'].agg(['mean']).sort_values(by=['mean'], ascending = False).reset_index()[:20]

In [None]:
plt.figure(figsize=(10, 14))

plt.barh(range(0,len(cat_mean)), cat_mean['mean'], align='center', alpha=0.5, color='b')

plt.yticks(range(0,len(cat_mean)), cat_mean['category_1'])
plt.xticks()
plt.title('Mean price of items grouped by 20 first sub-categories')
plt.xlabel('Price')
plt.ylabel('Categories')
plt.show()

* Womens Handbag have highest mean price.
* Underwear has lowest mean price.

### Second Subcategory

In [None]:
sub_cat_mean = df.groupby('category_2')['price'].agg(['mean']).sort_values(by=['mean'], ascending = False).reset_index()[:20]

In [None]:
plt.figure(figsize=(10, 14))

plt.barh(range(0,len(sub_cat_mean)), sub_cat_mean['mean'], align='center', alpha=0.5, color='g')

plt.yticks(range(0,len(sub_cat_mean)), sub_cat_mean['category_2'])
plt.xticks()
plt.title('Mean price of items grouped by 20 second sub-categories')
plt.xlabel('Price')
plt.ylabel('Categories')
plt.show()

* Shoulder bags have highest mean price.
* Bra's have lowest mean price in third category.

* Women/Athletic Apparel/Pants, Tights, Leggings has the highest number of products on the marketplace.
* There are 23 unique categories.
* There are 0 missing values.
* I have cleaned the data in the category_name feature: the value is split on slashes into 3 columns, one for the main category and one for each of its sub-categories. This is done so that these sub-categories can be used while modelling to predict the price, as they carry very important information.

### Item Condition

In [None]:
plt.figure(figsize=(10, 6))
# Pass x and y by keyword: recent seaborn releases no longer accept them positionally.
sns.stripplot(x=df['item_condition_id'], y=df['price'], palette = 'Reds')
plt.title("Item Condition vs Price")
plt.show()

In [None]:
df['item_condition_id'].value_counts(normalize = True) * 100

In [None]:
fig, ax = plt.subplots(figsize=(12, 10))
sns.boxplot(x='item_condition_id', y='log_price', data=df)
plt.show()

In [None]:
df.groupby('item_condition_id')['price'].describe()

* As per assumption, most sellers tend to give item_condition_id as 2,3 as buyers don't buy products which are not in great condition.
* But there are many products that are in great condition as significant sellers have given item_condition_id = 1 to the products, and as expected their mean price is also higher because if the product is in great condition its price should be high.


* Next I plan to analyse the text columns. I roughly tried predicting the price without them, but the predictions were poor, so I will use natural language processing (NLP) to make some sense of the data present in text form.

### Text Analysis

### Product Name

In [None]:
print(df.name.values[10])
print('-'*50)
print(df.name.values[50])
print('-'*50)
print(df.name.values[500])
print('-'*50)
print(df.name.values[1000])
print('-'*50)
print(df.name.values[10000])

In [None]:
print('Unique product names in the dataset are {:.2f}%'.format(df['name'].str.lower().str.strip().nunique() / df.shape[0] * 100))

In [None]:
# WordCloud is already imported in the imports cell at the top of the notebook.
# Build one word cloud from all product names joined into a single string.
cloud = WordCloud(width=3000, height=2000).generate(' '.join(df.name.astype(str)))
plt.figure(figsize=(20,15))
plt.imshow(cloud)
plt.axis('off')
plt.show()

The top words seem to be brand names such as Victoria Secret and American Eagle, as people tend to include the brand in the product title. The same analysis can be done with the product description.

In [None]:
# Character length of each product name (vectorised; a missing name yields NaN instead of raising).
df['len_name'] = df['name'].str.len()

# Mean and median price for the ten shortest name lengths.
df.groupby('len_name')['price'].agg(['mean', 'median'])[0:10]

I cannot derive any insight from this, such as a relationship between the length of the name and the average price.

### Preprocessing Data

In [None]:
#stopwords without no, not, etc
STOPWORDS = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]

In [None]:
def remove_emoji(sentence):
    """
    Return ``sentence`` with every run of emoji-range characters deleted.

    The character class below lists the code-point ranges that are stripped.
    Note that the last range runs from U+24C2 all the way to U+1F251, so it
    also removes many non-emoji characters in that span (for example CJK
    ideographs).
    """
    emoji_class = ("["
                   u"\U0001F600-\U0001F64F"
                   u"\U0001F300-\U0001F5FF"
                   u"\U0001F680-\U0001F6FF"
                   u"\U0001F1E0-\U0001F1FF"
                   u"\U00002702-\U000027B0"
                   u"\U000024C2-\U0001F251"
                   "]+")

    return re.sub(emoji_class, r'', sentence, flags=re.UNICODE)

In [None]:
def decontracted(phrase):
    """
    Expand common English contractions in ``phrase`` and return the result.

    The rules are applied in the order listed. The two whole-word rules
    ("won't", "can't") must run before the generic "n't" suffix rule, and all
    rules are case-sensitive.
    """
    expansions = (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )

    for contraction, expanded in expansions:
        phrase = re.sub(contraction, expanded, phrase)

    return phrase

In [None]:
def process_text(input_data, cols):
    """
    Take the text columns and process the data. Convert text to lowercase, expand
    contractions, use regex to remove symbols/punctuation, remove emojis and stopwords.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame holding the text columns. Every value in ``cols`` must be a string,
        so missing values have to be filled before calling this function.
    cols : iterable of str
        Names of the columns to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``input_data`` object, with each column in ``cols`` overwritten
        by its cleaned text.
    """
    # Set membership keeps the per-token stopword check constant-time.
    stopwords = set(STOPWORDS)

    for col in cols:

        processed_data = []

        for sent in input_data[col].values:

            # Lowercase first: the contraction rules and the STOPWORDS entries are
            # lowercase, so capitalised forms ("The", "Won't") would otherwise slip through.
            sent = sent.lower()
            sent = decontracted(sent)
            # These remove literal backslash sequences (backslash + r / " / n) found in the text.
            sent = sent.replace('\\r', ' ')
            sent = sent.replace('\\"', ' ')
            sent = sent.replace('\\n', ' ')
            # Anything that is not an ASCII letter or digit becomes a single space.
            sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
            # After the filter above only ASCII letters, digits and spaces remain,
            # so this call is kept purely as a safeguard.
            sent = remove_emoji(sent)
            sent = ' '.join(e for e in sent.split() if e not in stopwords)
            processed_data.append(sent.strip())

        input_data[col] = processed_data

    return input_data

In [None]:
#nlp features
def get_text_features(input_data):
    """
    NLP features derived from the text columns

    Reads the ``brand_name``, ``item_description``, ``name`` and
    ``item_condition_id`` columns of ``input_data``, and the notebook-level frames
    ``luxurious_brands``, ``expensive_brands`` and ``cheap_brands`` (each is read
    through its ``brand_name`` column), so those must be defined before the call.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame to add the features to. ``name`` and ``item_description`` must
        contain strings; missing brands must still be NaN so that
        ``has_brand_name`` can detect them.

    Returns
    -------
    pandas.DataFrame
        The same ``input_data`` object with the feature columns added in place.
    """
    # 1 if brand_name is present, 0 if it is missing
    input_data['has_brand_name'] = (input_data['brand_name'].notna()).astype(np.int8)

    # 1 if item_description contains the standalone token "rm" (what is left of the "[rm]"
    # marker for a removed price string), else 0. The word-boundary match also catches the
    # token at the very start or end of the description, where ' rm ' would miss it.
    input_data['has_price'] = np.where(input_data['item_description'].str.contains(r'\brm\b', na = False), 1, 0)

    input_data['reversed_item_condition_id'] = 5 - input_data['item_condition_id']

    # Brand-tier flags: membership in the brand tables built earlier in the notebook.
    input_data['is_luxurious'] = (input_data['brand_name'].isin(luxurious_brands['brand_name'])).astype(np.int8)

    input_data['is_expensive'] = (input_data['brand_name'].isin(expensive_brands['brand_name'])).astype(np.int8)

    input_data['is_cheap'] = (input_data['brand_name'].isin(cheap_brands['brand_name'])).astype(np.int8)

    # Character lengths.
    input_data['len_name'] = input_data['name'].str.len()
    input_data['len_item_description'] = input_data['item_description'].str.len()
    input_data['len'] = input_data['len_name'] + input_data['len_item_description']

    # Space-separated token counts. split(' ') returns at least one element even for an
    # empty string, so the ratio below never divides by zero.
    input_data['token_count_name'] = input_data['name'].apply(lambda x: len(x.split(' ')))
    input_data['token_count_item_description'] = input_data['item_description'].apply(lambda x: len(x.split(' ')))
    input_data['token_count'] = input_data['token_count_name'] + input_data['token_count_item_description']
    input_data['token_count_ratio'] = input_data['token_count_name']/input_data['token_count_item_description']

    # Raw strings keep the regex escapes (\s, \.) from being treated as invalid string escapes.
    # NOTE(review): each match consumes its trailing whitespace, so of two words separated by
    # a single space only the first is counted ("a b c" gives 2) - confirm this is intended.
    input_data["name_words"] = input_data["name"].str.count(r"(\s|^)[a-z]+(\s|$)")
    input_data["item_description_words"] = input_data["item_description"].str.count(r"(\s|^)[a-z]+(\s|$)")
    input_data["words"] = input_data["name_words"] + input_data["item_description_words"]

    # Whitespace-delimited numeric tokens (optional sign, decimal point and exponent).
    input_data["name_numbers"] = input_data["name"].str.count(r"(\s|^)[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?(\s|$)")
    input_data["item_description_numbers"] = input_data["item_description"].str.count(r"(\s|^)[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?(\s|$)")
    input_data["numbers"] = input_data["name_numbers"] + input_data["item_description_numbers"]

    # Individual ASCII letters.
    input_data["name_letters"] = input_data["name"].str.count("[a-zA-Z]")
    input_data["item_description_letters"] = input_data["item_description"].str.count("[a-zA-Z]")
    input_data["letters"] = input_data["name_letters"] + input_data["item_description_letters"]

    # Individual digits.
    input_data["name_digits"] = input_data["name"].str.count("[0-9]")
    input_data["item_description_digits"] = input_data["item_description"].str.count("[0-9]")
    input_data["digits"] = input_data["name_digits"] + input_data["item_description_digits"]

    return input_data

In [None]:
def handle_missing_values(input_data):
    """
    Replace missing values in the ``name`` and ``item_description`` columns
    with the placeholder string 'missing'.

    The frame is modified in place and the same object is returned; other
    columns are left untouched.
    """
    placeholders = dict.fromkeys(('name', 'item_description'), 'missing')
    input_data.fillna(value=placeholders, inplace=True)

    return input_data

In [None]:
def preprocess(input_data, min_price=3, max_price=800):
    """
    Process the data by handling missing values, process category_name, process text

    Steps, in order: keep rows whose price lies in ``[min_price, max_price]``,
    replace ``price`` by ``log1p(price)``, fill missing text values, split
    ``category_name`` into its levels, and clean the text columns.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Raw listings. The caller's frame is not modified; the work happens on a copy.
    min_price, max_price : number, optional
        Inclusive price bounds for rows to keep. The defaults (3 and 800) are the
        bounds this function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The filtered, transformed copy.
    """
    if 'price' in input_data.columns:
        in_range = (input_data['price'] >= min_price) & (input_data['price'] <= max_price)
        # Copy before writing so the log-transform is applied to an independent frame
        # instead of being assigned into a filtered slice.
        input_data = input_data[in_range].copy()
        input_data['price'] = np.log1p(input_data['price'])
    else:
        # No target column (presumably unlabeled data - confirm with the caller):
        # skip price filtering but still work on a copy.
        input_data = input_data.copy()

    input_data = handle_missing_values(input_data)

    input_data = process_category(input_data)

    input_data = process_text(input_data, ['name', 'item_description', 'category_name'])

    return input_data

In [None]:
data = preprocess(df)
data.fillna({'category_0': 'other', 'category_1': 'other', 'category_2': 'other'}, inplace = True)

### Brand Names in Product Name

In [None]:
brands = df['brand_name'].unique().tolist()
name = df['name'].tolist()
name_list = [i.strip(',').split(' ') for i in name]
name_corpus = [item for name in name_list for item in name]

In [None]:
brands = set(brands)
brands_in_name = list(brands.intersection(name_corpus))

In [None]:
cloud = WordCloud(width=3000, height=2000).generate(' '.join(brands_in_name))
plt.figure(figsize=(20,15))
plt.imshow(cloud)
plt.axis('off')

In [None]:
#The temp object here is a pandas.series object which does not have a iplot method when not linked to plotly. 
#We need cufflinks to link plotly to pandas and add the iplot method.

import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [None]:
def get_words(corpus, n = 20):
    """
    Return the ``n`` most frequent single-word tokens in ``corpus``.

    The corpus is tokenised with a default ``CountVectorizer``; the result is a
    list of ``(token, total_count)`` tuples ordered by descending count.
    """
    vectorizer = CountVectorizer()
    vectorizer.fit(corpus)

    # Column sums of the document-term matrix give the corpus-wide count per token.
    totals = vectorizer.transform(corpus).sum(axis=0)

    ranked = [(token, totals[0, column]) for token, column in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    return ranked[:n]

In [None]:
def get_bigrams(corpus, n = 20):
    """
    Return the ``n`` most frequent bi-grams (two-word sequences) in ``corpus``.

    The corpus is tokenised with ``CountVectorizer(ngram_range=(2,2))``; the result
    is a list of ``(bigram, total_count)`` tuples ordered by descending count.
    """
    vectorizer = CountVectorizer(ngram_range=(2,2))
    vectorizer.fit(corpus)

    # Column sums of the document-term matrix give the corpus-wide count per bi-gram.
    totals = vectorizer.transform(corpus).sum(axis=0)

    ranked = [(bigram, totals[0, column]) for bigram, column in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    return ranked[:n]

In [None]:
common_words = get_words(data['name'], 10)
df_words = pd.DataFrame(common_words, columns = ['name' , 'count'])
df_words.groupby('name').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', 
    yTitle='Count', 
    linecolor='black', 
    title='Top 10 words in product name after preprocessing data')

* lularoe and pink, which are brand names, are the words used most often in product names.

In [None]:
common_words = get_bigrams(data['name'], 10)

df_words = pd.DataFrame(common_words, columns = ['name' , 'count'])

df_words.groupby('name').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', 
    yTitle='Count', 
    linecolor='black', 
    title='Top 10 bi-grams in product name after preprocessing data')

### Product Description

In [None]:
print(df.item_description.values[101])
print('-'*50)
print(df.item_description.values[60])
print('-'*50)
print(df.item_description.values[590])
print('-'*50)
print(df.item_description.values[1020])
print('-'*50)
print(df.item_description.values[10800])

In [None]:
cloud = WordCloud(width=3000, height=2000).generate(' '.join(df.item_description.astype(str)))
plt.figure(figsize=(20,15))
plt.imshow(cloud)
plt.axis('off')

### Brand Names in Item Description

In [None]:
brands = df['brand_name'].unique().tolist()
name = df['item_description'].tolist()
# Rebuild the token corpus from the descriptions. Without this, the next cell intersects
# the brands with the product-name tokens left in `name_corpus` by the earlier section.
# Non-string entries (missing descriptions) are skipped; a set is enough for the intersection.
name_corpus = {token for text in name if isinstance(text, str) for token in text.strip(',').split(' ')}

In [None]:
brands = set(brands)
brands_in_name = list(brands.intersection(name_corpus))

In [None]:
cloud = WordCloud(width=3000, height=2000).generate(' '.join(brands_in_name))
plt.figure(figsize=(20,15))
plt.imshow(cloud)
plt.axis('off')

In [None]:
common_words = get_words(data['item_description'], 10)
df_words = pd.DataFrame(common_words, columns = ['text' , 'count'])

df_words.groupby('text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', 
    yTitle='Count', 
    linecolor='black', 
    title='Top 10 words in product description after preprocessing data')

In [None]:
common_words = get_bigrams(data['item_description'], 10)
df_words = pd.DataFrame(common_words, columns = ['text' , 'count'])

df_words.groupby('text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', 
    yTitle='Count', 
    linecolor='black', 
    title='Top 10 bi-grams in product description after preprocessing data')

### Final dataset prepared for training

In [None]:
#NLP features
data = get_text_features(data)

data.fillna({'brand_name': ' '}, inplace = True)

#concatenate text features
data['name'] = data['name'] + ' ' + data['brand_name'] + ' ' + data['category_name']
data['text'] = data['name'] + ' ' + data['item_description']

data = data.drop(columns = ['brand_name', 'item_description', 'category_name'], axis = 1)

### Correlation matrices

In [None]:
df1 = data[['price', 'item_condition_id',
       'shipping', 'category_0', 'category_1',
       'category_2', 'has_brand_name', 'has_price',
       'reversed_item_condition_id', 'is_luxurious', 
        'is_expensive']]

In [None]:
# df1 includes the text columns category_0/1/2; restrict to numeric columns so that
# corr() is computed only over data it can correlate.
corrMatrix  = df1.select_dtypes(include='number').corr()
plt.figure(figsize = (18,9))
sns.heatmap(corrMatrix, annot=True)
plt.show()

In [None]:
df2 = data[['price', 'is_cheap', 'len_name', 'len_item_description', 'len',
       'token_count_name', 'token_count_item_description', 'token_count',
       'token_count_ratio', 'name_words', 'item_description_words', 'words',
       'name_numbers', 'item_description_numbers', 'numbers', 'name_letters']]

In [None]:
corrMatrix  = df2.corr()
plt.figure(figsize = (18,9))
sns.heatmap(corrMatrix, annot=True)
plt.show()

In [None]:
df3 = data[['price', 'item_description_letters', 'letters', 'name_digits',
       'item_description_digits', 'digits']]

In [None]:
corrMatrix  = df3.corr()
plt.figure(figsize = (18,9))
sns.heatmap(corrMatrix, annot=True)
plt.show()

### Conclusions/Observations from Correlations

* I have found a strong correlation between is_luxurious, is_expensive and price, so I will use these columns separately while modelling, as they can help predict the price.
* So is_expensive, is_luxurious seems to have high predictive power.
* Other NLP features seems to be uncorrelated

#### Machine learning (Modelling) - Present in model.ipynb

* Ridge regression with the best hyperparameters takes very little time to train and its RMSLE is also below 0.5, so I chose ridge_model to predict the outcome for test_data (mercari_test).